Browse Source

Clean up mxm.c, add support for DAGS_1.

Duc Nguyen 1 year ago
parent
commit
a00e31d446
4 changed files with 123 additions and 207 deletions
  1. 1 0
      .gitignore
  2. 12 10
      include/gf/gf.h
  3. 4 18
      src/gf/gf.c
  4. 106 179
      src/structures/mxm.c

+ 1 - 0
.gitignore

@@ -47,3 +47,4 @@ dags*
 binaries/
 .vscode/
 makefile
+result

+ 12 - 10
include/gf/gf.h

@@ -88,11 +88,12 @@ static inline gf gf_q_m_mult(const gf in0, const  gf in1) {
 
 
 #ifdef DAGS_1
-static inline gf gf_mult(gf in0, gf in1) {
-	uint64_t i, tmp, t0 = in0, t1 = in1;
 
+static inline gf gf_mult(gf in0, gf in1) {
+	int i; 
+	gf tmp = 0; 
+	gf t0 = in0, t1 = in1;
 	//Multiplication
-	tmp = 0;
 
 	for (i = 0; i < 7; i++)
 		tmp ^= (t0 * (t1 & (1 << i)));
@@ -100,28 +101,28 @@ static inline gf gf_mult(gf in0, gf in1) {
 	//reduction
 	tmp = tmp & 0xFFF;	// tmp & 0000 1111 1111 1111
 	tmp ^= (tmp >> 6);
-	tmp ^= (tmp >> 5) ;
+	tmp ^= (tmp >> 5) & 0x3E;
 	tmp = tmp & 0x3F;
 	return tmp;
 }
 
 static inline gf gf_q_m_mult(gf in0, gf in1) {
-	int i, tmp; 
+	int i; 
+	uint32_t tmp = 0; 
 	gf t0 = in0, t1 = in1;
-	gf reduction
+	gf reduction;
 	//Multiplication
-	tmp = t0 * (t1 & 1);
 
-	for (i = 1; i < 18; i++)
+	for (i = 0; i < 18; i++)
 		tmp ^= (t0 * (t1 & (1 << i)));
 
 	//reduction
-	tmp = tmp & 0x7FFFFF;	// tmp & 0111 1111 1111 1111
+	// tmp = tmp & 0x7FFFFF;	// tmp & 0111 1111 1111 1111
 	//first step of reduction
 	//second step of reduction
 	
 	for (i=0; i < 2; i++){
-		reduction = (tmp >> 12);
+		reduction = (tmp >> 12) & 0x7FF;
 		tmp = tmp & 0xFFF;
 		tmp ^= reduction;
 		tmp ^= reduction << 1;
@@ -132,6 +133,7 @@ static inline gf gf_q_m_mult(gf in0, gf in1) {
 	tmp = tmp & 0xFFF;
 	return tmp;
 }
+
 #endif
 
 

+ 4 - 18
src/gf/gf.c

@@ -74,7 +74,7 @@ gf absolut_field_representation(gf *element) {
 #endif
 
 #ifdef DAGS_1
-static inline gf relative_field_representation(gf a, int k) {
+gf relative_field_representation(gf a, int k) {
 	gf x[extension] = { 0 };
 	gf b_0_t = a & 0x1;
 	gf b_1_t = (a & 0x2) >> 1;
@@ -90,38 +90,24 @@ static inline gf relative_field_representation(gf a, int k) {
 	gf b_11_t = (a & 0x800) >> 11;
 
 	gf b_0 = b_0_t ^ b_2_t ^ b_6_t ^ b_7_t ^ b_8_t ^ b_9_t ^ b_10_t ^ b_11_t;
-
 	gf b_1 = b_4_t ^ b_6_t ^ b_9_t ^ b_10_t;
-
 	gf b_2 = b_2_t ^ b_3_t ^ b_5_t ^ b_6_t ^ b_8_t;
-
 	gf b_3 = b_2_t ^ b_7_t ^ b_8_t ^ b_9_t;
-
 	gf b_4 = b_2_t ^ b_5_t ^ b_7_t ^ b_8_t ^ b_9_t;
-
 	gf b_5 = b_2_t ^ b_4_t ^ b_7_t ^ b_8_t ^ b_11_t;
-
 	gf b_6 = b_1_t ^ b_2_t ^ b_4_t ^ b_5_t ^ b_7_t ^ b_11_t;
-
 	gf b_7 = b_2_t ^ b_4_t ^ b_5_t ^ b_6_t ^ b_8_t ^ b_11_t;
-
 	gf b_8 = b_4_t;
-
 	gf b_9 = b_3_t ^ b_4_t ^ b_8_t ^ b_10_t ^ b_11_t;
-
 	gf b_10 = b_3_t ^ b_5_t ^ b_8_t ^ b_9_t ^ b_11_t;
-
 	gf b_11 = b_3_t ^ b_5_t ^ b_8_t ^ b_9_t;
 
-	x[0] = (b_0) | (b_1 << 1) | (b_2 << 2) | (b_3 << 3) | (b_4 << 4)
-			| (b_5 << 5);
-	x[1] = (b_6) | (b_7 << 1) | (b_8 << 2) | (b_9 << 3) | (b_10 << 4)
-			| (b_11 << 5);
-	return x[k];
+	x[0] = (b_0) | (b_1 << 1) | (b_2 << 2) | (b_3 << 3) | (b_4 << 4) | (b_5 << 5);
+	x[1] = (b_6) | (b_7 << 1) | (b_8 << 2) | (b_9 << 3) | (b_10 << 4) | (b_11 << 5);
 	return x[k];
 }
 
-static inline gf absolut_field_representation(gf *element) {
+gf absolut_field_representation(gf *element) {
 	gf beta = 197;
 	gf tmp1 = 0, tmp2 = 0, in0 = element[0], in1 = element[1];
 

+ 106 - 179
src/structures/mxm.c

@@ -1,11 +1,13 @@
 #include "../../include/structures/mxm.h"
+#include "../cpucycles.h"
 
 #define BLK_SIZE 128
 #define min(a,b) (((a)<(b))?(a):(b))
+
 #define bound 32
 
-#include "../cpucycles.h"
 
+#if defined(DAGS_3) || defined(DAGS_5) || defined(DAGS_TOY)
 void gf_q_m_mult_bound(const gf A, const  gf* restrict B, gf* restrict C) {
 	gf reduction[bound]; 
 	uint32_t tmp[bound] = {0};
@@ -13,104 +15,40 @@ void gf_q_m_mult_bound(const gf A, const  gf* restrict B, gf* restrict C) {
 	int i,j; 
 
 	//Multiplication
-	for (i = 0; i < 18; i++){
-		tmp[0] ^= (A * (B[0] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[1] ^= (A * (B[1] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[2] ^= (A * (B[2] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[3] ^= (A * (B[3] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[4] ^= (A * (B[4] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[5] ^= (A * (B[5] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[6] ^= (A * (B[6] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[7] ^= (A * (B[7] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[8] ^= (A * (B[8] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[9] ^= (A * (B[9] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[10] ^= (A * (B[10] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[11] ^= (A * (B[11] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[12] ^= (A * (B[12] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[13] ^= (A * (B[13] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[14] ^= (A * (B[14] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[15] ^= (A * (B[15] & (1 << i)));
-	}
+	for (i = 0; i < 18; i++)	tmp[0] ^= (A * (B[0] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[1] ^= (A * (B[1] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[2] ^= (A * (B[2] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[3] ^= (A * (B[3] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[4] ^= (A * (B[4] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[5] ^= (A * (B[5] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[6] ^= (A * (B[6] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[7] ^= (A * (B[7] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[8] ^= (A * (B[8] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[9] ^= (A * (B[9] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[10] ^= (A * (B[10] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[11] ^= (A * (B[11] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[12] ^= (A * (B[12] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[13] ^= (A * (B[13] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[14] ^= (A * (B[14] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[15] ^= (A * (B[15] & (1 << i))); 
 	// /*
-	
-	for (i = 0; i < 18; i++){
-		tmp[16+ 0] ^= (A * (B[0+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 1] ^= (A * (B[1+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 2] ^= (A * (B[2+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 3] ^= (A * (B[3+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 4] ^= (A * (B[4+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 5] ^= (A * (B[5+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 6] ^= (A * (B[6+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 7] ^= (A * (B[7+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 8] ^= (A * (B[8+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 9] ^= (A * (B[9+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 10] ^= (A * (B[10+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 11] ^= (A * (B[11+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 12] ^= (A * (B[12+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 13] ^= (A * (B[13+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 14] ^= (A * (B[14+16] & (1 << i)));
-	}
-	for (i = 0; i < 18; i++){
-		tmp[16+ 15] ^= (A * (B[15+16] & (1 << i)));
-	}
+	for (i = 0; i < 18; i++)	tmp[16+0]  ^= (A * (B[16+0] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+1]  ^= (A * (B[16+1] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+2]  ^= (A * (B[16+2] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+3]  ^= (A * (B[16+3] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+4]  ^= (A * (B[16+4] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+5]  ^= (A * (B[16+5] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+6]  ^= (A * (B[16+6] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+7]  ^= (A * (B[16+7] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+8]  ^= (A * (B[16+8] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+9]  ^= (A * (B[16+9] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+10] ^= (A * (B[16+10] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+11] ^= (A * (B[16+11] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+12] ^= (A * (B[16+12] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+13] ^= (A * (B[16+13] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+14] ^= (A * (B[16+14] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+15] ^= (A * (B[16+15] & (1 << i))); 
+
 	// */
 	for (j = 0; j < bound; j++){
 		for (i = 0; i < 2; i++){
@@ -128,37 +66,74 @@ void gf_q_m_mult_bound(const gf A, const  gf* restrict B, gf* restrict C) {
 	}
 }
 
+#endif
 
-void mxm_block_reorder_reuse(gf* restrict C, const gf* restrict A, const gf* restrict B, const int m, const int n, const int p) {
-	int i, j, k, ii, jj, kk, Aik, bs = BLK_SIZE;
+#ifdef DAGS_1
+void gf_q_m_mult_bound(const gf A, const  gf* restrict B, gf* restrict C) {
+	gf reduction[bound]; 
+	uint32_t tmp[bound] = {0};
+	
+	int i,j; 
+	
+	//Multiplication
+	for (i = 0; i < 18; i++)	tmp[0] ^= (A * (B[0] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[1] ^= (A * (B[1] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[2] ^= (A * (B[2] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[3] ^= (A * (B[3] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[4] ^= (A * (B[4] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[5] ^= (A * (B[5] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[6] ^= (A * (B[6] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[7] ^= (A * (B[7] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[8] ^= (A * (B[8] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[9] ^= (A * (B[9] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[10] ^= (A * (B[10] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[11] ^= (A * (B[11] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[12] ^= (A * (B[12] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[13] ^= (A * (B[13] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[14] ^= (A * (B[14] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[15] ^= (A * (B[15] & (1 << i))); 
+	// /*
+	for (i = 0; i < 18; i++)	tmp[16+0]  ^= (A * (B[16+0] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+1]  ^= (A * (B[16+1] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+2]  ^= (A * (B[16+2] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+3]  ^= (A * (B[16+3] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+4]  ^= (A * (B[16+4] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+5]  ^= (A * (B[16+5] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+6]  ^= (A * (B[16+6] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+7]  ^= (A * (B[16+7] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+8]  ^= (A * (B[16+8] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+9]  ^= (A * (B[16+9] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+10] ^= (A * (B[16+10] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+11] ^= (A * (B[16+11] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+12] ^= (A * (B[16+12] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+13] ^= (A * (B[16+13] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+14] ^= (A * (B[16+14] & (1 << i))); 
+	for (i = 0; i < 18; i++)	tmp[16+15] ^= (A * (B[16+15] & (1 << i))); 
+
+	//reduction
+	//first step of reduction
+	//second step of reduction
 
-	for (ii = 0; ii < m; ii += bs)
-		for (kk = 0; kk < n; kk += bs)
-			for (jj = 0; jj < p; jj += bs)
-				for (i = ii; i < min(m, ii + bs); i++)
-					for (k = kk; k < min(n, kk + bs); k++) {
-						Aik = A[n * i + k];
-						for (j = jj; j < min(p, jj + bs); j++)
-							C[p * i + j] ^= gf_q_m_mult(Aik, B[p * k + j]);
-					}
+	for (j = 0; j < bound; j++){
+		for (i = 0; i < 2; i++){
+            reduction[j] = (tmp[j] >> 12) &0x7FF;
+            tmp[j] = tmp[j] & 0xFFF;
+            tmp[j] ^= reduction[j];
+            tmp[j] ^= reduction[j] << 1;
+            tmp[j] ^= reduction[j] << 4;
+            tmp[j] ^= reduction[j] << 6;
+        }
+	}
+	
+	for (j = 0; j < bound; j++){
+		C[j] ^=  (tmp[j] & 0xFFF);
+	}
 }
 
-void mxm_block_reorder_reuse_unroll_8(gf* restrict C, const gf* restrict A, const gf* restrict B, const int m, const int n, const int p) {
-	int i, j, k, ii, jj, kk, Aik, bs = BLK_SIZE;
+#endif
 
-	for (ii = 0; ii < m; ii += bs)
-		for (kk = 0; kk < n; kk += bs)
-			for (jj = 0; jj < p; jj += bs)
-				for (i = ii; i < min(m, ii + bs); i++)
-					for (k = kk; k < min(n, kk + bs); k++) {
-						Aik = A[n * i + k];
-						for (j = jj; j < min(p, jj + bs); j += 8) {
-							gf_q_m_mult_bound(Aik, &B[p * k + j], &C[p * i + j]);
-						}
-					}
-}
 
-void mxm_block_reorder_reuse_unroll_16(gf* restrict C, const gf* restrict A, const gf* restrict B, const int m, const int n, const int p) {
+void mxm_block_reorder_reuse(gf* restrict C, const gf* restrict A, const gf* restrict B, const int m, const int n, const int p) {
 	int i, j, k, ii, jj, kk, Aik, bs = BLK_SIZE;
 
 	for (ii = 0; ii < m; ii += bs)
@@ -167,14 +142,12 @@ void mxm_block_reorder_reuse_unroll_16(gf* restrict C, const gf* restrict A, con
 				for (i = ii; i < min(m, ii + bs); i++)
 					for (k = kk; k < min(n, kk + bs); k++) {
 						Aik = A[n * i + k];
-						for (j = jj; j < min(p, jj + bs); j += 16) {
-							gf_q_m_mult_bound(Aik, &B[p * k + j], &C[p * i + j]);
-							
-						}
+						for (j = jj; j < min(p, jj + bs); j++)
+							C[p * i + j] ^= gf_q_m_mult(Aik, B[p * k + j]);
 					}
 }
 
-void mxm_block_reorder_reuse_unroll_32(gf* restrict C, const gf* restrict A, const gf* restrict B, const int m, const int n, const int p) {
+void mxm_block_reorder_reuse_unroll_bound(gf* restrict C, const gf* restrict A, const gf* restrict B, const int m, const int n, const int p) {
 	int i, j, k, ii, jj, kk, Aik, bs = BLK_SIZE;
 
 	for (ii = 0; ii < m; ii += bs)
@@ -183,68 +156,22 @@ void mxm_block_reorder_reuse_unroll_32(gf* restrict C, const gf* restrict A, con
 				for (i = ii; i < min(m, ii + bs); i++)
 					for (k = kk; k < min(n, kk + bs); k++) {
 						Aik = A[n * i + k];
-						for (j = jj; j < min(p, jj + bs); j += 32) {
+						for (j = jj; j < min(p, jj + bs); j += bound) {
 							gf_q_m_mult_bound(Aik, &B[p * k + j], &C[p * i + j]);
 							
 						}
 					}
 }
 
-/*
-void mxm(gf* restrict C, const gf* restrict A, const gf* restrict B, const int m, const int n, const int p, const int uf) {
-	//mxm_naive(C, A, B, m, n, p);
-	//mxm_block(C, A, B, m, n, p);
-	//mxm_block_reorder(C, A, B, m, n, p);
-	PRINT_DEBUG("m n p = %d %d %d\n", m, n, p);
-	switch (uf) {
-	case 16:
-		mxm_block_reorder_reuse_unroll_16(C, A, B, m, n, p);
-		break;
-	case 8:
-		mxm_block_reorder_reuse_unroll_8(C, A, B, m, n, p);
-		break;
-	case 4:
-		mxm_block_reorder_reuse_unroll_4(C, A, B, m, n, p);
-		break;
-	case 2:
-		mxm_block_reorder_reuse_unroll_2(C, A, B, m, n, p);
-		break;
-	default:
-		mxm_block_reorder_reuse(C, A, B, m, n, p);
-		break;
-	}
-}
-*/
-
 // Cmxp = Amxn * Bnxp
 void mxm_product(gf* restrict dest,const gf* restrict a, const gf* restrict b, const int m, const int n, const int p) {
 	long long start, end; 
 	start = cpucycles();
-	// mxm_block_reorder_reuse_unroll_8(dest, a, b, m, n, p);
-	// mxm_block_reorder_reuse_unroll_16(dest, a, b, m, n, p);
-	mxm_block_reorder_reuse_unroll_32(dest, a, b, m, n, p);
+	// Option for GCC 
+	mxm_block_reorder_reuse_unroll_bound(dest, a, b, m, n, p);
+	// Option for CLANG
 	// mxm_block_reorder_reuse(dest, a, b, m, n, p);
-	// mxm_block_reorder_reuse_unroll_2(dest, a, b, m, n, p);
 	end = cpucycles() - start;
 	PRINT_DEBUG("[++++] timming %lld \n", end/1000000);
-	/*
-	int uf;
-
-	if (p % 16 == 0)
-		uf = 16;
-	else if (p % 8 == 0)
-		uf = 8;
-	else if (p % 4 == 0)
-		uf = 4;
-	else if (p % 2 == 0)
-		uf = 2;
-	else
-		uf = 0;
-
-	printf("Matrix dimensions %dx%d\n", m, n);
-	printf("Block size: %d\n", BLK_SIZE);
-	printf("Unrolls: %d\n", uf);
-
-	mxm(dest, a, b, m, n, p, uf);
-	*/
+	
 }