/* matrix5.c - Parallel Matrix Multiplication
 *
 * Improvement on matrix4.c: each worker receives its row of M1 once,
 * instead of once per column of M2.
 *
 * compile: mpicc -Wall -O -o matrix5 matrix5.c
 * run:     mpirun -np num_procs matrix5 file
 */

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

/* Auxiliary functions --------------------------------------------- */

int * allocVector(int n) {
  return (int *)malloc(n * sizeof(int));
}

int ** allocMatrix(int m, int n) {
  int ** newM = (int **)malloc(m * sizeof(int *));
  int i;
  for (i = 0; i < m; i++)
    newM[i] = allocVector(n);
  return newM;
}

void readMatrix(FILE * fin, int ** M, int m, int n) {
  int i, j;
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      fscanf(fin, "%d", &(M[i][j]));
}

void writeMatrix(FILE * fout, int ** M, int m, int n) {
  int i, j;
  for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++)
      fprintf(fout, "%d ", M[i][j]);
    putc('\n', fout);
  }
}

// Auxiliary function: transpose an m*n matrix into an n*m matrix
int ** transpose(int ** M, int m, int n) {
  int ** MT = allocMatrix(n, m);
  int i, j;
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      MT[j][i] = M[i][j];
  return MT;
}

// Auxiliary function: dot product of two vectors of length n
int dotProd(int * V1, int * V2, int n) {
  int dp = 0;
  int i;
  for (i = 0; i < n; i++)
    dp = dp + V1[i] * V2[i];
  return dp;
}

// Master: spread the product M1*M2 (an m*m result) over m workers;
// worker i+1 computes row i. M2 is passed in transposed form (M2T),
// so each column of M2 is a contiguous row of M2T and can be sent
// with a single MPI_Send.
int ** matrixProdMaster(int ** M1, int ** M2T, int m, int n) {
  int i, j;
  MPI_Status status;
  int ** M3 = allocMatrix(m, m);
  // The matrix5 improvement: send each worker its row of M1 only once.
  for (i = 0; i < m; i++)
    MPI_Send(M1[i], n, MPI_INT, i+1, 0, MPI_COMM_WORLD);
  // Stream the columns of M2 to all workers, one column at a time,
  // then collect one entry of column j of M3 from each worker.
  for (j = 0; j < m; j++) {
    for (i = 0; i < m; i++)
      MPI_Send(M2T[j], n, MPI_INT, i+1, 0, MPI_COMM_WORLD);
    for (i = 0; i < m; i++)
      MPI_Recv(&(M3[i][j]), 1, MPI_INT, i+1, 0, MPI_COMM_WORLD, &status);
  }
  return M3;
}

// Worker: compute one row of the resulting m*m matrix
void matrixProdWorker(int m, int n) {
  MPI_Status status;
  int dp, i;
  int * R = allocVector(n);
  int * C = allocVector(n);
  // Receive this worker's row of M1 once, ...
  MPI_Recv(R, n, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
  // ... then return one dot product per incoming column of M2.
  for (i = 0; i < m; i++) {
    MPI_Recv(C, n, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    dp = dotProd(R, C, n);
    MPI_Send(&dp, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
  }
}

int main(int argc, char ** argv) {
  int p, id, m, n;
  FILE * fin;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &p);
  MPI_Comm_rank(MPI_COMM_WORLD, &id);
  if (id == 0) {
    if (argc < 2 || (fin = fopen(argv[1], "r")) == NULL) {
      fprintf(stderr, "usage: %s file\n", argv[0]);
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    fscanf(fin, "%d %d", &m, &n);
    int ** M1 = allocMatrix(m, n);
    int ** M2 = allocMatrix(n, m);
    readMatrix(fin, M1, m, n);
    readMatrix(fin, M2, n, m);
    fclose(fin);
    // Tell every worker the problem size.
    int i;
    for (i = 1; i < p; i++) {
      MPI_Send(&m, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
      MPI_Send(&n, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
    }
    if (p == m+1) {  // exactly one worker per row, plus the master
      int ** M2T = transpose(M2, n, m);
      int ** M3 = matrixProdMaster(M1, M2T, m, n);
      writeMatrix(stdout, M3, m, m);
    }
    else
      printf("Must have %d processes\n", m+1);
  }
  else {
    MPI_Status status;
    MPI_Recv(&m, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(&n, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    if (p == m+1)
      matrixProdWorker(m, n);
  }
  MPI_Finalize();
  return 0;
}
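
/* Worked example (my own illustrative data, not from the original file).
 * The input file holds m and n, then M1 (m*n) and M2 (n*m) row by row,
 * matching the reads in main(). For m = 2, n = 3:
 *
 *     2 3
 *     1 2 3
 *     4 5 6
 *     7 8
 *     9 10
 *     11 12
 *
 * Run with exactly m+1 = 3 processes:
 *
 *     mpirun -np 3 matrix5 file
 *
 * Expected output (M3 = M1*M2, one row per line, entries space-separated):
 *
 *     58 64
 *     143 154
 */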
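
/* Design note (a possible further refinement, not implemented here):
 * each column of M2 is still sent m times, once per worker. A collective
 * broadcast would transmit it once per process instead. Assuming the same
 * loop structure, the master's inner send loop would become a single
 *
 *     MPI_Bcast(M2T[j], n, MPI_INT, 0, MPI_COMM_WORLD);
 *
 * and each worker would replace its MPI_Recv of C with the matching call
 *
 *     MPI_Bcast(C, n, MPI_INT, 0, MPI_COMM_WORLD);
 */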