/* matrix5.c - Parallel Matrix Multiplication
 *
 * Improvement on matrix4.c: each worker receives its row of M1 once,
 * instead of once per column of M2.
 *
 * compile: mpicc -Wall -O -o matrix5 matrix5.c
 * run:     mpirun -np num_procs matrix5 file
 */

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

/* Auxiliary functions --------------------------------------------- */

int * allocVector(int n) {
  return (int *)malloc(n * sizeof(int));
}

int ** allocMatrix(int m, int n) {
  int ** newM = (int **)malloc(m * sizeof(int *));
  int i;
  for (i = 0; i < m; i++)
    newM[i] = allocVector(n);
  return newM;
}

void readMatrix(FILE * fin, int ** M, int m, int n) {
  int i, j;
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      fscanf(fin, "%d", &(M[i][j]));
}

void writeMatrix(FILE * fout, int ** M, int m, int n) {
  int i, j;
  for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++)
      fprintf(fout, "%d ", M[i][j]);
    putc('\n', fout);
  }
}

// Auxiliary function: transpose an m*n matrix into an n*m matrix
int ** transpose(int ** M, int m, int n) {
  int ** MT = allocMatrix(n, m);
  int i, j;
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      MT[j][i] = M[i][j];
  return MT;
}

// Auxiliary function: dot product of two vectors of length n
int dotProd(int * V1, int * V2, int n) {
  int dp = 0;
  int i;
  for (i = 0; i < n; i++)
    dp = dp + V1[i] * V2[i];
  return dp;
}

// Master: spread the product M1*M2 (an m*m result) over m workers;
// worker i+1 computes row i. M2 is passed in transposed form (M2T),
// so each column of M2 is a contiguous row of M2T and can be sent
// with a single MPI_Send.
int ** matrixProdMaster(int ** M1, int ** M2T, int m, int n) {
  int i, j;
  MPI_Status status;
  int ** M3 = allocMatrix(m, m);
  // The matrix5 improvement: send each worker its row of M1 only once.
  for (i = 0; i < m; i++)
    MPI_Send(M1[i], n, MPI_INT, i+1, 0, MPI_COMM_WORLD);
  // Stream the columns of M2 to all workers, one column at a time,
  // then collect one entry of column j of M3 from each worker.
  for (j = 0; j < m; j++) {
    for (i = 0; i < m; i++)
      MPI_Send(M2T[j], n, MPI_INT, i+1, 0, MPI_COMM_WORLD);
    for (i = 0; i < m; i++)
      MPI_Recv(&(M3[i][j]), 1, MPI_INT, i+1, 0, MPI_COMM_WORLD, &status);
  }
  return M3;
}

// Worker: compute one row of the resulting m*m matrix
void matrixProdWorker(int m, int n) {
  MPI_Status status;
  int dp, i;
  int * R = allocVector(n);
  int * C = allocVector(n);
  // Receive this worker's row of M1 once, ...
  MPI_Recv(R, n, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
  // ... then return one dot product per incoming column of M2.
  for (i = 0; i < m; i++) {
    MPI_Recv(C, n, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    dp = dotProd(R, C, n);
    MPI_Send(&dp, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
  }
}

int main(int argc, char ** argv) {
  int p, id, m, n;
  FILE * fin;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &p);
  MPI_Comm_rank(MPI_COMM_WORLD, &id);
  if (id == 0) {
    if (argc < 2 || (fin = fopen(argv[1], "r")) == NULL) {
      fprintf(stderr, "usage: %s file\n", argv[0]);
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    fscanf(fin, "%d %d", &m, &n);
    int ** M1 = allocMatrix(m, n);
    int ** M2 = allocMatrix(n, m);
    readMatrix(fin, M1, m, n);
    readMatrix(fin, M2, n, m);
    fclose(fin);
    // Tell every worker the problem size.
    int i;
    for (i = 1; i < p; i++) {
      MPI_Send(&m, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
      MPI_Send(&n, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
    }
    if (p == m+1) {  // exactly one worker per row, plus the master
      int ** M2T = transpose(M2, n, m);
      int ** M3 = matrixProdMaster(M1, M2T, m, n);
      writeMatrix(stdout, M3, m, m);
    }
    else
      printf("Must have %d processes\n", m+1);
  }
  else {
    MPI_Status status;
    MPI_Recv(&m, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(&n, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    if (p == m+1)
      matrixProdWorker(m, n);
  }
  MPI_Finalize();
  return 0;
}
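
/* Worked example (my own illustrative data, not from the original file).
 * The input file holds m and n, then M1 (m*n) and M2 (n*m) row by row,
 * matching the reads in main(). For m = 2, n = 3:
 *
 *     2 3
 *     1 2 3
 *     4 5 6
 *     7 8
 *     9 10
 *     11 12
 *
 * Run with exactly m+1 = 3 processes:
 *
 *     mpirun -np 3 matrix5 file
 *
 * Expected output (M3 = M1*M2, one row per line, entries space-separated):
 *
 *     58 64
 *     143 154
 */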
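
/* Design note (a possible further refinement, not implemented here):
 * each column of M2 is still sent m times, once per worker. A collective
 * broadcast would transmit it once per process instead. Assuming the same
 * loop structure, the master's inner send loop would become a single
 *
 *     MPI_Bcast(M2T[j], n, MPI_INT, 0, MPI_COMM_WORLD);
 *
 * and each worker would replace its MPI_Recv of C with the matching call
 *
 *     MPI_Bcast(C, n, MPI_INT, 0, MPI_COMM_WORLD);
 */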