/* 
   matrix8.c -  Parallel Matrix Multiplication
                improvement on matrix7: timing

   compile: mpicc -Wall -O -o matrix8 matrix8.c
   run:     mpirun -np num_procs matrix8 file
*/

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>

/* Aux fcts ------------------------------------------------------- */
int * allocVector(int n)
{
  return (int *)malloc(n*sizeof(int));
}

int ** allocMatrix(int m, int n)
{
  int ** newM = (int **)malloc(m*sizeof(int *));
  int i;
  for (i = 0; i < m; i++)
    newM[i] = allocVector(n);
  return newM;
}

void readMatrix(FILE * fin, int ** M, int m, int n)
{
  int i, j;
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      fscanf(fin, "%d", &(M[i][j]));
}

void writeMatrix(FILE * fout, int ** M, int m, int n)
{
  int i, j;
  for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++)
      fprintf(fout, "%d ", M[i][j]);
    putc('\n', fout);
  }
}

/* Auxiliary function: transpose m*n matrix into n*m matrix */
int ** transpose(int ** M, int m, int n)
{
  int ** MT;
  int i, j;
  MT = allocMatrix(n, m);
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      MT[j][i] = M[i][j];
  return MT;
}


/* Auxiliary function: dot product of two vectors of length n */
int dotProd(int * V1, int * V2, int n)
{
  int dp = 0;
  int i;
  for (i = 0; i < n; i++)
    dp = dp + V1[i] * V2[i];
  return dp;
}


/* Master: Spread mult M1*M2 (a m*m matrix) over p workers;
           each computes approx m/p rows */
int ** matrixProdMaster(int ** M1, int ** M2T, int m, int n, int p)
{
  int i, j, r, s;
  MPI_Status status;

  int ** M3 = allocMatrix(m,m);

  s = m/p;
  r = m%p;

  // First r workers get sent s+1 rows of M1
  for (i = 0; i < r; i++)
    for (j = 0; j < s+1; j++)
      MPI_Send(M1[i * (s+1) + j], n, MPI_INT, i+1, 0, MPI_COMM_WORLD);

  // Remaining p-r workers get sent s rows of M1
  for (i = r; i < p; i++)
    for (j = 0; j < s; j++)
      MPI_Send(M1[r + i * s + j], n, MPI_INT, i+1, 0, MPI_COMM_WORLD);

  // All rows of M2T sent to all workers
  for (j = 0; j < m; j++)
    for (i = 0; i < p; i++)
      MPI_Send(M2T[j], n, MPI_INT, i+1, 0, MPI_COMM_WORLD);

  // First r workers deliver s+1 rows of M3
  for (i = 0; i < r; i++)
    for (j = 0; j < s+1; j++)
      MPI_Recv(M3[i * (s+1) + j], m, MPI_INT, i+1, 0, MPI_COMM_WORLD, &status);

  // Remaining p-r workers deliver s rows of M3
  for (i = r; i < p; i++)
    for (j = 0; j < s; j++)
      MPI_Recv(M3[r + i * s + j], m, MPI_INT, i+1, 0, MPI_COMM_WORLD, &status);

  return M3;
}


/* Worker: Compute approx m/p rows of resulting m*m matrix */
void matrixProdWorker(int m, int n, int p, int id)
{
  MPI_Status status;
  int i, j, s, r;

  s = m/p;
  r = m%p;
  if (id <= r)
    s++;

  int ** R = allocMatrix(s, n);
  int * C = allocVector(n);
  int ** RR = allocMatrix(s, m);

  for (i = 0; i < s; i++)
    MPI_Recv(R[i], n, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);

  for (j = 0; j < m; j++) {
    MPI_Recv(C, n, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    for (i = 0; i < s; i++)
      RR[i][j] = dotProd(R[i], C, n);
  }

  for (i = 0; i < s; i++)
    MPI_Send(RR[i], m, MPI_INT, 0, 0, MPI_COMM_WORLD);
}


/* Main fct ------------------------------------------------------- */
int main(int argc, char ** argv)
{
  int p, id, m, n;
  double elapsed_time;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &p);
  MPI_Comm_rank(MPI_COMM_WORLD, &id);

  if (id == 0) { /* master porcessor */
    FILE * fin = fopen(argv[1], "r");
    fscanf(fin, "%d %d", &m, &n);
    int ** M1 = allocMatrix(m,n);
    int ** M2 = allocMatrix(n,m);
    readMatrix(fin, M1, m, n);
    readMatrix(fin, M2, n, m);
    fclose(fin);

    // start the timer
    MPI_Barrier(MPI_COMM_WORLD);
    elapsed_time = - MPI_Wtime();

    int i;
    for(i = 1; i < p; i++) {
      MPI_Send(&m, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
      MPI_Send(&n, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
    }

    if (p > 1) {
      int ** M2T = transpose(M2, n, m);
      int ** M3 = matrixProdMaster(M1, M2T, m, n, p-1);

      // stop the timer
      elapsed_time += MPI_Wtime();

      // write matrix to /dev/null
      FILE * fout = fopen("/dev/null", "w");
      writeMatrix(fout, M3, m, m);
      // printf("Multiplying %dx%d matrices on %d workers: %f secs\n", m, n, p-1, elapsed_time);
      // printf("%d %d %2d %2f\n", m, n, p-1, elapsed_time);
      printf("%d * %d; %2d processors; %f secs\n",
             m,n,p,elapsed_time);

    }
    else
      printf("Must have at least 2 processors\n");
  }
  else {
    MPI_Status status;

    // start the timer
    MPI_Barrier(MPI_COMM_WORLD);
    // elapsed_time = - MPI_Wtime();

    MPI_Recv(&m, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(&n, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);

    matrixProdWorker(m, n, p-1, id);

    // stop the timer
    // elapsed_time += MPI_Wtime();
  }

  MPI_Finalize();
  return 0;
}