/* 
   vector addition, using an expensive sum 
   Based on example from the slides by Michele Weiland, EPCC:
    http://www.macs.hw.ac.uk/~hwloidl/Courses/F21DP/UPC_GuestLecture12.pdf

   Env:      export PATH=/u1/staff/dsg/OPT/local/berkeley_upc/bin:$PATH
   Compile:  upcc --pthreads -o v4 vecadd4.c
   Run:      export UPC_PTHREADS_PER_PROC=1; upcrun -n=8 v4 555555
   Measure:  for ((n=1; n<9; n++)) ; do  upcrun -n=$n v4 555555 ; done
*/

#include <upc.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Upper bound on the vector length main() accepts: 1000000 elements per
   UPC thread.
   NOTE(review): the expansion is not parenthesized, so N is only safe in
   contexts where a bare `1000000*THREADS` parses as intended (true for
   the uses in this file). */
#define	N	1000000*THREADS

/* Input vectors v1 and v2 and the result vector, each N elements long.
   They are declared `shared` without an explicit layout qualifier, so
   the UPC default distribution across threads applies. */
shared	int	v1[N],  v2[N],  v1plusv2[N];
/* clock() readings taken by main() immediately before and after the timed
   region.  Not declared `shared`, so each thread has its own pair. */
double startTime, stopTime;

/* Deliberately slow maximum of two non-negative integers.
   Both arguments are decremented in lock-step until one of them reaches
   zero; the other argument is then the surplus, and one is added back
   for every level of recursion on the way out.  For x, y >= 0 the
   recursion depth is min(x, y) and the result is max(x, y).
   If both arguments are negative no base case is ever reached. */
int bogo_max (int x, int y) {
  if (x == 0)
    return y;
  if (y == 0)
    return x;
  return bogo_max(x - 1, y - 1) + 1;
}

/* Deliberately slow sum of two non-negative integers.
   x0 is the larger operand (found with the equally slow bogo_max) and y0
   the smaller one; the result is built by starting at x0 and counting up
   one step at a time, y0 times, so the cost stays proportional to the
   operands.

   The previous version started from 0 and added x0 in every iteration,
   which yields x0*y0 (the product) rather than the sum, and overflows a
   signed int for operands above 46340.

   Returns x + y for x, y >= 0. */
int bogo_sum (int x, int y) {
  int i, s, x0, y0;
  x0 = bogo_max(x,y);
  /* whichever operand is not the maximum; on a tie either one will do */
  y0 = (x==x0) ? y : x;
  s = x0;
  for (i=0; i<y0; i++) s+=1;
  return s;
}

int main(int argc, char **argv) {
  int i, j, n;

  if (argc<2) {
    fprintf(stderr, "Usage: ./vecadd4 <vector length>");
    exit(1);
  } 

  n = atoi(argv[1]);

  if (n>N) {
    fprintf(stderr, "Too many elements %d; can only handle up to %d elements");
    exit(1);
  }

  for (j=0; j<n ; j++) { v1[j] = j % 65536 ; v2[j] = j % 65536; }
  
  startTime = clock();
  upc_barrier;
  upc_forall(i=0; i<n; i++; i)
    v1plusv2[i] = bogo_sum(v1[i], v2[i]);
  upc_barrier;
  stopTime = clock();
  if (MYTHREAD==0) 
    printf("vecadd of 2 vectors of length %d, using %d threads; %f secs\n",
	   n, THREADS, (stopTime-startTime)/CLOCKS_PER_SEC);
}
