// Sum-with-average example, but now over an 2D array rather than an interval (problems when using TIMER, though)
// This version covers:
// . row-wise iteration over matrix in C (sumav_ij_c)
// . column-wise iteration over matrix in C (sumav_ji_c)
// . row-wise iteration over matrix in Asm (sumav_asm)
// . row-wise iteration over matrix in Asm w/ bad branch prediction (sumav_asm_bad_pred)
// Compile:  gcc -DASM -UTIMER -UDEBUG -UCHECK   -o sumav3_asm sumav3_asm.c
// Run:      ./sumav3_asm 8000
// NB: uses RPi's builtin timer with -DTIMER flag for higher precision timing on such short runs
// NB: should use -O or -O2 in compilation for better performance

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <time.h>

// =======================================================
// Tunable:
// // run assembler version (as well as C version)?
// ASM
// // use RPi's on-chip timer (rather than system calls)?
// #undef TIMER
// // debugging info?
// #undef DEBUG
// // check the result?
// #define CHECK
// =======================================================

#if defined(TIMER)
# include <sys/mman.h>
# include <stdint.h>
# include <fcntl.h>
#include <errno.h>

# define BLOCK_SIZE              (4*1024)
#endif

#ifdef COLOR
# ifdef LIGHT_ON_BLACK
# define HI_ON "\033[1;33m"
# define HI_OFF "\033[1;37m"
# else
# define HI_ON "\033[0;31m"
# define HI_OFF "\033[0;30m"
# endif
#else
# define HI_ON ""
# define HI_OFF ""
#endif

// Note: the following is already defined in sys/types.h
// typedef unsigned long int ulong;

// Note: the following is already defined in stdint.h
// typedef unsigned int            uint32_t

typedef struct {
  ulong count;
  ulong sum;
  ulong avg;
} result_t;

result_t res1, res2, res3, res03, res4, res5, res_ex;

#if defined(TIMER)
static volatile unsigned int timerbase ;
static volatile uint32_t *timer ;

typedef uint32_t some_timer_t;
#else
typedef double some_timer_t;
#endif

// -----------------------------------------------------------------------------

void show_result(result_t *res) {
  // printf("count: %d, sum: %d, average: %d\n", res->count, res->sum, res->avg); WRONG (prints large unsigned no.s as neg no.s)
  printf("count: %u, sum: %u, average: %u\n", res->count, res->sum, res->avg);
}

// sum and average over an array
static inline void sumav_ij_c(ulong n, ulong *arr, result_t *res1) {
  int i, j, count;
  ulong  sum;

#if defined(DEBUG)
  fprintf(stderr, ".. C_ij version (row-major) w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *(arr), n, n);
#endif
  count = sum = 0;
  for (i = 0; i<n; i++) 
    for (j = 0; j<n; j++) {
      sum += *(arr+(i*n+j)); 
      count++;
    }
   res1->count = count;
   res1->sum = sum;
   res1->avg = sum/count;
}

// sum and average over an array
static inline void sumav_ji_c(ulong n, ulong *arr, result_t *res1) {
  int i, j, count;
  ulong  sum;

#if defined(DEBUG)
  fprintf(stderr, ".. C_ji (col-major) version w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n);
#endif
  count = sum = 0;
  for (j = 0; j<n; j++) 
    for (i = 0; i<n; i++) {
      sum += *(arr+(i*n+j)); 
      count++;
    }
   res1->count = count;
   res1->sum = sum;
   res1->avg = sum/count;
}


#ifdef ASM
static inline void sumav_asm(int n, ulong *arr, result_t *res1) {
#if defined(DEBUG)
  fprintf(stderr, ".. Assembler version w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n);
#endif
   asm volatile(/* inline assembler version of sum-with-average */
	        ".global		_sumav_ij_asm\n"
	        "\t  		MOVS     R3, #0x00	@ initialise result register\n"
	        "\t  		LDR      R2, %[arrp]	@ initialise pointer to array\n"
	        "\t 		MOVS     R5, #0x00	@ OVERALL counter\n"
	        "\t 		MOVS     R4, #0x00	@ initialise accumulator\n"
	        "\t 		MOVS     R1, #0x00	@ initialise col-count register\n"
	        "\t 		MOVS     R0, #0x00	@ initialise row-count register\n"
	        "\t		B        TESTi%=		@ uncond. jump\n"
	        "LOOPi%=:		                @ loop over rows in R0\n"
	        "LOOPj%=:		                @ loop over cols in R1\n"
	        "\t		LDR      R4, [R2, #0]	@ add contents at location R2 to sum in R3\n"
	        "\t		ADD      R3, R3, R4	@ add contents at location R2 to sum in R3\n"
	        "\t		ADD      R2, R2, #4	@ increment pointer	\n"
	        "\t		ADD      R1, R1, #1	@ increment counter	\n"
	        "\t		ADD      R5, R5, #1	@ increment OVERALL counter (silly!!)	\n"
	        "TESTj%=:	CMP      R1, %[len]	@ test end value\n"
	        "\t		BLT      LOOPj%=\n"
	        "\t		MOV      R1, #00	@ re-init col-counter	\n"
	        "\t		ADD      R0, R0, #1	@ increment counter	\n"
	        "TESTi%=:	CMP      R0, %[len]	@ test end of outer loop (rows)\n"
	        "\t		BLT      LOOPi%=\n"
	        "\t		@SUBS     R1, R1, #1	@ adjust counter	\n"
		"\t             MUL      R2, R0, R1\n"
	        "\t		STR	 R5, [%[resultp], #00]		@ store count component in memory\n"
	        "\t		STR	 R3, [%[resultp], #04]		@ store sum component in memory\n"
		: 
	        : [len] "r" (n)
		  , [arrp] "m" (arr)
		  , [resultp] "r" (res1)
	        : "r0", "r1", "r2", "r3", "r4", "r5", "cc");
   res1->avg = res1->sum/res1->count;
#if defined(DEBUG)
   fprintf(stderr, ".. Assembler result: sum = %lu; count = %lu\n", res1->sum, res1->count);
#endif
}

static inline void sumav_asm_wb(int n, ulong *arr, result_t *res1) {
#if defined(DEBUG)
  fprintf(stderr, ".. Assembler version (with write-back instructions) w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n);
#endif
   asm volatile(/* inline assembler version of sum-with-average */
	        ".global		_sumav_ij_asm\n"
	        "\t  		LDR      R2, %[arrp]	@ initialise pointer to array\n"
	        "\t		LDR      R3, [R2]	@ initialise accumulator\n"
	        "\t 		MOVS     R5, #0x00	@ OVERALL counter\n"
	        "\t 		MOVS     R1, #0x01	@ initialise col-count register\n"
	        "\t 		MOVS     R0, #0x00	@ initialise row-count register\n"
	        "\t		B        TESTi%=	@ uncond. jump\n"
	        "LOOPi%=:		                @ loop over rows in R0\n"
	        "LOOPj%=:		                @ loop over cols in R1\n"
	        "\t		LDR      R4, [R2, #4]!	@ add contents at location R2 to sum in R3 and write-back address to R2\n"
	        "\t		ADD      R3, R3, R4	@ add contents at location R2 to sum in R3\n"
	        "\t		ADD      R1, R1, #1	@ increment counter	\n"
	        "\t		ADD      R5, R5, #1	@ increment OVERALL counter (silly!!)	\n"
	        "TESTj%=:	CMP      R1, %[len]	@ test end value\n"
	        "\t		BLT      LOOPj%=\n"
	        "\t		MOV      R1, #00	@ re-init col-counter	\n"
	        "\t		ADD      R0, R0, #1	@ increment counter	\n"
	        "TESTi%=:	CMP      R0, %[len]	@ test end of outer loop (rows)\n"
	        "\t		BLT      LOOPi%=\n"
	        "\t		@SUBS     R1, R1, #1	@ adjust counter	\n"
		"\t             MUL      R2, R0, R1\n"
	        "\t		STR	 R5, [%[resultp], #00]		@ store count component in memory\n"
	        "\t		STR	 R3, [%[resultp], #04]		@ store sum component in memory\n"
		: 
	        : [len] "r" (n)
		  , [arrp] "m" (arr)
		  , [resultp] "r" (res1)
	        : "r0", "r1", "r2", "r3", "r4", "r5", "cc");
   res1->avg = res1->sum/res1->count;
#if defined(DEBUG)
   fprintf(stderr, ".. Assembler result: sum = %lu; count = %lu\n", res1->sum, res1->count);
#endif
}

static inline void sumav_asm_bad_pred(int n, ulong *arr, result_t *res1) {
#if defined(DEBUG)
  fprintf(stderr, ".. Assembler (bad branch pred) version w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n);
#endif
   asm volatile(/* inline assembler version of sum-with-average */
	        ".global		_sumav_ij_asm\n"
	        "\t  		MOVS     R3, #0x00	@ initialise result register\n"
	        "\t  		LDR      R2, %[arrp]	@ initialise pointer to array\n"
	        "\t 		MOVS     R5, #0x00	@ OVERALL counter\n"
	        "\t 		MOVS     R1, #0x00	@ initialise col-count register\n"
	        "\t 		MOVS     R0, #0x00	@ initialise row-count register\n"
	        "\t		B        TESTi%=	@ uncond. jump\n"
	        "LOOPi%=:		                @ loop over rows in R0\n"
	        "LOOPj%=:		                @ loop over cols in R1\n"
	        "\t		LDR      R4, [R2, #0]	@ add contents at location R2 to sum in R3\n"
	        "\t		ADD      R3, R3, R4	@ add contents at location R2 to sum in R3\n"
	        "\t		ADD      R2, R2, #4	@ increment pointer	\n"
	        "\t		ADD      R1, R1, #1	@ increment counter	\n"
	        "\t		ADD      R5, R5, #1	@ increment OVERALL counter (silly!!)	\n"
	        "TESTj%=:	CMP      R1, %[len]	@ test end value\n"
	        "\t		BGE      BONZOj%=       @ BAD branch (for prediction): taken only once\n"
	        "\t		B        LOOPj%=\n"
		"BONZOj%=:                              @ fall through\n"
	        "\t		MOV      R1, #00	@ re-init col-counter	\n"
	        "\t		ADD      R0, R0, #1	@ increment counter	\n"
	        "TESTi%=:	CMP      R0, %[len]	@ test end of outer loop (rows)\n"
	        "\t		BGE      BONZOi%=        @ BAD branch (for prediction): taken only once\n"
	        "\t		B        LOOPi%=\n"
		"BONZOi%=:                              @ fall through\n"
	        "\t		@SUBS     R1, R1, #1	@ adjust counter	\n"
		"\t             MUL      R2, R0, R1\n"
	        "\t		STR	 R5, [%[resultp], #00]		@ store count component in memory\n"
	        "\t		STR	 R3, [%[resultp], #04]		@ store sum component in memory\n"
		: 
	        : [len] "r" (n)
		  , [arrp] "m" (arr)
		  , [resultp] "r" (res1)
	        : "r0", "r1", "r2", "r3", "r4", "r5", "cc");
   res1->avg = res1->sum/res1->count;
#if defined(DEBUG)
   fprintf(stderr, ".. Assembler result: sum = %lu; count = %lu\n", res1->sum, res1->count);
#endif
}

#if 1
// main version for bad branch pred
static inline void sumav_asm_bad_pred0(int n, ulong *arr, result_t *res1) {
#if defined(DEBUG)
  fprintf(stderr, ".. Assembler (bad branch pred) version w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n);
#endif
   asm volatile(/* inline assembler version of sum-with-average */
	        ".global		_sumav_ij0_asm\n"
	        "\t  		MOVS     R3, #0x00	@ initialise result register\n"
	        "\t  		LDR      R2, %[arrp]	@ initialise pointer to array\n"
	        "\t 		MOVS     R5, #0x00	@ OVERALL counter\n"
	        "\t 		MOVS     R1, #0x00	@ initialise col-count register\n"
	        "\t 		MOVS     R0, #0x00	@ initialise row-count register\n"
	        "\t		@ B        TESTi%=		@ uncond. jump\n"
	        "CHECKi%=:      CMP     R0, %[len]	@ test end of outer loop (rows)\n"
	        "\t		BGE      LEAVEi%=       @ BAD branch (for prediction): taken only once\n"
	        "CHECKj%=:      CMP      R1, %[len]	@ test end value\n"
	        "\t		BGE      LEAVEj%=       @ BAD branch (for prediction): taken only once\n"
	        "\t		LDR      R4, [R2, #0]	@ add contents at location R2 to sum in R3\n"
	        "\t		ADD      R3, R3, R4	@ add contents at location R2 to sum in R3\n"
	        "\t		ADD      R2, R2, #4	@ increment pointer	\n"
	        "\t		ADD      R1, R1, #1	@ increment counter	\n"
	        "\t		ADD      R5, R5, #1	@ increment OVERALL counter (silly!!)	\n"
	        "\t		B        CHECKj%=	@ uncond. jump\n"

		"LEAVEj%=:                                @ leaving row-loop\n"
	        "\t		MOV      R1, #00	@ re-init col-counter	\n"
	        "\t		ADD      R0, R0, #1	@ increment counter	\n"
	        "\t		B        CHECKi%=	@ uncond. jump\n"

		"LEAVEi%=:                              @ done with row-loop\n"
	        "\t		@SUBS     R1, R1, #1	@ adjust counter	\n"
		"\t             MUL      R2, R0, R1\n"
	        "\t		STR	 R5, [%[resultp], #00]		@ store count component in memory\n"
	        "\t		STR	 R3, [%[resultp], #04]		@ store sum component in memory\n"
		: 
	        : [len] "r" (n)
		  , [arrp] "m" (arr)
		  , [resultp] "r" (res1)
	        : "r0", "r1", "r2", "r3", "r4", "r5", "cc");
   res1->avg = res1->sum/res1->count;
#if defined(DEBUG)
   fprintf(stderr, ".. Assembler result: sum = %lu; count = %lu\n", res1->sum, res1->count);
#endif
}
#endif /* 1 */

#endif

/* Initialise the secret sequence of colors (of legnth seqlen) */
void initArr(ulong *arr, int len) {
  int i, j, r;
  ulong x;
  if (arr==NULL)
    arr = (ulong*)malloc(len*len*sizeof(ulong));

#ifdef CHECK
  res_ex.count = 0;
  res_ex.sum = 0;
#endif
  srand((unsigned int)1701); 
  // srand((unsigned int)time(0));
  for (i=0; i<len; i++) {
    // arr[i] = (ulong*)malloc(len*sizeof(ulong));
    for (j=0; j<len; j++) {
      r = rand();
      x = (ulong)(rand() % 65535L + 1); // (rand() * (65535L / RAND_MAX) % 65535L + 1);
      // fprintf(stderr, "Random value btw 1 and 65536 is (RAND_MAX %d; r = %d): %d\n", RAND_MAX, r, x);
      // x = i*len+j + 10;
      *(arr+(i*len+j)) = x;
#ifdef CHECK
      res_ex.sum += x;
#endif
    }
  }
}


#if defined(TIMER)
void rpi_init_timer(){
  int   fd ;
  timerbase = (unsigned int)0x3F003000 ;

  if ((fd = open ("/dev/mem", O_RDWR | O_SYNC | O_CLOEXEC) ) < 0) {
    fprintf (stderr, "wiringPiSetup: Unable to open /dev/mem: %s\n", strerror (errno)) ;
    exit(2);
  }

  // TIMER:
  timer = (int32_t *)mmap(0, BLOCK_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, timerbase) ;
  if ((int32_t)timer == -1) {
    fprintf (stderr, "wiringPiSetup: mmap (TIMER) failed: %s\n", strerror (errno)) ;
    exit(2);
  }
  fprintf(stderr, "NB: timer = %x for timerbase %x\n", timer, timerbase);
}

// this 1-liner is the clock() function on the RPi2 using its on-chip timer; TODO: cover hi-bits as well
uint32_t rpi_clock() { fprintf (stderr, "Reading from %x\n", (timer+1)); return *(timer+1); }


#if 0
// DEAD CODE: DELETE ME

// this is the version from the arm-rpi-tutorial
// reads the low 32-bits of the RPi timer and uses it
// to wait a certain amount of time
typedef struct {
    volatile uint32_t control_status;
    volatile uint32_t counter_lo;
    volatile uint32_t counter_hi;
    volatile uint32_t compare0;
    volatile uint32_t compare1;
    volatile uint32_t compare2;
    volatile uint32_t compare3;
    } rpi_sys_timer_t;

// orig from a delay function, hence the loop;
uint32_t RPI_WaitMicroSeconds( uint32_t us )
{
  volatile uint32_t ts = ((rpi_sys_timer_t*)timer/*timerbase*/)->counter_lo;
  uint32_t n=0, m=0;
  uint32_t last, curr;

#if defined(DEBUG)
  fprintf (stderr, "Waiting for %d micro-seconds (reading counter value from %x)\n", us, &((rpi_sys_timer_t*)timer/*timerbase*/)->counter_lo) ; 
  last = ((rpi_sys_timer_t*)timer/*timerbase*/)->counter_lo;
#endif
  while( ( (curr=((rpi_sys_timer_t*)timer/*timerbase*/)->counter_lo) - ts ) < us )
    {
        /* BLANK */
#if defined(DEBUG)
      if (curr == last) {
	n++;
      } else {
	n++; m++;
      }
      if (n % (uint32_t)1000000 == 0) {
	fprintf(stderr, "%d iterations, spotted %d differences\n", n, m);
      }
#endif
    }
  return curr;
}
#endif  /* 0 */

#if 0
// DEAD CODE: DELETE ME
uint32_t rpi_clock(uint32_t howLong) 
{
  volatile uint32_t ts = *(timer+4);
  uint32_t n=0, m=0;
  uint32_t last, curr;

#if defined(DEBUG)
  fprintf (stderr, "Waiting for %d micro-seconds (reading counter value from %x)\n", howLong, (timer+4));
  last = *(timer+4); 
#endif
  while( ( (curr=*(timer+4)) - ts ) < howLong )
    {
        /* BLANK */
#if defined(DEBUG)
      if (curr == last) {
	n++;
      } else {
	n++; m++;
      }
      if (n % (uint32_t)1000000 == 0) {
	fprintf(stderr, "%d iterations, spotted %d differences\n", n, m);
      }
#endif
    }
  return last;
}
#endif 

#endif  /* TIMER */

#if defined(TIMER)
static inline void init_timer() { rpi_init_timer(); }
static inline some_timer_t get_time() { return /* RPI_WaitMicroSeconds(10); */   rpi_clock(10); }
static inline double time_diff(some_timer_t startTime, some_timer_t stopTime) { return (stopTime-startTime)/1000000.0 ; /* clock-rate is 1MHz hence in micro-sec */ }
#else
static inline void init_timer() { /* nothing */ }
static inline some_timer_t get_time() { return clock(); }
static inline double time_diff(some_timer_t startTime, some_timer_t stopTime) { return (stopTime-startTime)/CLOCKS_PER_SEC ; }
#endif

/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */

main(int argc,char ** argv) { 
  int n;
  ulong *arr;

  if (argc<2) {
    fprintf(stderr, "Usage: %s <n> ... compute sum and average over an array of <len> random int values\n", argv[0]);
    exit(1);
  }
  n = atoi(argv[1]);
  fprintf(stderr, "Length = %d\n", n);
  arr = (ulong*)malloc(n*n*sizeof(ulong)); // all allocation now done in initArr
  initArr(arr, n);

  fprintf(stderr, "sizeof(ulong) = %d\n", sizeof(ulong));

#if 1
  init_timer();
  {
    /* double */ some_timer_t startTime, stopTime;
    // fprintf(stdout, "Computing sum and average from 1 to %d in C  (expect %d)...\n", n, (n*(n+1)/2));  // WRONG: overflow on large n's
    fprintf(stdout, "--------------------------------------------------------------------------- \n");
    fprintf(stdout, "%sRow-wise%s sum and average over an %d x %d matrix in C ...\n", HI_ON, HI_OFF, n, n);
    fflush(stdout);
    startTime = get_time();
    sumav_ij_c(n, arr, &res1);
    stopTime = get_time();
    show_result(&res1);
#ifdef CHECK
    if (res1.sum == res_ex.sum)
      printf("%sResult OK.%s\n", HI_ON, HI_OFF);
    else
      printf("WRONG result: computed %lu, expected %lu\n", res1.sum, res_ex.sum);
#endif
    printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime));
    // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime);
  }
#endif 

#if 1
  {
    /* double */ some_timer_t startTime, stopTime;
    // fprintf(stdout, "Computing sum and average from 1 to %d in C  (expect %d)...\n", n, (n*(n+1)/2));  // WRONG: overflow on large n's
    fprintf(stdout, "--------------------------------------------------------------------------- \n");
    fprintf(stdout, "%sColumn-wise%s sum and average over an %d x %d matrix in C ...\n", HI_ON, HI_OFF, n, n);
    fflush(stdout);
    startTime = get_time();
    sumav_ji_c(n, arr, &res2);
    stopTime = get_time();
    show_result(&res2);
#ifdef CHECK
    if (res2.sum == res_ex.sum)
      printf("%sResult OK.%s\n", HI_ON, HI_OFF);
    else
      printf("WRONG result: computed %lu, expected %lu\n", res2.sum, res_ex.sum);
#endif
    printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime));
    // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime);
  }
#endif

#ifdef ASM
  {
    /* double */ some_timer_t startTime, stopTime;
    fprintf(stdout, "--------------------------------------------------------------------------- \n");
    fprintf(stdout, "Computing sum and average over  an %d x %d matrix (row-wise iteration) in assembler ...\n", n, n);
    fflush(stdout);
    startTime = get_time();
    sumav_asm(n, arr, &res3);
    stopTime = get_time();
    show_result(&res3);
#ifdef CHECK
    if (res3.sum == res_ex.sum)
      printf("%sResult OK.%s\n", HI_ON, HI_OFF);
    else
      printf("WRONG result: computed %lu, expected %lu\n", res3.sum, res_ex.sum);
#endif
    printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime));
    // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime);
  }

  {
    /* double */ some_timer_t startTime, stopTime;
    fprintf(stdout, "--------------------------------------------------------------------------- \n");
    fprintf(stdout, "Computing sum and average over an %d x %d matrix (row-wise iteration) in assembler (with write-back) ...\n", n, n);
    fflush(stdout);
    startTime = get_time();
    sumav_asm_wb(n, arr, &res03);
    stopTime = get_time();
    show_result(&res03);
#ifdef CHECK
    if (res03.sum == res_ex.sum)
      printf("%sResult OK.%s\n", HI_ON, HI_OFF);
    else
      printf("WRONG result: computed %lu, expected %lu\n", res03.sum, res_ex.sum);
#endif
    printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime));
    // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime);
  }

  {
    /* double */ some_timer_t startTime, stopTime;
    fprintf(stdout, "--------------------------------------------------------------------------- \n");
    fprintf(stdout, "Computing sum and average over an %d x %d matrix (row-wise iteration) in assembler (bad branch pred) ...\n", n, n);
    fflush(stdout);
    startTime = get_time();
    sumav_asm_bad_pred(n, arr, &res4);
    stopTime = get_time();
    show_result(&res4);
#ifdef CHECK
    if (res3.sum == res_ex.sum)
      printf("%sResult OK.%s\n", HI_ON, HI_OFF);
    else
      printf("WRONG result: computed %lu, expected %lu\n", res4.sum, res_ex.sum);
#endif
    printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime));
    // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime);
  }
  {
    /* double */ some_timer_t startTime, stopTime;
    fprintf(stdout, "--------------------------------------------------------------------------- \n");
    fprintf(stdout, "Computing sum and average over an %d x %d matrix (row-wise iteration) in assembler (another bad branch pred) ...\n", n, n);
    fflush(stdout);
    startTime = get_time();
    sumav_asm_bad_pred(n, arr, &res5);
    stopTime = get_time();
    show_result(&res5);
#ifdef CHECK
    if (res5.sum == res_ex.sum)
      printf("%sResult OK.%s\n", HI_ON, HI_OFF);
    else
      printf("WRONG result: computed %lu, expected %lu\n", res5.sum, res_ex.sum);
#endif
    printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime));
    // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime);
  }
#endif
  exit(0);
}

/* Measurements (on my Rpi2):
pi@hawopi ~/F28HS $ gcc -UASM -UTIMER -UDEBUG -DCHECK   -o sumav3_asm sumav3_asm.c
pi@hawopi ~/F28HS $ ./sumav3_asm 8000Length = 8000
sizeof(ulong) = 4
Row-wise sum and average over an 8000 x 8000 matrix in C ...
count: 64000000, sum: 1336439042, average: 20
WRONG result: computed 1336439042l, expected 1336404922l
Elapsed time: 0.540000 secs
Column-wise sum and average over an 8000 x 8000 matrix in C ...
count: 64000000, sum: 1336404922, average: 20
Result OK.
Elapsed time: 12.150000 secs
*/
