// Sum-with-average example, but now over an 2D array rather than an interval (problems when using TIMER, though) // This version covers: // . row-wise iteration over matrix in C (sumav_ij_c) // . column-wise iteration over matrix in C (sumav_ji_c) // . row-wise iteration over matrix in Asm (sumav_asm) // . row-wise iteration over matrix in Asm w/ bad branch prediction (sumav_asm_bad_pred) // Compile: gcc -DASM -UTIMER -UDEBUG -UCHECK -o sumav3_asm sumav3_asm.c // Run: ./sumav3_asm 8000 // NB: uses RPi's builtin timer with -DTIMER flag for higher precision timing on such short runs // NB: should use -O or -O2 in compilation for better performance #include #include #include #include // ======================================================= // Tunable: // // run assembler version (as well as C version)? // ASM // // use RPi's on-chip timer (rather than system calls)? // #undef TIMER // // debugging info? // #undef DEBUG // // check the result? // #define CHECK // ======================================================= #if defined(TIMER) # include # include # include #include # define BLOCK_SIZE (4*1024) #endif #ifdef COLOR # ifdef LIGHT_ON_BLACK # define HI_ON "\033[1;33m" # define HI_OFF "\033[1;37m" # else # define HI_ON "\033[0;31m" # define HI_OFF "\033[0;30m" # endif #else # define HI_ON "" # define HI_OFF "" #endif // Note: the following is already defined in sys/types.h // typedef unsigned long int ulong; // Note: the following is already defined in stdint.h // typedef unsigned int uint32_t typedef struct { ulong count; ulong sum; ulong avg; } result_t; result_t res1, res2, res3, res03, res4, res5, res_ex; #if defined(TIMER) static volatile unsigned int timerbase ; static volatile uint32_t *timer ; typedef uint32_t some_timer_t; #else typedef double some_timer_t; #endif // ----------------------------------------------------------------------------- void show_result(result_t *res) { // printf("count: %d, sum: %d, average: %d\n", res->count, res->sum, res->avg); WRONG (prints large unsigned no.s as neg no.s) printf("count: %u, sum: %u, average: %u\n", res->count, res->sum, res->avg); } // sum and average over an array static inline void sumav_ij_c(ulong n, ulong *arr, result_t *res1) { int i, j, count; ulong sum; #if defined(DEBUG) fprintf(stderr, ".. C_ij version (row-major) w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *(arr), n, n); #endif count = sum = 0; for (i = 0; icount = count; res1->sum = sum; res1->avg = sum/count; } // sum and average over an array static inline void sumav_ji_c(ulong n, ulong *arr, result_t *res1) { int i, j, count; ulong sum; #if defined(DEBUG) fprintf(stderr, ".. C_ji (col-major) version w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n); #endif count = sum = 0; for (j = 0; jcount = count; res1->sum = sum; res1->avg = sum/count; } #ifdef ASM static inline void sumav_asm(int n, ulong *arr, result_t *res1) { #if defined(DEBUG) fprintf(stderr, ".. Assembler version w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n); #endif asm volatile(/* inline assembler version of sum-with-average */ ".global _sumav_ij_asm\n" "\t MOVS R3, #0x00 @ initialise result register\n" "\t LDR R2, %[arrp] @ initialise pointer to array\n" "\t MOVS R5, #0x00 @ OVERALL counter\n" "\t MOVS R4, #0x00 @ initialise accumulator\n" "\t MOVS R1, #0x00 @ initialise col-count register\n" "\t MOVS R0, #0x00 @ initialise row-count register\n" "\t B TESTi%= @ uncond. jump\n" "LOOPi%=: @ loop over rows in R0\n" "LOOPj%=: @ loop over cols in R1\n" "\t LDR R4, [R2, #0] @ add contents at location R2 to sum in R3\n" "\t ADD R3, R3, R4 @ add contents at location R2 to sum in R3\n" "\t ADD R2, R2, #4 @ increment pointer \n" "\t ADD R1, R1, #1 @ increment counter \n" "\t ADD R5, R5, #1 @ increment OVERALL counter (silly!!) \n" "TESTj%=: CMP R1, %[len] @ test end value\n" "\t BLT LOOPj%=\n" "\t MOV R1, #00 @ re-init col-counter \n" "\t ADD R0, R0, #1 @ increment counter \n" "TESTi%=: CMP R0, %[len] @ test end of outer loop (rows)\n" "\t BLT LOOPi%=\n" "\t @SUBS R1, R1, #1 @ adjust counter \n" "\t MUL R2, R0, R1\n" "\t STR R5, [%[resultp], #00] @ store count component in memory\n" "\t STR R3, [%[resultp], #04] @ store sum component in memory\n" : : [len] "r" (n) , [arrp] "m" (arr) , [resultp] "r" (res1) : "r0", "r1", "r2", "r3", "r4", "r5", "cc"); res1->avg = res1->sum/res1->count; #if defined(DEBUG) fprintf(stderr, ".. Assembler result: sum = %lu; count = %lu\n", res1->sum, res1->count); #endif } static inline void sumav_asm_wb(int n, ulong *arr, result_t *res1) { #if defined(DEBUG) fprintf(stderr, ".. Assembler version (with write-back instructions) w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n); #endif asm volatile(/* inline assembler version of sum-with-average */ ".global _sumav_ij_asm\n" "\t LDR R2, %[arrp] @ initialise pointer to array\n" "\t LDR R3, [R2] @ initialise accumulator\n" "\t MOVS R5, #0x00 @ OVERALL counter\n" "\t MOVS R1, #0x01 @ initialise col-count register\n" "\t MOVS R0, #0x00 @ initialise row-count register\n" "\t B TESTi%= @ uncond. jump\n" "LOOPi%=: @ loop over rows in R0\n" "LOOPj%=: @ loop over cols in R1\n" "\t LDR R4, [R2, #4]! @ add contents at location R2 to sum in R3 and write-back address to R2\n" "\t ADD R3, R3, R4 @ add contents at location R2 to sum in R3\n" "\t ADD R1, R1, #1 @ increment counter \n" "\t ADD R5, R5, #1 @ increment OVERALL counter (silly!!) \n" "TESTj%=: CMP R1, %[len] @ test end value\n" "\t BLT LOOPj%=\n" "\t MOV R1, #00 @ re-init col-counter \n" "\t ADD R0, R0, #1 @ increment counter \n" "TESTi%=: CMP R0, %[len] @ test end of outer loop (rows)\n" "\t BLT LOOPi%=\n" "\t @SUBS R1, R1, #1 @ adjust counter \n" "\t MUL R2, R0, R1\n" "\t STR R5, [%[resultp], #00] @ store count component in memory\n" "\t STR R3, [%[resultp], #04] @ store sum component in memory\n" : : [len] "r" (n) , [arrp] "m" (arr) , [resultp] "r" (res1) : "r0", "r1", "r2", "r3", "r4", "r5", "cc"); res1->avg = res1->sum/res1->count; #if defined(DEBUG) fprintf(stderr, ".. Assembler result: sum = %lu; count = %lu\n", res1->sum, res1->count); #endif } static inline void sumav_asm_bad_pred(int n, ulong *arr, result_t *res1) { #if defined(DEBUG) fprintf(stderr, ".. Assembler (bad branch pred) version w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n); #endif asm volatile(/* inline assembler version of sum-with-average */ ".global _sumav_ij_asm\n" "\t MOVS R3, #0x00 @ initialise result register\n" "\t LDR R2, %[arrp] @ initialise pointer to array\n" "\t MOVS R5, #0x00 @ OVERALL counter\n" "\t MOVS R1, #0x00 @ initialise col-count register\n" "\t MOVS R0, #0x00 @ initialise row-count register\n" "\t B TESTi%= @ uncond. jump\n" "LOOPi%=: @ loop over rows in R0\n" "LOOPj%=: @ loop over cols in R1\n" "\t LDR R4, [R2, #0] @ add contents at location R2 to sum in R3\n" "\t ADD R3, R3, R4 @ add contents at location R2 to sum in R3\n" "\t ADD R2, R2, #4 @ increment pointer \n" "\t ADD R1, R1, #1 @ increment counter \n" "\t ADD R5, R5, #1 @ increment OVERALL counter (silly!!) \n" "TESTj%=: CMP R1, %[len] @ test end value\n" "\t BGE BONZOj%= @ BAD branch (for prediction): taken only once\n" "\t B LOOPj%=\n" "BONZOj%=: @ fall through\n" "\t MOV R1, #00 @ re-init col-counter \n" "\t ADD R0, R0, #1 @ increment counter \n" "TESTi%=: CMP R0, %[len] @ test end of outer loop (rows)\n" "\t BGE BONZOi%= @ BAD branch (for prediction): taken only once\n" "\t B LOOPi%=\n" "BONZOi%=: @ fall through\n" "\t @SUBS R1, R1, #1 @ adjust counter \n" "\t MUL R2, R0, R1\n" "\t STR R5, [%[resultp], #00] @ store count component in memory\n" "\t STR R3, [%[resultp], #04] @ store sum component in memory\n" : : [len] "r" (n) , [arrp] "m" (arr) , [resultp] "r" (res1) : "r0", "r1", "r2", "r3", "r4", "r5", "cc"); res1->avg = res1->sum/res1->count; #if defined(DEBUG) fprintf(stderr, ".. Assembler result: sum = %lu; count = %lu\n", res1->sum, res1->count); #endif } #if 1 // main version for bad branch pred static inline void sumav_asm_bad_pred0(int n, ulong *arr, result_t *res1) { #if defined(DEBUG) fprintf(stderr, ".. Assembler (bad branch pred) version w/ data starting at %p (first elem %lu) and size %d x %d\n", arr, *arr, n, n); #endif asm volatile(/* inline assembler version of sum-with-average */ ".global _sumav_ij0_asm\n" "\t MOVS R3, #0x00 @ initialise result register\n" "\t LDR R2, %[arrp] @ initialise pointer to array\n" "\t MOVS R5, #0x00 @ OVERALL counter\n" "\t MOVS R1, #0x00 @ initialise col-count register\n" "\t MOVS R0, #0x00 @ initialise row-count register\n" "\t @ B TESTi%= @ uncond. jump\n" "CHECKi%=: CMP R0, %[len] @ test end of outer loop (rows)\n" "\t BGE LEAVEi%= @ BAD branch (for prediction): taken only once\n" "CHECKj%=: CMP R1, %[len] @ test end value\n" "\t BGE LEAVEj%= @ BAD branch (for prediction): taken only once\n" "\t LDR R4, [R2, #0] @ add contents at location R2 to sum in R3\n" "\t ADD R3, R3, R4 @ add contents at location R2 to sum in R3\n" "\t ADD R2, R2, #4 @ increment pointer \n" "\t ADD R1, R1, #1 @ increment counter \n" "\t ADD R5, R5, #1 @ increment OVERALL counter (silly!!) \n" "\t B CHECKj%= @ uncond. jump\n" "LEAVEj%=: @ leaving row-loop\n" "\t MOV R1, #00 @ re-init col-counter \n" "\t ADD R0, R0, #1 @ increment counter \n" "\t B CHECKi%= @ uncond. jump\n" "LEAVEi%=: @ done with row-loop\n" "\t @SUBS R1, R1, #1 @ adjust counter \n" "\t MUL R2, R0, R1\n" "\t STR R5, [%[resultp], #00] @ store count component in memory\n" "\t STR R3, [%[resultp], #04] @ store sum component in memory\n" : : [len] "r" (n) , [arrp] "m" (arr) , [resultp] "r" (res1) : "r0", "r1", "r2", "r3", "r4", "r5", "cc"); res1->avg = res1->sum/res1->count; #if defined(DEBUG) fprintf(stderr, ".. Assembler result: sum = %lu; count = %lu\n", res1->sum, res1->count); #endif } #endif /* 1 */ #endif /* Initialise the secret sequence of colors (of legnth seqlen) */ void initArr(ulong *arr, int len) { int i, j, r; ulong x; if (arr==NULL) arr = (ulong*)malloc(len*len*sizeof(ulong)); #ifdef CHECK res_ex.count = 0; res_ex.sum = 0; #endif srand((unsigned int)1701); // srand((unsigned int)time(0)); for (i=0; icounter_lo; uint32_t n=0, m=0; uint32_t last, curr; #if defined(DEBUG) fprintf (stderr, "Waiting for %d micro-seconds (reading counter value from %x)\n", us, &((rpi_sys_timer_t*)timer/*timerbase*/)->counter_lo) ; last = ((rpi_sys_timer_t*)timer/*timerbase*/)->counter_lo; #endif while( ( (curr=((rpi_sys_timer_t*)timer/*timerbase*/)->counter_lo) - ts ) < us ) { /* BLANK */ #if defined(DEBUG) if (curr == last) { n++; } else { n++; m++; } if (n % (uint32_t)1000000 == 0) { fprintf(stderr, "%d iterations, spotted %d differences\n", n, m); } #endif } return curr; } #endif /* 0 */ #if 0 // DEAD CODE: DELETE ME uint32_t rpi_clock(uint32_t howLong) { volatile uint32_t ts = *(timer+4); uint32_t n=0, m=0; uint32_t last, curr; #if defined(DEBUG) fprintf (stderr, "Waiting for %d micro-seconds (reading counter value from %x)\n", howLong, (timer+4)); last = *(timer+4); #endif while( ( (curr=*(timer+4)) - ts ) < howLong ) { /* BLANK */ #if defined(DEBUG) if (curr == last) { n++; } else { n++; m++; } if (n % (uint32_t)1000000 == 0) { fprintf(stderr, "%d iterations, spotted %d differences\n", n, m); } #endif } return last; } #endif #endif /* TIMER */ #if defined(TIMER) static inline void init_timer() { rpi_init_timer(); } static inline some_timer_t get_time() { return /* RPI_WaitMicroSeconds(10); */ rpi_clock(10); } static inline double time_diff(some_timer_t startTime, some_timer_t stopTime) { return (stopTime-startTime)/1000000.0 ; /* clock-rate is 1MHz hence in micro-sec */ } #else static inline void init_timer() { /* nothing */ } static inline some_timer_t get_time() { return clock(); } static inline double time_diff(some_timer_t startTime, some_timer_t stopTime) { return (stopTime-startTime)/CLOCKS_PER_SEC ; } #endif /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ main(int argc,char ** argv) { int n; ulong *arr; if (argc<2) { fprintf(stderr, "Usage: %s ... compute sum and average over an array of random int values\n", argv[0]); exit(1); } n = atoi(argv[1]); fprintf(stderr, "Length = %d\n", n); arr = (ulong*)malloc(n*n*sizeof(ulong)); // all allocation now done in initArr initArr(arr, n); fprintf(stderr, "sizeof(ulong) = %d\n", sizeof(ulong)); #if 1 init_timer(); { /* double */ some_timer_t startTime, stopTime; // fprintf(stdout, "Computing sum and average from 1 to %d in C (expect %d)...\n", n, (n*(n+1)/2)); // WRONG: overflow on large n's fprintf(stdout, "--------------------------------------------------------------------------- \n"); fprintf(stdout, "%sRow-wise%s sum and average over an %d x %d matrix in C ...\n", HI_ON, HI_OFF, n, n); fflush(stdout); startTime = get_time(); sumav_ij_c(n, arr, &res1); stopTime = get_time(); show_result(&res1); #ifdef CHECK if (res1.sum == res_ex.sum) printf("%sResult OK.%s\n", HI_ON, HI_OFF); else printf("WRONG result: computed %lu, expected %lu\n", res1.sum, res_ex.sum); #endif printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime)); // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime); } #endif #if 1 { /* double */ some_timer_t startTime, stopTime; // fprintf(stdout, "Computing sum and average from 1 to %d in C (expect %d)...\n", n, (n*(n+1)/2)); // WRONG: overflow on large n's fprintf(stdout, "--------------------------------------------------------------------------- \n"); fprintf(stdout, "%sColumn-wise%s sum and average over an %d x %d matrix in C ...\n", HI_ON, HI_OFF, n, n); fflush(stdout); startTime = get_time(); sumav_ji_c(n, arr, &res2); stopTime = get_time(); show_result(&res2); #ifdef CHECK if (res2.sum == res_ex.sum) printf("%sResult OK.%s\n", HI_ON, HI_OFF); else printf("WRONG result: computed %lu, expected %lu\n", res2.sum, res_ex.sum); #endif printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime)); // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime); } #endif #ifdef ASM { /* double */ some_timer_t startTime, stopTime; fprintf(stdout, "--------------------------------------------------------------------------- \n"); fprintf(stdout, "Computing sum and average over an %d x %d matrix (row-wise iteration) in assembler ...\n", n, n); fflush(stdout); startTime = get_time(); sumav_asm(n, arr, &res3); stopTime = get_time(); show_result(&res3); #ifdef CHECK if (res3.sum == res_ex.sum) printf("%sResult OK.%s\n", HI_ON, HI_OFF); else printf("WRONG result: computed %lu, expected %lu\n", res3.sum, res_ex.sum); #endif printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime)); // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime); } { /* double */ some_timer_t startTime, stopTime; fprintf(stdout, "--------------------------------------------------------------------------- \n"); fprintf(stdout, "Computing sum and average over an %d x %d matrix (row-wise iteration) in assembler (with write-back) ...\n", n, n); fflush(stdout); startTime = get_time(); sumav_asm_wb(n, arr, &res03); stopTime = get_time(); show_result(&res03); #ifdef CHECK if (res03.sum == res_ex.sum) printf("%sResult OK.%s\n", HI_ON, HI_OFF); else printf("WRONG result: computed %lu, expected %lu\n", res03.sum, res_ex.sum); #endif printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime)); // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime); } { /* double */ some_timer_t startTime, stopTime; fprintf(stdout, "--------------------------------------------------------------------------- \n"); fprintf(stdout, "Computing sum and average over an %d x %d matrix (row-wise iteration) in assembler (bad branch pred) ...\n", n, n); fflush(stdout); startTime = get_time(); sumav_asm_bad_pred(n, arr, &res4); stopTime = get_time(); show_result(&res4); #ifdef CHECK if (res3.sum == res_ex.sum) printf("%sResult OK.%s\n", HI_ON, HI_OFF); else printf("WRONG result: computed %lu, expected %lu\n", res4.sum, res_ex.sum); #endif printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime)); // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime); } { /* double */ some_timer_t startTime, stopTime; fprintf(stdout, "--------------------------------------------------------------------------- \n"); fprintf(stdout, "Computing sum and average over an %d x %d matrix (row-wise iteration) in assembler (another bad branch pred) ...\n", n, n); fflush(stdout); startTime = get_time(); sumav_asm_bad_pred(n, arr, &res5); stopTime = get_time(); show_result(&res5); #ifdef CHECK if (res5.sum == res_ex.sum) printf("%sResult OK.%s\n", HI_ON, HI_OFF); else printf("WRONG result: computed %lu, expected %lu\n", res5.sum, res_ex.sum); #endif printf("Elapsed time: %f secs\n", time_diff(startTime,stopTime)); // printf("Elapsed time: %u micro-secs (from %u to %u)\n", (stopTime-startTime), startTime, stopTime); } #endif exit(0); } /* Measurements (on my Rpi2): pi@hawopi ~/F28HS $ gcc -UASM -UTIMER -UDEBUG -DCHECK -o sumav3_asm sumav3_asm.c pi@hawopi ~/F28HS $ ./sumav3_asm 8000Length = 8000 sizeof(ulong) = 4 Row-wise sum and average over an 8000 x 8000 matrix in C ... count: 64000000, sum: 1336439042, average: 20 WRONG result: computed 1336439042l, expected 1336404922l Elapsed time: 0.540000 secs Column-wise sum and average over an 8000 x 8000 matrix in C ... count: 64000000, sum: 1336404922, average: 20 Result OK. Elapsed time: 12.150000 secs */