/* ------------------------------------------------------------------------- *
 * (c) Hans-Wolfgang Loidl, 2000-
 *
 * Parallel ticky profiling, monitoring basic RTS operations in GUM.
 * Similar in structure to TICKY_TICKY profiling, but doesn't need a
 * separate way of building GHC.
 *-------------------------------------------------------------------------- */

#if defined(PARALLEL_RTS) && defined(PAR_TICKY)

#include "Rts.h"
#include "RtsFlags.h"
#include "RtsUtils.h"
//#include "StoragePriv.h"
//#include "MBlock.h"
//#include "Schedule.h"
#include "sm/GC.h"
#include "Stats.h"
#include "Sparks.h"
#include "ParTicky.h"                                // ToDo: move into Rts.h
#include "ParallelRts.h"

#if defined(PARALLEL_RTS) && defined(HAVE_GETRUSAGE)
/* FIX: the original read a bare "#include" with no header name, which is a
   preprocessor error on any platform with HAVE_GETRUSAGE; getrusage() and
   struct rusage are declared in <sys/resource.h>. */
#include <sys/resource.h>
#endif

/* external data */
extern double ElapsedTimeStart;

extern ullong GC_tot_alloc;
extern ullong GC_tot_copied;

extern lnat MaxResidency;     /* in words; for stats only */
extern lnat ResidencySamples; /* for stats only */

/* ngIplu' {Stats.c}vo' (Klingon: borrowed from Stats.c) */
#define BIG_STRING_LEN              512

/* ngIplu' {Ticky.c}vo' (Klingon: borrowed from Ticky.c) */
#define INTAVG(a,b) ((b == 0) ? 0.0 : ((double) (a) / (double) (b)))
#define PC(a)       (100.0 * a)

#define AVG(thing) \
        StgDouble avg##thing  = INTAVG(tot##thing,ctr##thing)

#if 0
void
set_foo_time(double *x) {
    *x = usertime();
}

double
get_foo_time(double x)
{
    fprintf(stderr, "get_foo_time: %7.2f (%7.5f,%7.5f) \n",
            usertime()-x,usertime(),x);
    return (usertime()-x);
}
#endif

/* Start-of-interval timestamps (ms, from msTime()); each _start routine
   below records into one of these and the matching _end routine folds the
   elapsed time into globalParStats. */
static double start_time_GA = 0.0;
static double start_mark    = 0.0;
static double start_pack    = 0.0;
static double start_unpack  = 0.0;
static double start_StgRun  = 0.0;

// this was in Stat.c in GUM4
/* static */ double ElapsedTimeStart = 0.0;
static double TicksPerSecond = 0.0;

#if 0
// ngoq ngo' (Klingon: old code, from GUM4)
void
start_time(void)
{
#ifdef HAVE_SYSCONF
    long ticks;
    /* Determine TicksPerSecond ... */
    ticks = sysconf(_SC_CLK_TCK);
    if ( ticks == -1 ) {
        fprintf(stderr, "stat_init: bad call to 'sysconf'!\n");
        stg_exit(EXIT_FAILURE);
    }
    TicksPerSecond = (double) ticks;

/* no "sysconf"; had better guess */
#elif defined(HZ)
    TicksPerSecond = (StgDouble) (HZ);

#elif defined(CLOCKS_PER_SEC)
    TicksPerSecond = (StgDouble) (CLOCKS_PER_SEC);
#else /* had better guess wildly */
    /* We will #ifdef around the fprintf for machines
       we *know* are unsupported. (WDP 94/05) */
    fprintf(stderr, "NOTE: Guessing `TicksPerSecond = 60'!\n");
    TicksPerSecond = 60.0;
#endif

    ElapsedTimeStart = msTime();
}

double
usertime(void)
{
# if ! (defined(HAVE_GETRUSAGE) || defined(HAVE_TIMES))
    /* We will #ifdef around the fprintf for machines
       we *know* are unsupported. (WDP 94/05) */
    fprintf(stderr, "NOTE: `usertime' does nothing!\n");
    return 0.0;

# else /* not stumped */

# if defined(HAVE_TIMES)
    struct tms t;
    times(&t);
    //fprintf(stderr, "usertime: %7.2f\n", (double)(t.tms_utime)/TicksPerSecond);
    return(((double)(t.tms_utime))/TicksPerSecond);

# else /* HAVE_GETRUSAGE */
    struct rusage t;
    getrusage(RUSAGE_SELF, &t);
    return(t.ru_utime.tv_sec + 1e-6*t.ru_utime.tv_usec);

# endif /* HAVE_GETRUSAGE */
# endif /* not stumped */
}

double
elapsedtime(void)
{
# if ! (defined(HAVE_TIMES) || defined(HAVE_FTIME))
    /* We will #ifdef around the fprintf for machines
       we *know* are unsupported. (WDP 94/05) */
    fprintf(stderr, "NOTE: `elapsedtime' does nothing!\n");
    return 0.0;

# else /* not stumped */

    /* "ftime" may be nicer, but "times" is more standard;
       but, on a Sun, if you do not get the SysV one, you are *hosed*... */

# if defined(HAVE_TIMES) && ! sunos4_TARGET_OS
    struct tms t;
    clock_t r = times(&t);

    return (((double)r)/TicksPerSecond);

# else /* HAVE_FTIME */
    struct timeb t;

    ftime(&t);
    return (fabs(t.time + 1e-3*t.millitm));

# endif /* HAVE_FTIME */
# endif /* not stumped */
}
#endif

/* Zero all parallel statistics and record the start-of-run wall clock. */
void
par_ticky_Par_start (void)
{
    /* init of globalStats struct */
    // see memset(3)
    memset(&globalParStats, 0, sizeof(GlobalParStats));
    ElapsedTimeStart = msTime();
    IF_PAR_DEBUG(verbose,
                 debugBelch(".... initialised globalParStats to all-0\n"));
}

/* Count one thread creation (only when global stats gathering is enabled). */
void
par_ticky_createSparkThread (void)
{
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        //set_foo_time(&start_time_GA);
        globalParStats.tot_threads_created++;
    }
}

/* At GC start: dump rusage-based CPU-time and memory figures.
   NOTE(review): the printing is gated on statsFile being non-NULL, yet the
   output goes to stderr -- presumably intentional (stats file is used as an
   "enabled" flag), but worth confirming. */
void
par_ticky_GC_start (void)
{
# if !defined(HAVE_GETRUSAGE) || irix_HOST_OS || defined(_WIN32)
    fprintf(stderr, "|| sorry don't have RUSAGE\n");
    return ;
# else
    FILE *sf = RtsFlags.GcFlags.statsFile;
    struct rusage t;
    double utime, stime;

    if (RtsFlags.GcFlags.giveStats>1 && sf != NULL) {
        getrusage(RUSAGE_SELF, &t);
        utime = t.ru_utime.tv_sec + 1e-6*t.ru_utime.tv_usec;
        stime = t.ru_stime.tv_sec + 1e-6*t.ru_stime.tv_usec;

        fprintf(stderr, "|| user time: %5.2f; system time: %5.2f\n",
                utime, stime);
        fprintf(stderr, "|| max RSS: %ld; int SM size: %ld; int USM data size: %ld; int USS size: %ld\n",
                t.ru_maxrss, t.ru_ixrss, t.ru_idrss, t.ru_isrss);
    }
#endif
}

#if 0
FYI:
            struct rusage
            {
                 struct timeval ru_utime; /* user time used */
                 struct timeval ru_stime; /* system time used */
                 long ru_maxrss;          /* maximum resident set size */
                 long ru_ixrss;           /* integral shared memory size */
                 long ru_idrss;           /* integral unshared data size */
                 long ru_isrss;           /* integral unshared stack size */
                 long ru_minflt;          /* page reclaims */
                 long ru_majflt;          /* page faults */
                 long ru_nswap;           /* swaps */
                 long ru_inblock;         /* block input operations */
                 long ru_oublock;         /* block output operations */
                 long ru_msgsnd;          /* messages sent */
                 long ru_msgrcv;          /* messages received */
                 long ru_nsignals;        /* signals received */
                 long ru_nvcsw;           /* voluntary context switches */
                 long ru_nivcsw;          /* involuntary context switches */
            };
#endif

/* Start timing a rebuild of the GA (global address) tables. */
void
par_ticky_rebuildGAtables_start(void)
{
    // collect parallel global statistics (currently done together with GC stats)
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        //set_foo_time(&start_time_GA);
        start_time_GA = msTime();
    }
}

/* Finish timing a GA-table rebuild; n is the number of live GAs after the
   rebuild, size_GA the size of the global heap (units as supplied by the
   caller). */
void
par_ticky_rebuildGAtables_end(nat n, nat size_GA)
{
    // collect parallel global statistics (currently done together with GC stats)
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        /* FIX: was a 'static' local ("foo") for no reason -- the value is
           never read across calls; a plain local is equivalent and avoids
           spurious persistent state. */
        double elapsed = msTime() - start_time_GA; // get_foo_time(start_time_GA);
        globalParStats.cnt_rebuild_GA++;
        globalParStats.tot_rebuild_GA += n;
        if ( n > globalParStats.res_rebuild_GA )
            globalParStats.res_rebuild_GA = n;
        // fprintf(stderr, "rebuildGAtables: footime=%7.2f (%11.5f, %11.5f)\n",
        //         elapsed, msTime(), start_time_GA);
        globalParStats.time_rebuild_GA += elapsed;
        globalParStats.tot_size_GA += size_GA;
        if ( size_GA > globalParStats.res_size_GA )
            globalParStats.res_size_GA = size_GA;
    }
    // fprintf(stderr, ">> n: %d; size: %d;; tot: %d; res: %d\n",
    //         n, size_GA, globalParStats.tot_size_GA, globalParStats.res_size_GA);
}

/* Start timing the marking of in-pointers (GAs pointing into this PE). */
void
par_ticky_markInPtrs_start(void)
{
    // collect parallel global statistics (currently done together with GC stats)
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        start_time_GA = msTime();
    }
}

/* Finish timing in-pointer marking; n is the number of GAs marked. */
void
par_ticky_markInPtrs_end(nat n)
{
    // collect parallel global statistics (currently done together with GC stats)
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        globalParStats.cnt_mark_GA++;
        globalParStats.tot_mark_GA += n;
        if ( n > globalParStats.res_mark_GA )
            globalParStats.res_mark_GA = n;
        globalParStats.time_mark_GA += msTime() - start_time_GA;
    }
}

/* Start timing the marking of the spark queue during GC. */
void
par_ticky_markSparkQueue_start(void)
{
    // collect parallel global statistics (currently done together with GC stats)
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        start_mark=msTime();
    }
}

/* Finish timing spark-queue marking; n is the number of sparks marked. */
void
par_ticky_markSparkQueue_end(nat n)
{
    // collect parallel global statistics (currently done together with GC stats)
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        globalParStats.time_sparks += msTime() - start_mark;

        globalParStats.tot_sparks_marked += n;
        if ( n > globalParStats.res_sparks_marked )
            globalParStats.res_sparks_marked = n;
    }
}

/* Start timing the packing of a nearby-graph for shipping to another PE. */
void
par_ticky_PackNearbyGraph_start (void)
{
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        start_pack=msTime();
    }
}

/* Finish timing graph packing; n is the packet size, thunks the number of
   thunks included in the packet. */
void
par_ticky_PackNearbyGraph_end(nat n, nat thunks)
{
    // collect parallel global statistics (currently done together with GC stats)
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        globalParStats.time_pack += msTime() - start_pack;

        globalParStats.tot_packets++;
        globalParStats.tot_packet_size += n;
        if ( n > globalParStats.res_packet_size )
            globalParStats.res_packet_size = n;
        globalParStats.tot_thunks += thunks;
        if ( thunks > globalParStats.res_thunks )
            globalParStats.res_thunks = thunks;
    }
}

/* Start timing the unpacking of a received graph. */
void
par_ticky_UnpackGraph_start (void)
{
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        start_unpack=msTime();
    }
}

/* Finish timing graph unpacking; n is the received packet size, thunks the
   number of thunks it contained. (No residency maxima are tracked on the
   receive side -- see the commented-out code.) */
void
par_ticky_UnpackGraph_end(nat n, nat thunks)
{
    // collect parallel global statistics (currently done together with GC stats)
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        globalParStats.time_unpack += msTime() - start_unpack;

        globalParStats.rec_packets++;
        globalParStats.rec_packet_size += n;
        /*
        if ( n > globalParStats.res_packet_size )
            globalParStats.res_packet_size = n;
        */
        globalParStats.rec_thunks += thunks;
        /*
        if ( thunks > globalParStats.res_thunks )
            globalParStats.res_thunks = thunks;
        */
    }
}

//MSA:#if 1
/* Record the moment we enter the STG machine (mutator code). */
void
par_ticky_before_StgRun (void)
{
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        start_StgRun =(double) msTime();
    }
}

/* Accumulate mutator time on return from the STG machine. */
void
par_ticky_after_StgRun(void)
{
    // collect parallel global statistics
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        globalParStats.time_active +=(double) msTime() - start_StgRun;
    }
}
//MSA:#endif

/* Sample the residency of the thread pool (run queue) and spark pool of
   the given capability, updating totals, sample counts, empty-pool counts
   and maxima in globalParStats. */
void
par_ticky_TP (Capability *cap)
{
    // StgSparkPool *pool;
    nat tp_size, sp_size; // stats only

    // Global stats gathering
    /* the spark pool for the current PE */
    // pool = &(cap->r.rSparks);

    // Global statistics: residency of thread and spark pool
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        tp_size = run_queue_len(cap) + 1; // add the TSO just popped
        // No: there may be many blocked threads being awoken at the same time
        // ASSERT(tp_size <= RtsFlags.ParFlags.maxThreads);
        globalParStats.tot_tp += tp_size;
        globalParStats.emp_tp += (tp_size==0) ? 1 : 0;
        globalParStats.cnt_tp++;
        if ( tp_size > globalParStats.res_tp)
            globalParStats.res_tp = tp_size;
        // fprintf(stderr, "run_queue_len() = %d (max %d)\n", run_queue_len(), globalParStats.res_tp);
        sp_size = spark_queue_len(cap);
        //ASSERT(sp_size <= RtsFlags.ParFlags.maxLocalSparks);
        globalParStats.tot_sp += sp_size;
        globalParStats.emp_sp += (sp_size==0) ? 1 : 0;
        globalParStats.cnt_sp++;
        if ( sp_size > globalParStats.res_sp)
            globalParStats.res_sp = sp_size;
        IF_PAR_DEBUG(paranoia,
                     debugBelch("  spark_queue_len(pool) = %d (max %d)\n",
                                /* cap->r.rSparks, */
                                spark_queue_len(cap), globalParStats.res_sp));
    }
}

/* End-of-run report: pretty-print all accumulated parallel statistics via
   parStatsPrintf.  Only prints the full report when global stats gathering
   is on AND GC stats are enabled (i.e. -sstderr or -sFILE).
   NOTE(review): several "%d" conversions below receive globalParStats
   counter fields whose exact types are declared elsewhere -- if those are
   ullong, the specifiers should be widened; confirm against the
   GlobalParStats declaration. */
void
globalParStat_exit(void)
{
    FILE *sf = RtsFlags.GcFlags.statsFile;  /* kept for the commented-out
                                               fallback below */
    double time, etime;

    parStatsPrintf("\n The total time spent executing Haskell code is %11.2fs\n",
                   (((double)globalParStats.time_active)/TICKS_PER_SECOND));

    parStatsPrintf(" globalParStat_exit: RtsFlags.ParFlags.ParStats.Global = %d; RtsFlags.GcFlags.giveStats = %d; RtsFlags.GcFlags.statsFile = %s\n",
                   RtsFlags.ParFlags.ParStats.Global,
                   RtsFlags.GcFlags.giveStats,
                   (RtsFlags.GcFlags.statsFile)?"NON-NULL":"NULL");

    /*
    if (RtsFlags.GcFlags.statsFile==NULL) {
        sf = stderr;
    }
    */

    /* print only if GC stats is enabled, too; i.e. -sstderr */
    if (!(RtsFlags.ParFlags.ParStats.Global &&
          RtsFlags.GcFlags.giveStats > NO_GC_STATS)) {
        parStatsPrintf("\n Aborting globalParStat_exit\n");
        return;
    }

    time = msTime();
    etime = msTime() - ElapsedTimeStart;
    // parStatsPrintf("foo=%7.2f\n", time);

    if (par_log_file != NULL){
        char temp[BIG_STRING_LEN];
        ullong tot_mess =
            globalParStats.tot_fish_mess+globalParStats.tot_fetch_mess+
            globalParStats.tot_resume_mess+globalParStats.tot_schedule_mess;
        ullong tot_rec_mess =
            globalParStats.rec_fish_mess+globalParStats.rec_fetch_mess+
            globalParStats.rec_resume_mess+globalParStats.rec_schedule_mess;

        // GC_tot_alloc += alloc;
        parStatsPrintf("\n");

        parStatsPrintf( "%11d threads created\n",
                        globalParStats.tot_threads_created);
        /* Would need to add a ++ to the par macro to use this
        parStatsPrintf( "%11d sparks created\n",
                        globalParStats.tot_sparks_created);
        parStatsPrintf( "%11d sparks ignored\n",
                        globalParStats.tot_sparks_ignored);
        */
        ullong_format_string(globalParStats.res_tp, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s thread pool residency", temp);
        parStatsPrintf( " (avg: %3.2f; %d times (%2.2f%%) of %d empty)\n",
                        (double)globalParStats.tot_tp/(double)globalParStats.cnt_tp,
                        globalParStats.emp_tp,
                        globalParStats.emp_tp*100.0/(double)globalParStats.cnt_tp,
                        globalParStats.cnt_tp);

        ullong_format_string(globalParStats.res_sp, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s spark pool residency", temp);
        parStatsPrintf( " (avg: %3.2f; %d times (%2.2f%%) of %d empty)\n",
                        (double)globalParStats.tot_sp/(double)globalParStats.cnt_sp,
                        globalParStats.emp_sp,
                        globalParStats.emp_sp*100.0/(double)globalParStats.cnt_sp,
                        globalParStats.cnt_sp);

        ullong_format_string(globalParStats.tot_db, temp, rtsTrue/*commas*/);
        /*MSA : add deathBirth */
        parStatsPrintf( "%11s StillBirths \n", temp);

        ullong_format_string(tot_mess, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s messages sent (%d fish, %d fetch, %d resume, %d schedule",
                        temp,
                        globalParStats.tot_fish_mess,
                        globalParStats.tot_fetch_mess,
                        globalParStats.tot_resume_mess,
                        globalParStats.tot_schedule_mess);
#if defined(DIST)
        parStatsPrintf( "%d revals", globalParStats.tot_reval_mess);
#endif
        parStatsPrintf(")\n");

        ullong_format_string(tot_rec_mess, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s messages received (%d fish, %d fetch, %d resume, %d schedule",
                        temp,
                        globalParStats.rec_fish_mess,
                        globalParStats.rec_fetch_mess,
                        globalParStats.rec_resume_mess,
                        globalParStats.rec_schedule_mess);
#if defined(DIST)
        parStatsPrintf( "%d revals", globalParStats.rec_reval_mess);
#endif
        parStatsPrintf(")\n Note: boring ACK messages are currently not counted\n\n");

#if 0
        // Global Heap statistics; TODO: re-enable!!
        ullong_format_string(globalParStats.tot_size_GA*sizeof(W_), temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s bytes of global heap in total ", temp);
        /* FIX(dead code): original read .../(double)GC_tot_alloc*sizeof(W_),
           which MULTIPLIES by the trailing sizeof(W_) instead of making it
           part of the divisor; parenthesize the denominator. */
        parStatsPrintf( "(%5.2f%% of total allocated heap)\n",
                        globalParStats.tot_size_GA*sizeof(W_)*100.0/((double)GC_tot_alloc*sizeof(W_)));
        ullong_format_string(globalParStats.res_size_GA*sizeof(W_), temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s bytes global heap residency ", temp);
        parStatsPrintf( "(%5.2f%% of max heap residency)\n",
                        globalParStats.res_size_GA*sizeof(W_)*100.0/((double)MaxResidency*sizeof(W_)));
#endif

        //ullong_format_string(globalParStats.res_mark_GA, temp, rtsTrue/*commas*/);
        //parStatsPrintf( "%11s GAs residency in GALA table ", temp);
        // ullong_format_string(globalParStats.tot_mark_GA, temp, rtsTrue/*commas*/);
        //parStatsPrintf( "(avg %5.2f; %d samples)\n",
        //                (double)globalParStats.tot_mark_GA/(double)globalParStats.cnt_mark_GA,
        //                globalParStats.cnt_mark_GA);

        ullong_format_string(globalParStats.local_alloc_GA, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s GAs locally allocated (calls to makeGlobal)\n", temp);

        //MSA:#if 0
        ullong_format_string(globalParStats.tot_rebuild_GA, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s live GAs in total (after rebuilding tables)\n", temp);
        ullong_format_string(globalParStats.res_rebuild_GA, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s GAs residency (after rebuilding tables) ", temp);
        parStatsPrintf( "(avg %5.2f; %d samples)\n",
                        (double)globalParStats.tot_rebuild_GA/(double)globalParStats.cnt_rebuild_GA,
                        globalParStats.cnt_rebuild_GA);
        ullong_format_string(globalParStats.res_free_GA, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s residency of freeing GAs", temp);
        parStatsPrintf( " (avg %5.2f; %d samples)\n",
                        (double)globalParStats.tot_free_GA/(double)globalParStats.cnt_free_GA,
                        globalParStats.cnt_free_GA);

        parStatsPrintf( "%11.2fs spent marking GAs (%7.2f%% of %7.2fs)\n",
                        globalParStats.time_mark_GA,
                        globalParStats.time_mark_GA*100./time, time);
        parStatsPrintf( "%11.2fs spent rebuilding GALA tables (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs)\n",
                        globalParStats.time_rebuild_GA,
                        globalParStats.time_rebuild_GA*100./time, time,
                        globalParStats.time_rebuild_GA*100./etime, etime);

        ullong_format_string(globalParStats.tot_sparks_marked, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s sparks marked\t", temp);
        ullong_format_string(globalParStats.res_sparks_marked, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%6s spark mark residency\n", temp);
        parStatsPrintf( "%11.2fs spent marking sparks (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
                        globalParStats.time_sparks,
                        globalParStats.time_sparks*100./time, time,
                        globalParStats.time_sparks*100./etime, etime);
        //MSA:#endif

        parStatsPrintf("\n");

        //MSA:#if 0
        ullong_format_string(globalParStats.tot_packets, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s packets sent\n", temp);
        ullong_format_string(globalParStats.tot_packet_size, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s bytes of graph sent in total (max %d; avg %.2f)\n",
                        temp, globalParStats.res_packet_size,
                        (double)globalParStats.tot_packet_size/(double)globalParStats.tot_packets);
        ullong_format_string(globalParStats.tot_thunks, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s thunks sent in total (max %d; avg %.2f)\n",
                        temp, globalParStats.res_thunks,
                        (double)globalParStats.tot_thunks/(double)globalParStats.tot_packets);
        parStatsPrintf( "%11.2fs spent packing graph structures (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
                        globalParStats.time_pack,
                        globalParStats.time_pack*100./time, time,
                        globalParStats.time_pack*100./etime, etime);

        ullong_format_string(globalParStats.rec_packets, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s packets received\n", temp);
        ullong_format_string(globalParStats.rec_packet_size, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s bytes of graph received in total (max %d; avg %.2f)\n",
                        temp, globalParStats.rec_res_packet_size,
                        (double)globalParStats.rec_packet_size/(double)globalParStats.rec_packets);
        ullong_format_string(globalParStats.rec_thunks, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s thunks received in total (max %d; avg %.2f)\n",
                        temp, globalParStats.rec_res_thunks,
                        (double)globalParStats.rec_thunks/(double)globalParStats.rec_packets);
        parStatsPrintf( "%11.2fs spent unpacking graph structures (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
                        globalParStats.time_unpack,
                        globalParStats.time_unpack*100./time, time,
                        globalParStats.time_unpack*100./etime, etime);

        parStatsPrintf("\n");

        ullong_format_string(globalParStats.tot_arrs, temp, rtsTrue/*commas*/);
        parStatsPrintf( "%11s bytearrays sent; ", temp);
        ullong_format_string(globalParStats.tot_arr_size, temp, rtsTrue/*commas*/);
        parStatsPrintf( " %s bytes in total (avg %.2f)\n",
                        temp,
                        (double)globalParStats.tot_arr_size/(double)globalParStats.tot_arrs);

        parStatsPrintf("\n");
        //MSA:#endif

#if 0
        parStatsPrintf( "%11d yields, %d blocks, %d stack overflows, %d heap overflows\n",
                        globalParStats.tot_yields, globalParStats.tot_blocks,
                        globalParStats.tot_stackover, globalParStats.tot_heapover);

        parStatsPrintf( "%11d waits (hit porocessMessages)\n",
                        globalParStats.tot_waits);

        parStatsPrintf("\n");

        //parStatsPrintf("Printing this pathetic statistics took %7.2fs (start @ %7.2f)\n",
        //               msTime()-time, time);

        parStatsPrintf( "%11.2fs active (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
                        globalParStats.time_active,
                        globalParStats.time_active*100./time, time,
                        globalParStats.time_active*100./etime, etime);

        parStatsPrintf( "%11.2fs busy-wait (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
                        globalParStats.time_wait,
                        globalParStats.time_wait*100./time, time,
                        globalParStats.time_wait*100./etime, etime);
#endif

        fflush(par_log_file);
        // Open filehandle needed by other stats printing fcts
        // fclose(par_log_file);
    }
}

/* -----------------------------------------------------------------------------
   see Stats.c for orig, seq code
   -------------------------------------------------------------------------- */

#if defined(PARALLEL_RTS)
/* printf-style output to the parallel log file, falling back to the RTS
   debug channel (vdebugBelch) when no log file is open. */
void
parStatsPrintf( char *s, ... )
{
    FILE *sf = par_log_file;
    va_list ap;

    va_start(ap,s);
    if (sf == NULL) {
        vdebugBelch(s,ap);
    } else {
        vfprintf(sf, s, ap);
    }
    va_end(ap);
}
#endif

#endif