Skip to content

Commit 6dcab81

Browse files
committed
[prof] in gux_taptamggux.mad counters.cc, improve the handling of counter overhead
These are the results (1) keep overhead ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp [COUNTERS] *** USING RDTSC-BASED TIMERS (do not remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 4.4766s [COUNTERS] Fortran Other ( 0 ) : 0.1202s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0685s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 3.2400s for 1087437 events => throughput is 3.36E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.1007s for 32768 events => throughput is 3.25E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1673s for 16384 events => throughput is 9.79E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0521s for 16384 events => throughput is 3.14E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0687s for 16384 events => throughput is 2.38E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.1237s for 1087437 events => throughput is 8.79E+06 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4728s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0269s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 2.3496s for 14136681 events => throughput is 6.02E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 4.4409s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s CUDACPP_RUNTIME_USECHRONOTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp [COUNTERS] *** USING STD::CHRONO TIMERS (do not remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 5.3144s [COUNTERS] Fortran Other ( 0 ) : 0.1588s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0674s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 4.0191s for 1087437 events => throughput is 2.71E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.0996s for 32768 events => throughput is 3.29E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1660s for 16384 events => throughput is 9.87E+04 events/s [COUNTERS] Fortran 
Reweight ( 6 ) : 0.0508s for 16384 events => throughput is 3.22E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0704s for 16384 events => throughput is 2.33E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.1482s for 1087437 events => throughput is 7.34E+06 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4718s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0267s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 2.8646s for 14136681 events => throughput is 4.94E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 5.2787s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s (2) remove overhead CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0338s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.8244s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.8905s ------------------------------------------------------------- [COUNTERS] *** USING RDTSC-BASED TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.9339s [COUNTERS] Fortran Other ( 0 ) : 0.2954s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0674s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 2.7332s for 1087437 events => throughput is 3.98E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.1003s for 32768 events => throughput is 3.27E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1688s for 16384 events => throughput is 9.71E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0507s for 16384 events => throughput is 3.23E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0695s for 16384 events => throughput is 2.36E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.0924s for 1087437 events => throughput is 1.18E+07 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4692s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0263s [COUNTERS] CudaCpp MEs ( 19 ) : 
0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 1.8723s for 14136681 events => throughput is 7.55E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 3.8982s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0637s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 5.8826s [COUNTERS] PROGRAM COUNTEROVERHEAD : 1.6786s ------------------------------------------------------------- [COUNTERS] *** USING STD::CHRONO TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 4.2040s [COUNTERS] Fortran Other ( 0 ) : 0.4831s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0691s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 2.9924s for 1087437 events => throughput is 3.63E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.0983s for 32768 events => throughput is 3.33E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1669s for 16384 events => throughput is 9.81E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0506s for 16384 events => throughput is 3.24E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0676s for 16384 events => throughput is 2.42E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.0698s for 1087437 events => throughput is 1.56E+07 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4712s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0267s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0350s for 16384 events => throughput is 4.68E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 1.9227s for 14136681 events => throughput is 7.35E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 4.1690s [COUNTERS] OVERALL MEs ( 32 ) : 0.0350s for 16384 events => throughput is 4.68E+05 events/s (3) remove overhead, disable individual timers (so here the overhead is 0) 
CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0333s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.1897s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.3330s ------------------------------------------------------------- [COUNTERS] *** USING RDTSC-BASED TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.8567s CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0659s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.5119s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.6594s ------------------------------------------------------------- [COUNTERS] *** USING STD::CHRONO TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.8525s (4) do not remove overhead, disable individual timers (remove also the overhead from the estimation of the overhead) (this test was done on another day on the same machine and build, but the results are compatible with the previous ones) CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp [COUNTERS] *** USING RDTSC-BASED TIMERS (do not remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.8072s CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp [COUNTERS] *** USING STD::CHRONO TIMERS (do not remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.8214s
1 parent 3577a55 commit 6dcab81

File tree

1 file changed

+12
-10
lines changed
  • epochX/cudacpp/gux_taptamggux.mad/SubProcesses

1 file changed

+12
-10
lines changed

epochX/cudacpp/gux_taptamggux.mad/SubProcesses/counters.cc

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -171,13 +171,11 @@ extern "C"
171171
void counters_initialise_()
172172
{
173173
using namespace counters;
174-
if( getenv( "CUDACPP_RUNTIME_DISABLECALLTIMERS" ) ) disablecalltimers = true;
175-
if( getenv( "CUDACPP_RUNTIME_DISABLETESTTIMERS" ) ) disabletesttimers = true;
176174
#ifdef MGONGPU_HASRDTSC
177175
if( getenv( "CUDACPP_RUNTIME_USECHRONOTIMERS" ) ) usechronotimers = true;
178176
#endif
179177
if( getenv( "CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD" ) ) removetimeroverhead = true;
180-
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ )
178+
for( int icounter = 0; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
181179
{
182180
array_tags[icounter] = ""; // ensure that this is initialized to ""
183181
array_istesttimer[icounter] = false; // ensure that this is initialized to false
@@ -193,7 +191,7 @@ extern "C"
193191
counters_register_counter_( &icalibcounter, "OVERHEAD CALIBRATION" );
194192
mgOnGpu::ChronoTimer<std::chrono::high_resolution_clock> calibtimer;
195193
calibtimer.start();
196-
constexpr size_t ncall = 1000000;
194+
constexpr size_t ncall = 10000000; // 10M calls are expected to take slightly less than ~1s (this will be in counter overhead)
197195
for( size_t icall = 0; icall < ncall; icall++ )
198196
{
199197
counters_start_counter_( &icalibcounter, &nevtdummy );
@@ -202,6 +200,8 @@ extern "C"
202200
calibtimer.stop();
203201
overheadpercallseconds = calibtimer.getTotalDurationSeconds() / ncall;
204202
}
203+
if( getenv( "CUDACPP_RUNTIME_DISABLECALLTIMERS" ) ) disablecalltimers = true;
204+
if( getenv( "CUDACPP_RUNTIME_DISABLETESTTIMERS" ) ) disabletesttimers = true;
205205
return;
206206
}
207207

@@ -216,10 +216,12 @@ extern "C"
216216
float program_totaltime = ( usechronotimers ? program_chronotimer.getTotalDurationSeconds() : program_rdtsctimer.getTotalDurationSeconds() );
217217
float program_overhead = 0;
218218
// Extract time duration from all timers
219-
float array_totaltimes[NCOUNTERSMAX + 3] = { 0 };
220-
float array_overheads[NCOUNTERSMAX + 3] = { 0 };
221-
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
219+
float array_totaltimes[NCOUNTERSMAX + 4] = { 0 };
220+
float array_overheads[NCOUNTERSMAX + 4] = { 0 };
221+
for( int icounter = 1; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
222222
{
223+
if( icounter == NCOUNTERSMAX + 1 ) continue;
224+
if( icounter == NCOUNTERSMAX + 2 ) continue;
223225
if( usechronotimers )
224226
array_totaltimes[icounter] = array_chronotimers[icounter].getTotalDurationSeconds();
225227
else
@@ -235,7 +237,7 @@ extern "C"
235237
// Remove overheads of included timers if any
236238
if( removetimeroverhead )
237239
{
238-
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
240+
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ ) // no need to include icalibcounter = NCOUNTERSMAX+3
239241
{
240242
for( int icounterIn : array_included[icounter] )
241243
array_totaltimes[icounter] -= array_overheads[icounterIn];
@@ -259,7 +261,7 @@ extern "C"
259261
array_tags[0] = "Fortran Other";
260262
array_counters[0] = 1;
261263
array_totaltimes[0] = program_totaltime;
262-
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
264+
for( int icounter = 1; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
263265
{
264266
if( !array_istesttimer[icounter] ) // skip TEST counters
265267
array_totaltimes[0] -= array_totaltimes[icounter];
@@ -280,7 +282,7 @@ extern "C"
280282
array_counters[NCOUNTERSMAX + 1] = 1;
281283
array_totaltimes[NCOUNTERSMAX + 1] = program_totaltime - array_totaltimes[NCOUNTERSMAX + 2];
282284
// Dump individual counters
283-
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ )
285+
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ ) // exclude icalibcounter = NCOUNTERSMAX+3 (would print a negative value here!)
284286
{
285287
if( array_tags[icounter] != "" )
286288
{

0 commit comments

Comments
 (0)