diff --git a/conf.py b/conf.py index 8b09ed50..9ef7c04e 100644 --- a/conf.py +++ b/conf.py @@ -116,4 +116,8 @@ def setup(app): rst_prolog = r""" .. |R| replace:: \ :sup:`®` + +.. |br| raw:: html + + <br />
""" diff --git a/images/scorep_error.png b/images/scorep_error.png new file mode 100644 index 00000000..49aadbac Binary files /dev/null and b/images/scorep_error.png differ diff --git a/images/scorep_filter_functions.png b/images/scorep_filter_functions.png new file mode 100644 index 00000000..6b86ae5e Binary files /dev/null and b/images/scorep_filter_functions.png differ diff --git a/images/scorep_filtering.png b/images/scorep_filtering.png new file mode 100644 index 00000000..09948e00 Binary files /dev/null and b/images/scorep_filtering.png differ diff --git a/images/scorep_filtering_results.png b/images/scorep_filtering_results.png new file mode 100644 index 00000000..fe17f84e Binary files /dev/null and b/images/scorep_filtering_results.png differ diff --git a/images/vampir_call_tree.png b/images/vampir_call_tree.png new file mode 100644 index 00000000..bdac64c6 Binary files /dev/null and b/images/vampir_call_tree.png differ diff --git a/images/vampir_communication_matrix.png b/images/vampir_communication_matrix.png new file mode 100644 index 00000000..e505cb2d Binary files /dev/null and b/images/vampir_communication_matrix.png differ diff --git a/images/vampir_communication_matrix_max_message.png b/images/vampir_communication_matrix_max_message.png new file mode 100644 index 00000000..9a61ab05 Binary files /dev/null and b/images/vampir_communication_matrix_max_message.png differ diff --git a/images/vampir_communication_matrix_max_time.png b/images/vampir_communication_matrix_max_time.png new file mode 100644 index 00000000..07dfce39 Binary files /dev/null and b/images/vampir_communication_matrix_max_time.png differ diff --git a/images/vampir_communication_matrix_min_data_rate.png b/images/vampir_communication_matrix_min_data_rate.png new file mode 100644 index 00000000..325c34ac Binary files /dev/null and b/images/vampir_communication_matrix_min_data_rate.png differ diff --git a/images/vampir_counter_data_timeline.png b/images/vampir_counter_data_timeline.png 
new file mode 100644 index 00000000..707bb3f0 Binary files /dev/null and b/images/vampir_counter_data_timeline.png differ diff --git a/images/vampir_counter_message_data_rates.png b/images/vampir_counter_message_data_rates.png new file mode 100644 index 00000000..aa30d914 Binary files /dev/null and b/images/vampir_counter_message_data_rates.png differ diff --git a/images/vampir_counter_mpi_latencies.png b/images/vampir_counter_mpi_latencies.png new file mode 100644 index 00000000..b1d17215 Binary files /dev/null and b/images/vampir_counter_mpi_latencies.png differ diff --git a/images/vampir_counter_select_metric.png b/images/vampir_counter_select_metric.png new file mode 100644 index 00000000..7372bf95 Binary files /dev/null and b/images/vampir_counter_select_metric.png differ diff --git a/images/vampir_io_summary.png b/images/vampir_io_summary.png new file mode 100644 index 00000000..1821eaca Binary files /dev/null and b/images/vampir_io_summary.png differ diff --git a/images/vampir_io_summary_operations.png b/images/vampir_io_summary_operations.png new file mode 100644 index 00000000..abf6795d Binary files /dev/null and b/images/vampir_io_summary_operations.png differ diff --git a/images/vampir_io_timeline.png b/images/vampir_io_timeline.png new file mode 100644 index 00000000..cfe94589 Binary files /dev/null and b/images/vampir_io_timeline.png differ diff --git a/images/vampir_main_view.png b/images/vampir_main_view.png new file mode 100644 index 00000000..1fe38754 Binary files /dev/null and b/images/vampir_main_view.png differ diff --git a/images/vampir_main_view2.png b/images/vampir_main_view2.png new file mode 100644 index 00000000..a4ec51da Binary files /dev/null and b/images/vampir_main_view2.png differ diff --git a/images/vampir_message_summary.png b/images/vampir_message_summary.png new file mode 100644 index 00000000..23645b9c Binary files /dev/null and b/images/vampir_message_summary.png differ diff --git a/images/vampir_message_summary_menu.png 
b/images/vampir_message_summary_menu.png new file mode 100644 index 00000000..a51db375 Binary files /dev/null and b/images/vampir_message_summary_menu.png differ diff --git a/images/vampir_performance_radar.png b/images/vampir_performance_radar.png new file mode 100644 index 00000000..e0935675 Binary files /dev/null and b/images/vampir_performance_radar.png differ diff --git a/images/vampir_performance_radar_data_rate.png b/images/vampir_performance_radar_data_rate.png new file mode 100644 index 00000000..78137810 Binary files /dev/null and b/images/vampir_performance_radar_data_rate.png differ diff --git a/images/vampir_performance_radar_menu.png b/images/vampir_performance_radar_menu.png new file mode 100644 index 00000000..57a9c998 Binary files /dev/null and b/images/vampir_performance_radar_menu.png differ diff --git a/images/vampir_process_summary.png b/images/vampir_process_summary.png new file mode 100644 index 00000000..f2001b82 Binary files /dev/null and b/images/vampir_process_summary.png differ diff --git a/images/vampir_process_summary_menu.png b/images/vampir_process_summary_menu.png new file mode 100644 index 00000000..6ce7f827 Binary files /dev/null and b/images/vampir_process_summary_menu.png differ diff --git a/images/vampir_process_summary_timeline.png b/images/vampir_process_summary_timeline.png new file mode 100644 index 00000000..7170a3c0 Binary files /dev/null and b/images/vampir_process_summary_timeline.png differ diff --git a/images/vampir_process_timeline.png b/images/vampir_process_timeline.png new file mode 100644 index 00000000..7b83110e Binary files /dev/null and b/images/vampir_process_timeline.png differ diff --git a/images/vampir_process_timeline_exclusive.png b/images/vampir_process_timeline_exclusive.png new file mode 100644 index 00000000..dc86cb23 Binary files /dev/null and b/images/vampir_process_timeline_exclusive.png differ diff --git a/images/vampir_prrocess_sumamry_2_clusters.png 
b/images/vampir_prrocess_sumamry_2_clusters.png new file mode 100644 index 00000000..b2f0d3b9 Binary files /dev/null and b/images/vampir_prrocess_sumamry_2_clusters.png differ diff --git a/images/vampir_set_cluster.png b/images/vampir_set_cluster.png new file mode 100644 index 00000000..643aa62a Binary files /dev/null and b/images/vampir_set_cluster.png differ diff --git a/images/vampir_zoom.png b/images/vampir_zoom.png new file mode 100644 index 00000000..d866d340 Binary files /dev/null and b/images/vampir_zoom.png differ diff --git a/software/profiling/Score-P.rst b/software/profiling/Score-P.rst new file mode 100644 index 00000000..1b1217ad --- /dev/null +++ b/software/profiling/Score-P.rst @@ -0,0 +1,979 @@ +.. _scorep: + +******* +Score-P +******* + +The Score-P (Scalable Performance Measurement Infrastructure for Parallel +Codes) instrumenting tool is a scalable and easy-to-use tool suite for +profiling, event tracing, and online analysis of HPC applications. It has been +created in the German BMBF project SILC and the US DOE project PRIMA. Score-P +is developed under a BSD 3-Clause License and governed by a meritocratic +governance model. + +| Website: https://www.vi-hps.org/projects/score-p/ +| Email: support@score-p.org + +Score-P is installed with `Program Database Toolkit (PDT) +<https://www.cs.uoregon.edu/research/pdt/home.php>`_ on Summit. PDT is a +framework for analyzing source code written in several programming languages. +Moreover, `Performance Application Programming Interface (PAPI) +<https://icl.utk.edu/papi/>`_ is supported. PAPI counters are used to assess +CPU performance. In this section, some approaches for profiling and tracing +will be presented. + +Automatic Source Code Instrumentation +===================================== + +Prefix method +~~~~~~~~~~~~~ + +In this approach, we edit the Makefile and prefix the compiler commands with ``scorep``, either in the compiler variables or directly in the build rules. + +.. code:: + + CC = scorep gcc + CXX = scorep g++ + F90 = scorep gfortran + +.. code:: + + CC = gcc + .. 
+ target: target.c + scorep $(CC) -o $@ $^ + + +Wrapper method +~~~~~~~~~~~~~~ + +In this approach we do not need to edit any file as we use CMake. Sometimes only one of the methods works. + +.. code:: + + SCOREP_WRAPPER=off cmake -DCMAKE_C_COMPILER=scorep-gcc -DCMAKE_CXX_COMPILER=scorep-g++ + +.. code:: + + SCOREP_WRAPPER=off ../configure CC=scorep-gcc CXX=scorep-g++ --disable-dependency-tracking + +.. code:: + + make SCOREP_WRAPPER_INSTRUMENTER_FLAGS= + + + +Instrumentation Overview +~~~~~~~~~~~~~~~~~~~~~~~~~ + + +The following Score-P options are useful. + ++-------------------------+------------------------+----------------+-----------------------------+ +| Type of Instrumentation | Instrumenter Switch | Default value | Instrumented routines | ++=========================+========================+================+=============================+ +| MPI | | ``--mpp=mpi`` | (auto) | configured by install | +| | | ``--mpp=none`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ +| SHMEM | | ``--mpp=shmem`` | (auto) | configured by install | +| | | ``--mpp=none`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ +| OpenCL | | ``--opencl`` | enabled | configured by install | +| | | ``--noopencl`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ +| OpenACC | | ``--openacc`` | enabled | configured by install | +| | | ``--noopenacc`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ +| CUDA | | ``--cuda`` | enabled | configured by install | +| | | ``--nocuda`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ +| OpenMP | | ``--thread=omp``/ | (auto) | all parallel constructs | +| | | ``--thread=none`` | | | +| | | ``--openmp`` | | | +| | | ``--noopenmp`` | | | 
++-------------------------+------------------------+----------------+-----------------------------+ +| Pthread | | ``--thread=pthread`` | (auto) | basic Pthread library calls | ++-------------------------+------------------------+----------------+-----------------------------+ +| Compiler | | ``--compiler`` | enabled | all | +| | | ``--nocompiler`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ +| PDT | | ``--pdt`` | disabled | all | +| | | ``--nopdt`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ +| POMP2 | | ``--pomp`` | disabled | manually annotated | +| | | ``--nopomp`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ +| Manual | | ``--user`` | disabled | manually annotated | +| | | ``--nouser`` | | | ++-------------------------+------------------------+----------------+-----------------------------+ + + + +Run-Time Environment Variables +============================== + +The following Score-P environment variables may be useful in job submission scripts. See Score-P manual for more information. 
+ ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| Variable | Default | Description | ++=======================================+==================================+=============================================================================================================+ +| SCOREP_ENABLE_PROFILING | TRUE | Enable profiling | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_ENABLE_TRACING | FALSE | Enable tracing | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_VERBOSE | FALSE | Activate verbose mode | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_TOTAL_MEMORY | 16000k | Total memory in bytes per process to be consumed by the measurement system | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_EXPERIMENT_DIRECTORY | directory based on current time | Declare the path with the directory for the data to be saved | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_OVERWRITE_EXPERIMENT_DIRECTORY | TRUE | Overwrite an existing experiment directory | 
++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_EXECUTABLE | "" | Full path to the executable if Score-P cannot find it | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_PROFILING_MAX_CALLPATH_DEPTH | 30 | Maximum depth of the calltree | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_FILTERING_FILE | "" | A filename with the filter rules | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_METRIC_PAPI | "" | PAPI metric names to measure | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_METRIC_PAPI_PER_PROCESS | "" | List of requested PAPI metric names that will be recorded only by first thread of a process | ++---------------------------------------+----------------------------------+----------------+--------------------------------------------------------------------------------------------+ +| SCOREP_MPI_ENABLE_GROUPS | default | **Value** | **Description** | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``all`` | All MPI functions | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``cg`` | Communication and group management | +| | 
+----------------+--------------------------------------------------------------------------------------------+ +| | | ``coll`` | Collective functions | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``default`` | Includes cg, coll, env, io, p2p, rma, topo, xnonblock | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``env`` | Environmental management | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``err`` | MPI Error handling | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``ext`` | External interface functions | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``io`` | MPI file I/O | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``p2p`` | Peer-to-peer communication | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``misc`` | Miscellaneous | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``perf`` | PControl | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``rma`` | One sided communication | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``spawn`` | Process management | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``topo`` | Topology | +| | 
+----------------+--------------------------------------------------------------------------------------------+ +| | | ``type`` | MPI datatype functions | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``xnonblock`` | Extended non-blocking events | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``xreqtest`` | Test events for uncompleted requests | +| | +----------------+--------------------------------------------------------------------------------------------+ +| | | ``none/no`` | Disable feature | ++---------------------------------------+----------------------------------+----------------+--------------------------------------------------------------------------------------------+ +| SCOREP_MPI_MEMORY_RECORDING | FALSE |Enable tracing of memory allocations done by calls to MPI_ALLOC_MEM and MPI_FREE_MEM, requires the MISC group| ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ +| SCOREP_MPI_ONLINE_ANALYSIS | FALSE | Enable online MPI wait states analysis | ++---------------------------------------+----------------------------------+---------------------+---------------------------------------------------------------------------------------+ +| SCOREP_CUDA_ENABLE | no | **Value** | **Description** | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``runtime`` | CUDA runtime API | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``driver`` | CUDA driver API | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``kernel`` | CUDA kernels | +| | 
+---------------------+---------------------------------------------------------------------------------------+ +| | | ``kernel_serial`` | Serialized kernel recording | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``kernel_counter`` | Fixed CUDA kernel metrics | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``memcpy`` | CUDA memory copies | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``sync`` | Record implicit and explicit CUDA synchronization | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``idle`` | GPU compute idle time | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``pure_idle`` | GPU idle time (memory copies are not idle) | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``gpumemusage`` | Record CUDA memory (de)allocations as a counter | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``references`` | Record references between CUDA activities | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``flushatexit`` | Flush CUDA activity buffer at program exit | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``default/yes/1`` | Includes runtime, kernel, memcpy | +| | +---------------------+---------------------------------------------------------------------------------------+ +| | | ``none/no`` | Disable feature | 
++---------------------------------------+----------------------------------+---------------------+---------------------------------------------------------------------------------------+ +| SCOREP_CUDA_BUFFER | 1M | Total memory in bytes for the CUDA record buffer | ++---------------------------------------+----------------------------------+-----------------------+-------------------------------------------------------------------------------------+ +| SCOREP_OPENACC_ENABLE | no | **Value** | **Description** | +| | +-----------------------+-------------------------------------------------------------------------------------+ +| | | ``regions`` | OpenACC regions | +| | +-----------------------+-------------------------------------------------------------------------------------+ +| | | ``wait`` | OpenACC wait operations | +| | +-----------------------+-------------------------------------------------------------------------------------+ +| | | ``enqueue`` | OpenACC enqueue operations | +| | +-----------------------+-------------------------------------------------------------------------------------+ +| | | ``device_alloc`` | OpenACC device memory allocations | +| | +-----------------------+-------------------------------------------------------------------------------------+ +| | | ``kernel_properties`` | Record kernel properties such as the kernel name, gang, worker and vector size | +| | +-----------------------+-------------------------------------------------------------------------------------+ +| | | ``variable_names`` | Record variable names for OpenACC data allocation and enqueue upload/download | +| | +-----------------------+-------------------------------------------------------------------------------------+ +| | | ``default/yes/1`` | OpenACC regions,enqueue and wait operations | +| | +-----------------------+-------------------------------------------------------------------------------------+ +| | | ``none/no`` | Disable feature | 
++---------------------------------------+----------------------------------+-----------------------+-------------------------------------------------------------------------------------+ +| SCOREP_MEMORY_RECORDING | FALSE | Memory (de)allocations are recorded via libc/C++ API | ++---------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+ + +Example Application: MiniWeather +================================ + +We'll use the open-source `MiniWeather +<https://github.com/mrnorman/miniWeather>`_ application to demonstrate the +capabilities of Score-P. + +Get the Source Code +~~~~~~~~~~~~~~~~~~~ + +.. code:: + + $ git clone https://github.com/mrnorman/miniWeather.git + $ cd miniWeather/c/build + +Compile the Application +~~~~~~~~~~~~~~~~~~~~~~~ + +MiniWeather supports several build modes: serial, MPI, MPI+OpenMP, and +MPI+OpenACC. In order to compile the application, we'll be using the PGI +toolchain, and bring into our environment both ``cmake`` and a parallel +installation of ``NetCDF``. + +.. code:: + + $ module load pgi parallel-netcdf cmake + $ ./cmake_summit_pgi.sh + + +After the compilation ends, there will be executables called `serial`, `openacc`, `mpi`, and `openmp`. + +Below, we'll look at using Score-P to profile each case. + + +Modifications +------------- + +- Edit the makefile and replace ``mpic++`` with ``scorep --mpp=mpi mpic++``. + + +Instrumenting the Serial Version of MiniWeather +----------------------------------------------- + +For a serial application, we should not use a Makefile with a programming +model such as MPI or OpenMP. However, as the source code for this **specific** +case includes MPI headers that are not excluded during the compilation of the +serial version, we should declare a Makefile with MPI. + +- Edit the `cmake_summit_pgi.sh` and replace + +.. code:: + + cmake -DCMAKE_CXX_COMPILER=mpicxx + +with + +.. 
code:: + + SCOREP_WRAPPER=off cmake -DCMAKE_CXX_COMPILER=scorep-mpicxx + + +and execute + +.. code:: + + $ module load pgi + $ module load parallel-netcdf + $ module load scorep/6.0 + $ make serial SCOREP_WRAPPER_INSTRUMENTER_FLAGS="--mpp=mpi" + +If there were no MPI headers, you should edit the `cmake_summit_pgi.sh` with: + +.. code:: + + cmake -DCMAKE_CXX_COMPILER=scorep-pgc++ + +and execute: + +.. code:: + + make serial + +If you want to add PDT, then use the option ``--pdt`` in the variable ``SCOREP_WRAPPER_INSTRUMENTER_FLAGS``. + +Add to your submission script the Score-P variables that you want to use (or +uncomment them below). By default, Score-P enables profiling and does not enable tracing. + +.. code:: + + #PAPI metrics + export SCOREP_METRIC_PAPI=PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS + + export SCOREP_MPI_ENABLE_GROUPS=ALL + export SCOREP_TOTAL_MEMORY=20MB + + time jsrun -n 1 -r 1 -a 1 -c 1 ./serial + + +- When the execution finishes, one directory is created named ``scorep-_