diff --git a/groups/amd_gpu/GDS.txt b/groups/amd_gpu_sdk/GDS.txt similarity index 100% rename from groups/amd_gpu/GDS.txt rename to groups/amd_gpu_sdk/GDS.txt diff --git a/groups/amd_gpu/MEM.txt b/groups/amd_gpu_sdk/MEM.txt similarity index 100% rename from groups/amd_gpu/MEM.txt rename to groups/amd_gpu_sdk/MEM.txt diff --git a/groups/amd_gpu/PCI.txt b/groups/amd_gpu_sdk/PCI.txt similarity index 100% rename from groups/amd_gpu/PCI.txt rename to groups/amd_gpu_sdk/PCI.txt diff --git a/groups/amd_gpu/POWER.txt b/groups/amd_gpu_sdk/POWER.txt similarity index 100% rename from groups/amd_gpu/POWER.txt rename to groups/amd_gpu_sdk/POWER.txt diff --git a/groups/amd_gpu/SALU.txt b/groups/amd_gpu_sdk/SALU.txt similarity index 100% rename from groups/amd_gpu/SALU.txt rename to groups/amd_gpu_sdk/SALU.txt diff --git a/groups/amd_gpu/SFETCH.txt b/groups/amd_gpu_sdk/SFETCH.txt similarity index 100% rename from groups/amd_gpu/SFETCH.txt rename to groups/amd_gpu_sdk/SFETCH.txt diff --git a/groups/amd_gpu/STALLED.txt b/groups/amd_gpu_sdk/STALLED.txt similarity index 100% rename from groups/amd_gpu/STALLED.txt rename to groups/amd_gpu_sdk/STALLED.txt diff --git a/groups/amd_gpu/UTIL.txt b/groups/amd_gpu_sdk/UTIL.txt similarity index 100% rename from groups/amd_gpu/UTIL.txt rename to groups/amd_gpu_sdk/UTIL.txt diff --git a/groups/amd_gpu/VALU.txt b/groups/amd_gpu_sdk/VALU.txt similarity index 100% rename from groups/amd_gpu/VALU.txt rename to groups/amd_gpu_sdk/VALU.txt diff --git a/groups/amd_gpu/WAVE.txt b/groups/amd_gpu_sdk/WAVE.txt similarity index 100% rename from groups/amd_gpu/WAVE.txt rename to groups/amd_gpu_sdk/WAVE.txt diff --git a/groups/amd_gpu_v1/GDS.txt b/groups/amd_gpu_v1/GDS.txt new file mode 100644 index 000000000..39c3446be --- /dev/null +++ b/groups/amd_gpu_v1/GDS.txt @@ -0,0 +1,15 @@ +SHORT GDS Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_GDS +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU GDS rw insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU GDS rw insts per 
work-item = ROCP_SQ_INSTS_GDS/ROCP_SQ_WAVES +-- +The average number of GDS read or GDS write instructions executed +per work item (affected by flow control). diff --git a/groups/amd_gpu_v1/MEM.txt b/groups/amd_gpu_v1/MEM.txt new file mode 100644 index 000000000..acc63a627 --- /dev/null +++ b/groups/amd_gpu_v1/MEM.txt @@ -0,0 +1,18 @@ +SHORT Memory utilization + +EVENTSET +ROCM0 ROCP_TA_TA_BUSY +ROCM1 ROCP_GRBM_GUI_ACTIVE +ROCM2 ROCP_SE_NUM + +METRICS +GPU memory utilization 100*max(ROCM0,16)/ROCM1/ROCM2 + +LONG +Formulas: +GPU memory utilization = 100*max(ROCP_TA_TA_BUSY,16)/ROCP_GRBM_GUI_ACTIVE/ROCP_SE_NUM +-- +The percentage of GPUTime the memory unit is active. The result includes +the stall time (MemUnitStalled). This is measured with all extra fetches +and writes and any cache or memory effects taken into account. +Value range: 0% to 100% (fetch-bound). diff --git a/groups/amd_gpu_v1/PCI.txt b/groups/amd_gpu_v1/PCI.txt new file mode 100644 index 000000000..cefaf307d --- /dev/null +++ b/groups/amd_gpu_v1/PCI.txt @@ -0,0 +1,23 @@ +SHORT PCI Transfers + +EVENTSET +ROCM0 RSMI_PCI_THROUGHPUT_SENT +ROCM1 RSMI_PCI_THROUGHPUT_RECEIVED + + +METRICS +Runtime time +PCI sent ROCM0 +PCI received ROCM1 +PCI send bandwidth 1E-6*ROCM0/time +PCI recv bandwidth 1E-6*ROCM1/time + +LONG +Formulas: +PCI sent = RSMI_PCI_THROUGHPUT_SENT +PCI received = RSMI_PCI_THROUGHPUT_RECEIVED +PCI send bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_SENT/runtime +PCI recv bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_RECEIVED/runtime +-- +Currently not usable since the RSMI_PCI_THROUGHPUT_* events require +one second per call, so 2 seconds for both of them. 
diff --git a/groups/amd_gpu_v1/POWER.txt b/groups/amd_gpu_v1/POWER.txt new file mode 100644 index 000000000..49830efc0 --- /dev/null +++ b/groups/amd_gpu_v1/POWER.txt @@ -0,0 +1,21 @@ +SHORT Power, temperature and voltage + +EVENTSET +ROCM0 RSMI_POWER_AVE[0] +ROCM1 RSMI_TEMP_EDGE +ROCM2 RSMI_VOLT_VDDGFX + + +METRICS +Power average 1E-6*ROCM0 +Edge temperature 1E-3*ROCM1 +Voltage 1E-3*ROCM2 + +LONG +Formulas: +Power average = 1E-6*RSMI_POWER_AVE[0] +Edge temperature = 1E-3*RSMI_TEMP_EDGE +Voltage = 1E-3*RSMI_VOLT_VDDGFX +-- +Gets the current average power consumption in watts, the +temperature in degrees Celsius and the voltage in volts. diff --git a/groups/amd_gpu_v1/SALU.txt b/groups/amd_gpu_v1/SALU.txt new file mode 100644 index 000000000..a693421d1 --- /dev/null +++ b/groups/amd_gpu_v1/SALU.txt @@ -0,0 +1,15 @@ +SHORT SALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SALU insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU SALU insts per work-item = ROCP_SQ_INSTS_SALU/ROCP_SQ_WAVES +-- +The average number of scalar ALU instructions executed per work-item +(affected by flow control). diff --git a/groups/amd_gpu_v1/SFETCH.txt b/groups/amd_gpu_v1/SFETCH.txt new file mode 100644 index 000000000..bd0dfc3ff --- /dev/null +++ b/groups/amd_gpu_v1/SFETCH.txt @@ -0,0 +1,15 @@ +SHORT SFetch Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SMEM +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SFETCH insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU SFETCH insts per work-item = ROCP_SQ_INSTS_SMEM/ROCP_SQ_WAVES +-- +The average number of scalar fetch instructions from the video memory +executed per work-item (affected by flow control). 
diff --git a/groups/amd_gpu_v1/STALLED.txt b/groups/amd_gpu_v1/STALLED.txt new file mode 100644 index 000000000..9d6dc42c4 --- /dev/null +++ b/groups/amd_gpu_v1/STALLED.txt @@ -0,0 +1,19 @@ +SHORT ALU stalled by LDS + +EVENTSET +ROCM0 ROCP_SQ_WAIT_INST_LDS +ROCM1 ROCP_SQ_WAVES +ROCM2 ROCP_GRBM_GUI_ACTIVE + +METRICS +GPU ALU stalled 100*ROCM0*4/ROCM1/ROCM2 + +LONG +Formulas: +GPU ALU stalled = 100*ROCP_SQ_WAIT_INST_LDS*4/ROCP_SQ_WAVES/ROCP_GRBM_GUI_ACTIVE +-- +The percentage of GPUTime ALU units are stalled by the LDS input queue +being full or the output queue being not ready. If there are LDS bank +conflicts, reduce them. Otherwise, try reducing the number of LDS +accesses if possible. +Value range: 0% (optimal) to 100% (bad). diff --git a/groups/amd_gpu_v1/UTIL.txt b/groups/amd_gpu_v1/UTIL.txt new file mode 100644 index 000000000..7d9271e11 --- /dev/null +++ b/groups/amd_gpu_v1/UTIL.txt @@ -0,0 +1,18 @@ +SHORT GPU utilization + +EVENTSET +ROCM0 ROCP_GRBM_COUNT +ROCM1 ROCP_GRBM_GUI_ACTIVE + + +METRICS +GPU utilization 100*ROCM1/ROCM0 + + +LONG +Formulas: +GPU utilization = 100*ROCP_GRBM_GUI_ACTIVE/ROCP_GRBM_COUNT +-- +This group reproduces the 'GPUBusy' metric provided by RocProfiler. +Note that the GPUBusy metric can also be selected directly; the +calculations are then done internally in case the metric formula changes. diff --git a/groups/amd_gpu_v1/VALU.txt b/groups/amd_gpu_v1/VALU.txt new file mode 100644 index 000000000..5d57b9b20 --- /dev/null +++ b/groups/amd_gpu_v1/VALU.txt @@ -0,0 +1,15 @@ +SHORT VALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_VALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU VALU insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU VALU insts per work-item = ROCP_SQ_INSTS_VALU/ROCP_SQ_WAVES +-- +The average number of vector ALU instructions executed per work-item +(affected by flow control). 
diff --git a/groups/amd_gpu_v1/WAVE.txt b/groups/amd_gpu_v1/WAVE.txt new file mode 100644 index 000000000..fe8914ae1 --- /dev/null +++ b/groups/amd_gpu_v1/WAVE.txt @@ -0,0 +1,15 @@ +SHORT Wavefronts + +EVENTSET +ROCM0 ROCP_SQ_WAVES + + +METRICS +GPU wavefronts ROCM0 + + +LONG +Formulas: +GPU wavefronts = ROCP_SQ_WAVES +-- +Total Wavefronts diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h index 187a04f63..76e3b42f0 100644 --- a/src/includes/rocmon_sdk.h +++ b/src/includes/rocmon_sdk.h @@ -397,7 +397,6 @@ _rocmon_sdk_read_buffers(rocprofiler_context_id_t device_context, if(h->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && h->kind == ROCPROFILER_COUNTER_RECORD_VALUE) { rocprofiler_record_counter_t* r = h->payload; - printf("Counter ID %d Value %f Dispatch %ld\n", r->id, r->counter_value, r->dispatch_id); rocprofiler_counter_id_t cid = {.handle = 0}; (*rocprofiler_query_record_counter_id_ptr)(r->id, &cid); for (int j = 0; j < context->numDevices; j++) @@ -619,8 +618,6 @@ _rocmon_sdk_set_profile(rocprofiler_context_id_t context_id, - - rocprofiler_tool_configure_result_t* rocprofiler_configure(uint32_t version, const char* runtime_version, @@ -658,7 +655,6 @@ rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) } if (rocmon_sdk_initialized) { - return 0; } diff --git a/src/includes/rocmon_smi.h b/src/includes/rocmon_smi.h index 686d8e92a..bfc9ce156 100644 --- a/src/includes/rocmon_smi.h +++ b/src/includes/rocmon_smi.h @@ -1156,7 +1156,7 @@ void rocmon_smi_finalize(RocmonContext* context) } ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown RSMI); RSMI_CALL(rsmi_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); + ERROR_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI failed); // fall through }); rocmon_smi_initialized = FALSE; diff --git a/src/includes/rocmon_v1.h b/src/includes/rocmon_v1.h index bf0fe7e03..2cae677dd 100644 --- a/src/includes/rocmon_v1.h +++ b/src/includes/rocmon_v1.h @@ -575,7 +575,7 @@ 
rocmon_v1_finalize(RocmonContext* context) } ROCM_CALL(hsa_shut_down, (), { - //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); + ERROR_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA failed); // fall through }); }