From d3e1eb829e1dbeb1af8da7602cde0fc7ed2a94f6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 10 Nov 2024 17:24:40 +0100 Subject: [PATCH] Update code to work again but only v1 and smi, sdk still fails to init --- src/includes/rocmon_sdk.h | 156 ++++++++++++++++++++++++++++++-------- src/includes/rocmon_smi.h | 9 ++- src/includes/rocmon_v1.h | 2 +- src/rocmon.c | 83 ++++++++++---------- 4 files changed, 171 insertions(+), 79 deletions(-) diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h index 76e3b42f0..9aa6820f6 100644 --- a/src/includes/rocmon_sdk.h +++ b/src/includes/rocmon_sdk.h @@ -179,6 +179,7 @@ _rocmon_sdk_link_libraries() DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_assign_callback_thread); DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_record_counter_id); DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_is_initialized); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_status_string); DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_init); DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_shut_down); @@ -452,32 +453,12 @@ _rocmon_sdk_read_buffers(rocprofiler_context_id_t device_context, } -int -tool_init(rocprofiler_client_finalize_t fini, void* udata) +static int _rocmon_sdk_create_devices(RocmonContext** stat_context) { rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; - RocmonContext** stat_context = (RocmonContext**)udata; RocmonContext* context = *stat_context; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init); - - // initialize libraries - if (_rocmon_sdk_link_libraries() < 0) - { - ERROR_PLAIN_PRINT(Failed to initialize libraries); - return -EFAULT; - } - -/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA);*/ -/* hsa_status_t hstat = (*hsa_init_ptr)();*/ -/* if (hstat != HSA_STATUS_SUCCESS)*/ -/* {*/ -/* ERROR_PRINT(Failed to initialize HSA);*/ -/* return -EFAULT;*/ -/* }*/ - - //ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;); ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents); - stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata); + stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), stat_context); if (stat != ROCPROFILER_STATUS_SUCCESS) { ERROR_PRINT(Failed to query available agents); @@ -488,7 +469,6 @@ tool_init(rocprofiler_client_finalize_t fini, void* udata) FREE_IF_NOT_NULL(context->devices); return -1; } - for (int i = 0; i < context->numDevices; i++) { rocprofiler_context_id_t device_context; @@ -544,6 +524,99 @@ tool_init(rocprofiler_client_finalize_t fini, void* udata) return 0; } +int +tool_init(rocprofiler_client_finalize_t fini, void* udata) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init); + + // initialize libraries + if (_rocmon_sdk_link_libraries() < 0) + { + ERROR_PLAIN_PRINT(Failed to initialize libraries); + return -EFAULT; + } + return _rocmon_sdk_create_devices(stat_context); + +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA);*/ +/* hsa_status_t hstat = (*hsa_init_ptr)();*/ +/* if (hstat != HSA_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to initialize HSA);*/ +/* return -EFAULT;*/ +/* }*/ + + //ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;); +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents);*/ +/* stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to query available agents);*/ +/* return -EFAULT;*/ +/* }*/ +/* if (context->numDevices == 0)*/ +/* {*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -1;*/ +/* }*/ + +/* for (int i = 0; i < context->numDevices; i++)*/ +/* {*/ +/* rocprofiler_context_id_t device_context;*/ +/* rocprofiler_buffer_id_t buffer;*/ +/* rocprofiler_callback_thread_t thread;*/ +/* RocmonDevice* device = &context->devices[i];*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating context for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_create_context_ptr)(&device_context);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating buffer for device %d, device->deviceId);*/ +/* rocmon_sdk_read_buffers_cb devdata = {*/ +/* .context = stat_context,*/ +/* .devid = device->deviceId,*/ +/* .devcontext = device_context*/ +/* };*/ +/* stat = (*rocprofiler_create_buffer_ptr)(device_context, 100, 50, ROCPROFILER_BUFFER_POLICY_LOSSLESS, _rocmon_sdk_read_buffers, &devdata, &buffer);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating callback thread for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_create_callback_thread_ptr)(&thread);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Assign callback thread to buffer for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_assign_callback_thread_ptr)(buffer, thread);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* */ +/* device->sdk_context = device_context;*/ +/* device->buffer = buffer;*/ +/* device->thread = thread;*/ +/* }*/ +/* return 0;*/ +} + void tool_fini(void* udata) @@ -648,6 +721,7 @@ int rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) { int ret = 0; + rocprofiler_context_id_t text_context; rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; if ((numGpus < 0) || (!gpuIds) || (!context)) { @@ -666,11 +740,23 @@ rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) return ret; } - stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure); - if (stat != ROCPROFILER_STATUS_SUCCESS) +/* stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to configure rocprofiler: %s, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* return -EFAULT;*/ +/* }*/ +/* stat = (*rocprofiler_create_context_ptr)(&text_context);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to create test context: %s, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* return -EFAULT;*/ +/* }*/ + ret = _rocmon_sdk_create_devices(&rocmon_context); + if (ret < 0) { - ERROR_PLAIN_PRINT(Failed to configure rocprofiler); - return -EFAULT; + ERROR_PRINT(Failed to create SDK devices); + return ret; } if (context->numDevices == 0) @@ -1085,12 +1171,16 @@ _rocmon_readCounters_rocprofiler_sdk(RocmonDevice* device) return -EFAULT; } } -/* stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);*/ -/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ -/* {*/ -/* ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ -/* return -EFAULT;*/ -/* }*/ + else + { + ERROR_PRINT(Device context for device %d not active, device->deviceId); + } + stat = (*rocprofiler_flush_buffer_ptr)(device->buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } return 0; } diff --git a/src/includes/rocmon_smi.h b/src/includes/rocmon_smi.h index bfc9ce156..d40990a64 100644 --- a/src/includes/rocmon_smi.h +++ b/src/includes/rocmon_smi.h @@ -932,6 +932,7 @@ rocmon_smi_startCounters(RocmonContext* context) for (int i = 0; i < context->numDevices; i++) { RocmonDevice* device = &context->devices[i]; + fprintf(stderr, "Device %d with %d SMI events\n", device->deviceId, device->numActiveSmiEvents); // Check if there are any counters to start if (device->numActiveSmiEvents <= 0) { @@ -940,11 +941,11 @@ rocmon_smi_startCounters(RocmonContext* context) // Save baseline values RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) + for (int j = 0; j < device->numActiveSmiEvents; j++) { double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + RocmonSmiEvent* event = &device->activeSmiEvents[j]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+j]; // Measure counter if (event->measureFunc) @@ -1156,7 +1157,7 @@ void rocmon_smi_finalize(RocmonContext* context) } ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown RSMI); RSMI_CALL(rsmi_shut_down, (), { - ERROR_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI failed); + ERROR_PRINT(Shutdown SMI failed); // fall through }); rocmon_smi_initialized = FALSE; diff --git a/src/includes/rocmon_v1.h b/src/includes/rocmon_v1.h index 2cae677dd..3fe05b0c7 100644 --- a/src/includes/rocmon_v1.h +++ b/src/includes/rocmon_v1.h @@ -575,7 +575,7 @@ rocmon_v1_finalize(RocmonContext* context) } ROCM_CALL(hsa_shut_down, (), { - ERROR_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA failed); + ERROR_PRINT(Shutdown HSA failed); // fall through }); } diff --git a/src/rocmon.c b/src/rocmon.c index cde753b78..f767b29dc 100644 --- a/src/rocmon.c +++ b/src/rocmon.c @@ -372,23 +372,8 @@ rocmon_setupCounters(int gid) // // Separate rocprofiler and SMI events // - const char **smiEvents = NULL, **rocEvents = NULL; int numSmiEvents = 0, numRocEvents = 0; - // Allocate memory for string arrays - smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (smiEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); - return -ENOMEM; - } - rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (rocEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); - free(smiEvents); - return -ENOMEM; - } // Go through each event and sort it for (int i = 0; i < group->nevents; i++) @@ -397,13 +382,11 @@ rocmon_setupCounters(int gid) if (strncmp(name, "RSMI_", 5) == 0) { // RSMI event - smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix numSmiEvents++; } else if (strncmp(name, "ROCP_", 5) == 0) { // Rocprofiler event - rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix numRocEvents++; } else @@ -414,48 +397,66 @@ rocmon_setupCounters(int gid) } } - // Add events to each device for (int i = 0; i < rocmon_context->numDevices; i++) { RocmonDevice* device = &rocmon_context->devices[i]; + device->numActiveSmiEvents = 0; + device->numActiveRocEvents = 0; + } - // Add rocprofiler events - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents); - if (rocmon_context->use_rocprofiler_v1) - { - ret = rocmon_v1_setupCounters(rocmon_context, gid); - } + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents); + if (rocmon_context->use_rocprofiler_v1) + { + ret = rocmon_v1_setupCounters(rocmon_context, gid); + } #ifdef LIKWID_ROCPROF_SDK - else - { - ret = rocmon_sdk_setupCounters(rocmon_context, gid); - } + else + { + ret = rocmon_sdk_setupCounters(rocmon_context, gid); + } #endif - if (ret < 0) - { - ERROR_PRINT(Setting up rocprofiler counters failed); - free(smiEvents); - free(rocEvents); - return ret; - } + if (ret < 0) + { + ERROR_PRINT(Setting up rocprofiler counters failed); +/* free(smiEvents);*/ +/* free(rocEvents);*/ + return ret; + } - // Add SMI events + // Add SMI events + if (numSmiEvents > 0) + { ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCM SMI WITH %d events, numSmiEvents); ret = rocmon_smi_setupCounters(rocmon_context, gid); if (ret < 0) { ERROR_PRINT(Setting up SMI counters failed); - free(smiEvents); - free(rocEvents); +/* free(smiEvents);*/ +/* free(rocEvents);*/ return ret; } + } + else + { + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + device->numActiveSmiEvents = 0; + } + } + + // Add events to each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; device->activeGroup = gid; } rocmon_context->activeGroup = gid; rocmon_context->state = ROCMON_STATE_SETUP; - // Cleanup - free(smiEvents); - free(rocEvents); +/* // Cleanup*/ +/* free(smiEvents);*/ +/* free(rocEvents);*/ return 0; }