Skip to content

Commit

Permalink
Update code to work again but only v1 and smi, sdk still fails to init
Browse files Browse the repository at this point in the history
  • Loading branch information
TomTheBear committed Nov 10, 2024
1 parent 741c2ad commit d3e1eb8
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 79 deletions.
156 changes: 123 additions & 33 deletions src/includes/rocmon_sdk.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ _rocmon_sdk_link_libraries()
DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_assign_callback_thread);
DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_record_counter_id);
DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_is_initialized);
DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_status_string);

DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_init);
DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_shut_down);
Expand Down Expand Up @@ -452,32 +453,12 @@ _rocmon_sdk_read_buffers(rocprofiler_context_id_t device_context,
}


int
tool_init(rocprofiler_client_finalize_t fini, void* udata)
static int _rocmon_sdk_create_devices(RocmonContext** stat_context)
{
rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS;
RocmonContext** stat_context = (RocmonContext**)udata;
RocmonContext* context = *stat_context;
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init);

// initialize libraries
if (_rocmon_sdk_link_libraries() < 0)
{
ERROR_PLAIN_PRINT(Failed to initialize libraries);
return -EFAULT;
}

/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA);*/
/* hsa_status_t hstat = (*hsa_init_ptr)();*/
/* if (hstat != HSA_STATUS_SUCCESS)*/
/* {*/
/* ERROR_PRINT(Failed to initialize HSA);*/
/* return -EFAULT;*/
/* }*/

//ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;);
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents);
stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata);
stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), stat_context);
if (stat != ROCPROFILER_STATUS_SUCCESS)
{
ERROR_PRINT(Failed to query available agents);
Expand All @@ -488,7 +469,6 @@ tool_init(rocprofiler_client_finalize_t fini, void* udata)
FREE_IF_NOT_NULL(context->devices);
return -1;
}

for (int i = 0; i < context->numDevices; i++)
{
rocprofiler_context_id_t device_context;
Expand Down Expand Up @@ -544,6 +524,99 @@ tool_init(rocprofiler_client_finalize_t fini, void* udata)
return 0;
}

/*
 * Tool initialization callback.
 *
 * Resolves the dynamically loaded library symbols via
 * _rocmon_sdk_link_libraries() and then delegates the per-device setup to
 * _rocmon_sdk_create_devices().
 *
 * Parameters:
 *   fini   finalize handle passed in by the caller; not used here.
 *   udata  opaque user data, interpreted as a RocmonContext** and forwarded
 *          to _rocmon_sdk_create_devices().
 *          NOTE(review): presumably this is the pointer registered together
 *          with this callback in rocprofiler_configure - confirm at the
 *          registration site.
 *
 * Returns:
 *   -EINVAL if udata is NULL,
 *   -EFAULT if the libraries cannot be linked,
 *   otherwise the return value of _rocmon_sdk_create_devices().
 */
int
tool_init(rocprofiler_client_finalize_t fini, void* udata)
{
    RocmonContext** stat_context = (RocmonContext**)udata;

    (void)fini; /* required by the callback signature, intentionally unused */

    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init);

    /* Reject a missing context pointer instead of dereferencing it later. */
    if (stat_context == NULL)
    {
        ERROR_PLAIN_PRINT(Invalid user data for tool_init);
        return -EINVAL;
    }

    // initialize libraries
    if (_rocmon_sdk_link_libraries() < 0)
    {
        ERROR_PLAIN_PRINT(Failed to initialize libraries);
        return -EFAULT;
    }

    /* Agent query plus context/buffer/thread creation for each device. */
    return _rocmon_sdk_create_devices(stat_context);
}


void
tool_fini(void* udata)
Expand Down Expand Up @@ -648,6 +721,7 @@ int
rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds)
{
int ret = 0;
rocprofiler_context_id_t text_context;
rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS;
if ((numGpus < 0) || (!gpuIds) || (!context))
{
Expand All @@ -666,11 +740,23 @@ rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds)
return ret;
}

stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure);
if (stat != ROCPROFILER_STATUS_SUCCESS)
/* stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure);*/
/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/
/* {*/
/* ERROR_PRINT(Failed to configure rocprofiler: %s, (*rocprofiler_get_status_string_ptr)(stat));*/
/* return -EFAULT;*/
/* }*/
/* stat = (*rocprofiler_create_context_ptr)(&text_context);*/
/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/
/* {*/
/* ERROR_PRINT(Failed to create test context: %s, (*rocprofiler_get_status_string_ptr)(stat));*/
/* return -EFAULT;*/
/* }*/
ret = _rocmon_sdk_create_devices(&rocmon_context);
if (ret < 0)
{
ERROR_PLAIN_PRINT(Failed to configure rocprofiler);
return -EFAULT;
ERROR_PRINT(Failed to create SDK devices);
return ret;
}

if (context->numDevices == 0)
Expand Down Expand Up @@ -1085,12 +1171,16 @@ _rocmon_readCounters_rocprofiler_sdk(RocmonDevice* device)
return -EFAULT;
}
}
/* stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);*/
/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/
/* {*/
/* ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/
/* return -EFAULT;*/
/* }*/
else
{
ERROR_PRINT(Device context for device %d not active, device->deviceId);
}
stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);
if (stat != ROCPROFILER_STATUS_SUCCESS)
{
ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
return -EFAULT;
}
return 0;
}

Expand Down
9 changes: 5 additions & 4 deletions src/includes/rocmon_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,7 @@ rocmon_smi_startCounters(RocmonContext* context)
for (int i = 0; i < context->numDevices; i++)
{
RocmonDevice* device = &context->devices[i];
fprintf(stderr, "Device %d with %d SMI events\n", device->deviceId, device->numActiveSmiEvents);
// Check if there are any counters to start
if (device->numActiveSmiEvents <= 0)
{
Expand All @@ -940,11 +941,11 @@ rocmon_smi_startCounters(RocmonContext* context)

// Save baseline values
RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup];
for (int i = 0; i < device->numActiveSmiEvents; i++)
for (int j = 0; j < device->numActiveSmiEvents; j++)
{
double value = 0;
RocmonSmiEvent* event = &device->activeSmiEvents[i];
RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i];
RocmonSmiEvent* event = &device->activeSmiEvents[j];
RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+j];

// Measure counter
if (event->measureFunc)
Expand Down Expand Up @@ -1156,7 +1157,7 @@ void rocmon_smi_finalize(RocmonContext* context)
}
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown RSMI);
RSMI_CALL(rsmi_shut_down, (), {
ERROR_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI failed);
ERROR_PRINT(Shutdown SMI failed);
// fall through
});
rocmon_smi_initialized = FALSE;
Expand Down
2 changes: 1 addition & 1 deletion src/includes/rocmon_v1.h
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ rocmon_v1_finalize(RocmonContext* context)
}

ROCM_CALL(hsa_shut_down, (), {
ERROR_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA failed);
ERROR_PRINT(Shutdown HSA failed);
// fall through
});
}
Expand Down
83 changes: 42 additions & 41 deletions src/rocmon.c
Original file line number Diff line number Diff line change
Expand Up @@ -372,23 +372,8 @@ rocmon_setupCounters(int gid)
//
// Separate rocprofiler and SMI events
//
const char **smiEvents = NULL, **rocEvents = NULL;
int numSmiEvents = 0, numRocEvents = 0;

// Allocate memory for string arrays
smiEvents = (const char**) malloc(group->nevents * sizeof(const char*));
if (smiEvents == NULL)
{
ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array);
return -ENOMEM;
}
rocEvents = (const char**) malloc(group->nevents * sizeof(const char*));
if (rocEvents == NULL)
{
ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array);
free(smiEvents);
return -ENOMEM;
}

// Go through each event and sort it
for (int i = 0; i < group->nevents; i++)
Expand All @@ -397,13 +382,11 @@ rocmon_setupCounters(int gid)
if (strncmp(name, "RSMI_", 5) == 0)
{
// RSMI event
smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix
numSmiEvents++;
}
else if (strncmp(name, "ROCP_", 5) == 0)
{
// Rocprofiler event
rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix
numRocEvents++;
}
else
Expand All @@ -414,48 +397,66 @@ rocmon_setupCounters(int gid)
}
}

// Add events to each device
for (int i = 0; i < rocmon_context->numDevices; i++)
{
RocmonDevice* device = &rocmon_context->devices[i];
device->numActiveSmiEvents = 0;
device->numActiveRocEvents = 0;
}

// Add rocprofiler events
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents);
if (rocmon_context->use_rocprofiler_v1)
{
ret = rocmon_v1_setupCounters(rocmon_context, gid);
}
// Add rocprofiler events
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents);
if (rocmon_context->use_rocprofiler_v1)
{
ret = rocmon_v1_setupCounters(rocmon_context, gid);
}
#ifdef LIKWID_ROCPROF_SDK
else
{
ret = rocmon_sdk_setupCounters(rocmon_context, gid);
}
else
{
ret = rocmon_sdk_setupCounters(rocmon_context, gid);
}
#endif
if (ret < 0)
{
ERROR_PRINT(Setting up rocprofiler counters failed);
free(smiEvents);
free(rocEvents);
return ret;
}
if (ret < 0)
{
ERROR_PRINT(Setting up rocprofiler counters failed);
/* free(smiEvents);*/
/* free(rocEvents);*/
return ret;
}

// Add SMI events
// Add SMI events
if (numSmiEvents > 0)
{
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCM SMI WITH %d events, numSmiEvents);
ret = rocmon_smi_setupCounters(rocmon_context, gid);
if (ret < 0)
{
ERROR_PRINT(Setting up SMI counters failed);
free(smiEvents);
free(rocEvents);
/* free(smiEvents);*/
/* free(rocEvents);*/
return ret;
}
}
else
{
for (int i = 0; i < rocmon_context->numDevices; i++)
{
RocmonDevice* device = &rocmon_context->devices[i];
device->numActiveSmiEvents = 0;
}
}

// Add events to each device
for (int i = 0; i < rocmon_context->numDevices; i++)
{
RocmonDevice* device = &rocmon_context->devices[i];
device->activeGroup = gid;
}
rocmon_context->activeGroup = gid;
rocmon_context->state = ROCMON_STATE_SETUP;
// Cleanup
free(smiEvents);
free(rocEvents);
/* // Cleanup*/
/* free(smiEvents);*/
/* free(rocEvents);*/

return 0;
}
Expand Down

0 comments on commit d3e1eb8

Please sign in to comment.