Skip to content

Commit

Permalink
Merge branch 'fix-hip-profile' into 'rc'
Browse files Browse the repository at this point in the history
[P0]: fix the profiling for HIP

See merge request StanfordLegion/legion!1573
  • Loading branch information
eddy16112 committed Dec 11, 2024
2 parents a1574fb + 2f3fd1b commit 0c01a36
Show file tree
Hide file tree
Showing 10 changed files with 70 additions and 18 deletions.
7 changes: 5 additions & 2 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,9 @@ cache: &global_cache
TEST_REGENT: "0"
LEGION_WARNINGS_FATAL: "1"

.realm: &realm # test realm using makefile
TEST_REALM: "1"

.ctest: &ctest
TEST_CTEST: "1"

Expand Down Expand Up @@ -1252,10 +1255,10 @@ clang14_cxx20_release_regent:
<<: [*clang14, *terra14, *release, *cxx20_normal, *regent]

# HIP CUDA tests:
nvidia_hip_cuda125_gcc10_cxx17_debug_cuda_legion:
nvidia_hip_cuda125_gcc10_cxx17_debug_cuda_legion_realm:
<<: [*nvidia, *tests]
variables:
<<: [*gcc11, *debug, *cxx17_normal, *hip_cuda125, *legion, *short, *shared]
<<: [*gcc11, *debug, *cxx17_normal, *hip_cuda125, *legion, *realm, *short, *shared]

# CUDA tests:
nvidia_cuda117_gcc10_cxx17_debug_cmake_cuda_dynamic_legion:
Expand Down
4 changes: 3 additions & 1 deletion runtime/realm/hip/hip_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ namespace Realm {
class GPUWorkFence : public Realm::Operation::AsyncWorkItem {
public:
GPUWorkFence(Realm::Operation *op);


virtual void mark_finished(bool successful);

virtual void request_cancellation(void);

void enqueue_on_stream(GPUStream *stream);
Expand Down
8 changes: 8 additions & 0 deletions runtime/realm/hip/hip_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,14 @@ namespace Realm {
: Realm::Operation::AsyncWorkItem(op)
{}

void GPUWorkFence::mark_finished(bool successful)
{
if(op->wants_gpu_work_start()) {
op->add_gpu_work_end(Clock::current_time_in_nanoseconds());
}
AsyncWorkItem::mark_finished(successful);
}

void GPUWorkFence::request_cancellation(void)
{
// ignored - no way to shoot down HIP work
Expand Down
5 changes: 3 additions & 2 deletions runtime/realm/operation.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,9 @@ namespace Realm {

void Operation::mark_gpu_work_start()
{
if(wants_gpu_timeline)
timeline_gpu.record_start_time();
if(wants_gpu_timeline) {
add_gpu_work_start(Clock::current_time_in_nanoseconds());
}
}

void Operation::add_gpu_work_start(uint64_t timestamp)
Expand Down
2 changes: 0 additions & 2 deletions runtime/realm/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,6 @@ namespace Realm {
timestamp_t start_time; // when was the GPU started?
timestamp_t end_time; // when was the GPU completed?

inline void record_start_time(void);
inline void record_end_time(void);
inline bool is_valid(void) const;
};

Expand Down
8 changes: 0 additions & 8 deletions runtime/realm/profiling.inl
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,6 @@ namespace Realm {
//
// struct OperationTimeLineGPU
//
inline void OperationTimelineGPU::record_start_time(void)
{
start_time = Clock::current_time_in_nanoseconds();
}
inline void OperationTimelineGPU::record_end_time(void)
{
end_time = Clock::current_time_in_nanoseconds();
}
inline bool OperationTimelineGPU::is_valid(void) const
{
return ((start_time != INVALID_TIMESTAMP) &&
Expand Down
1 change: 1 addition & 0 deletions test/realm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ if(Legion_USE_HIP)
set(HIPSRC_memspeed memspeed_gpu.cu)
set(HIPSRC_simple_reduce simple_reduce_gpu.cu)
set(HIPSRC_multiaffine multiaffine_gpu.cu)
set(HIPSRC_test_profiling test_profiling_gpu.cu)

# FIXME: https://github.com/StanfordLegion/legion/issues/1308
# list(APPEND REALM_TESTS cuda_arrays)
Expand Down
8 changes: 6 additions & 2 deletions test/realm/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -203,12 +203,14 @@ ifeq ($(strip $(USE_CUDA)),1)
EXTRAOBJS_memspeed := memspeed_gpu.o
EXTRAOBJS_simple_reduce := simple_reduce_gpu.o
EXTRAOBJS_multiaffine := multiaffine_gpu.o
EXTRAOBJS_test_profiling := test_profiling_gpu.o
EXTRAOBJS_task_stream := task_stream_gpu.o
EXTRAOBJS_test_cuhook := test_cuhook_gpu.o
TEST_OBJS += $(EXTRAOBJS_memspeed) $(EXTRAOBJS_simple_reduce) $(EXTRAOBJS_multiaffine) $(EXTRAOBJS_task_stream) $(EXTRAOBJS_test_cuhook)
TEST_OBJS += $(EXTRAOBJS_memspeed) $(EXTRAOBJS_simple_reduce) $(EXTRAOBJS_multiaffine) $(EXTRAOBJS_test_profiling) $(EXTRAOBJS_task_stream) $(EXTRAOBJS_test_cuhook)
memspeed : memspeed_gpu.o
simple_reduce : simple_reduce_gpu.o
multiaffine : multiaffine_gpu.o
test_profiling : test_profiling_gpu.o
task_stream : task_stream_gpu.o
test_cuhook : test_cuhook_gpu.o

Expand Down Expand Up @@ -238,11 +240,13 @@ ifeq ($(strip $(USE_HIP)),1)
EXTRAOBJS_memspeed := memspeed_gpu.o
EXTRAOBJS_simple_reduce := simple_reduce_gpu.o
EXTRAOBJS_multiaffine := multiaffine_gpu.o
EXTRAOBJS_test_profiling := test_profiling_gpu.o
EXTRAOBJS_task_stream := task_stream_gpu.o
TEST_OBJS += $(EXTRAOBJS_memspeed) $(EXTRAOBJS_simple_reduce) $(EXTRAOBJS_multiaffine) $(EXTRAOBJS_task_stream)
TEST_OBJS += $(EXTRAOBJS_memspeed) $(EXTRAOBJS_simple_reduce) $(EXTRAOBJS_multiaffine) $(EXTRAOBJS_test_profiling) $(EXTRAOBJS_task_stream)
memspeed : memspeed_gpu.o
simple_reduce : simple_reduce_gpu.o
multiaffine : multiaffine_gpu.o
test_profiling : test_profiling_gpu.o
task_stream : task_stream_gpu.o

%.o : %.cu $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER)
Expand Down
13 changes: 13 additions & 0 deletions test/realm/test_profiling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "realm.h"
#include "realm/cuda/cuda_module.h"
#include "realm/hip/hip_module.h"
#include "realm/profiling.h"

#include "osdep.h"
Expand All @@ -22,6 +23,10 @@ Logger log_app("app");
extern void launch_spin_kernel(uint64_t t_ns, CUstream);
#endif

#ifdef REALM_USE_HIP
extern void launch_spin_kernel(uint64_t t_ns, unifiedHipStream_t *);
#endif

// Task IDs, some IDs are reserved so start at first available number
enum {
TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE+0,
Expand Down Expand Up @@ -98,6 +103,14 @@ void child_task(const void *args, size_t arglen, const void *userdata, size_t us
}
#endif // REALM_USE_CUDA

#ifdef REALM_USE_HIP
Realm::Hip::HipModule *module =
Realm::Runtime::get_runtime().get_module<Realm::Hip::HipModule>("hip");
if(module != nullptr) {
launch_spin_kernel(10000, module->get_task_hip_stream());
}
#endif // REALM_USE_HIP

#ifdef REALM_USE_EXCEPTIONS
bool inject_fault = *(const bool *)args;
if(inject_fault) {
Expand Down
32 changes: 31 additions & 1 deletion test/realm/test_profiling_gpu.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include "realm.h"
#ifdef REALM_USE_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#include <cstdint>
Expand All @@ -16,4 +18,32 @@ void launch_spin_kernel(uint64_t t_ns, CUstream stream)
void *args[] = {&t_ns};
cudaError_t err = cudaLaunchKernel(reinterpret_cast<void *>(spin_kernel), dim3(1),
dim3(1), args, 0, static_cast<cudaStream_t>(stream));
}
}
#endif

#ifdef REALM_USE_HIP
#include <hip/hip_runtime.h>

__global__ void spin_kernel(uint64_t t_ns)
{
uint64_t start = clock64();
uint64_t current = start;

const uint64_t clock_frequency = 1000000; // MHz -> ns
uint64_t target_cycles = (t_ns * clock_frequency) / 1000000000;

while ((current - start) < target_cycles) {
current = clock64();
}
}

void launch_spin_kernel(uint64_t t_ns, hipStream_t stream)
{
void *args[] = {&t_ns};
hipError_t err = hipLaunchKernel(reinterpret_cast<void*>(spin_kernel), dim3(1),
dim3(1), args, 0, stream);
if (err != hipSuccess) {
printf("Error launching spin kernel: %s\n", hipGetErrorString(err));
}
}
#endif

0 comments on commit 0c01a36

Please sign in to comment.