Merge branch 'fix-hip-profile' into 'rc'

[P0]: fix the profiling for HIP See merge request StanfordLegion/legion!1573
StanfordLegion · Dec 11, 2024 · 0c01a36 · 0c01a36
2 parents a1574fb + 2f3fd1b
commit 0c01a36
Show file tree

Hide file tree

Showing 10 changed files with 70 additions and 18 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -218,6 +218,9 @@ cache: &global_cache
   TEST_REGENT: "0"
   LEGION_WARNINGS_FATAL: "1"
 
+.realm: &realm # test realm using makefile
+  TEST_REALM: "1"
+
 .ctest: &ctest
   TEST_CTEST: "1"
 
@@ -1252,10 +1255,10 @@ clang14_cxx20_release_regent:
     <<: [*clang14, *terra14, *release, *cxx20_normal, *regent]
 
 # HIP CUDA tests:
-nvidia_hip_cuda125_gcc10_cxx17_debug_cuda_legion:
+nvidia_hip_cuda125_gcc10_cxx17_debug_cuda_legion_realm:
   <<: [*nvidia, *tests]
   variables:
-    <<: [*gcc11, *debug, *cxx17_normal, *hip_cuda125, *legion, *short, *shared]
+    <<: [*gcc11, *debug, *cxx17_normal, *hip_cuda125, *legion, *realm, *short, *shared]
 
 # CUDA tests:
 nvidia_cuda117_gcc10_cxx17_debug_cmake_cuda_dynamic_legion:

diff --git a/runtime/realm/hip/hip_internal.h b/runtime/realm/hip/hip_internal.h
@@ -122,7 +122,9 @@ namespace Realm {
     class GPUWorkFence : public Realm::Operation::AsyncWorkItem {
     public:
       GPUWorkFence(Realm::Operation *op);
-
+
+      virtual void mark_finished(bool successful);
+
       virtual void request_cancellation(void);
 
       void enqueue_on_stream(GPUStream *stream);

diff --git a/runtime/realm/hip/hip_module.cc b/runtime/realm/hip/hip_module.cc
@@ -306,6 +306,14 @@ namespace Realm {
       : Realm::Operation::AsyncWorkItem(op)
     {}
 
+    void GPUWorkFence::mark_finished(bool successful)
+    {
+      if(op->wants_gpu_work_start()) {
+        op->add_gpu_work_end(Clock::current_time_in_nanoseconds());
+      }
+      AsyncWorkItem::mark_finished(successful);
+    }
+
     void GPUWorkFence::request_cancellation(void)
     {
       // ignored - no way to shoot down HIP work

diff --git a/runtime/realm/operation.cc b/runtime/realm/operation.cc
@@ -202,8 +202,9 @@ namespace Realm {
 
   void Operation::mark_gpu_work_start()
   {
-    if(wants_gpu_timeline)
-      timeline_gpu.record_start_time();
+    if(wants_gpu_timeline) {
+      add_gpu_work_start(Clock::current_time_in_nanoseconds());
+    }
   }
 
   void Operation::add_gpu_work_start(uint64_t timestamp)

diff --git a/runtime/realm/profiling.h b/runtime/realm/profiling.h
@@ -141,8 +141,6 @@ namespace Realm {
       timestamp_t start_time; // when was the GPU started?
       timestamp_t end_time; // when was the GPU completed?
 
-      inline void record_start_time(void);
-      inline void record_end_time(void);
       inline bool is_valid(void) const;
     };
 

diff --git a/runtime/realm/profiling.inl b/runtime/realm/profiling.inl
@@ -95,14 +95,6 @@ namespace Realm {
     //
     // struct OperationTimeLineGPU
     //
-    inline void OperationTimelineGPU::record_start_time(void)
-    {
-      start_time = Clock::current_time_in_nanoseconds();
-    }
-    inline void OperationTimelineGPU::record_end_time(void)
-    {
-      end_time = Clock::current_time_in_nanoseconds();
-    }
     inline bool OperationTimelineGPU::is_valid(void) const
     {
       return ((start_time != INVALID_TIMESTAMP) &&

diff --git a/test/realm/CMakeLists.txt b/test/realm/CMakeLists.txt
@@ -84,6 +84,7 @@ if(Legion_USE_HIP)
   set(HIPSRC_memspeed memspeed_gpu.cu)
   set(HIPSRC_simple_reduce simple_reduce_gpu.cu)
   set(HIPSRC_multiaffine multiaffine_gpu.cu)
+  set(HIPSRC_test_profiling test_profiling_gpu.cu)
 
   # FIXME: https://github.com/StanfordLegion/legion/issues/1308
   # list(APPEND REALM_TESTS cuda_arrays)

diff --git a/test/realm/Makefile b/test/realm/Makefile
@@ -203,12 +203,14 @@ ifeq ($(strip $(USE_CUDA)),1)
 EXTRAOBJS_memspeed := memspeed_gpu.o
 EXTRAOBJS_simple_reduce := simple_reduce_gpu.o
 EXTRAOBJS_multiaffine := multiaffine_gpu.o
+EXTRAOBJS_test_profiling := test_profiling_gpu.o
 EXTRAOBJS_task_stream := task_stream_gpu.o
 EXTRAOBJS_test_cuhook := test_cuhook_gpu.o
-TEST_OBJS += $(EXTRAOBJS_memspeed) $(EXTRAOBJS_simple_reduce) $(EXTRAOBJS_multiaffine) $(EXTRAOBJS_task_stream) $(EXTRAOBJS_test_cuhook)
+TEST_OBJS += $(EXTRAOBJS_memspeed) $(EXTRAOBJS_simple_reduce) $(EXTRAOBJS_multiaffine) $(EXTRAOBJS_test_profiling) $(EXTRAOBJS_task_stream) $(EXTRAOBJS_test_cuhook)
 memspeed : memspeed_gpu.o
 simple_reduce : simple_reduce_gpu.o
 multiaffine : multiaffine_gpu.o
+test_profiling : test_profiling_gpu.o
 task_stream : task_stream_gpu.o
 test_cuhook : test_cuhook_gpu.o
 
@@ -238,11 +240,13 @@ ifeq ($(strip $(USE_HIP)),1)
 EXTRAOBJS_memspeed := memspeed_gpu.o
 EXTRAOBJS_simple_reduce := simple_reduce_gpu.o
 EXTRAOBJS_multiaffine := multiaffine_gpu.o
+EXTRAOBJS_test_profiling := test_profiling_gpu.o
 EXTRAOBJS_task_stream := task_stream_gpu.o
-TEST_OBJS += $(EXTRAOBJS_memspeed) $(EXTRAOBJS_simple_reduce) $(EXTRAOBJS_multiaffine) $(EXTRAOBJS_task_stream)
+TEST_OBJS += $(EXTRAOBJS_memspeed) $(EXTRAOBJS_simple_reduce) $(EXTRAOBJS_multiaffine) $(EXTRAOBJS_test_profiling) $(EXTRAOBJS_task_stream)
 memspeed : memspeed_gpu.o
 simple_reduce : simple_reduce_gpu.o
 multiaffine : multiaffine_gpu.o
+test_profiling : test_profiling_gpu.o
 task_stream : task_stream_gpu.o
 
 %.o : %.cu $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER)

diff --git a/test/realm/test_profiling.cc b/test/realm/test_profiling.cc
@@ -8,6 +8,7 @@
 
 #include "realm.h"
 #include "realm/cuda/cuda_module.h"
+#include "realm/hip/hip_module.h"
 #include "realm/profiling.h"
 
 #include "osdep.h"
@@ -22,6 +23,10 @@ Logger log_app("app");
 extern void launch_spin_kernel(uint64_t t_ns, CUstream);
 #endif
 
+#ifdef REALM_USE_HIP
+extern void launch_spin_kernel(uint64_t t_ns, unifiedHipStream_t *);
+#endif
+
 // Task IDs, some IDs are reserved so start at first available number
 enum {
   TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE+0,
@@ -98,6 +103,14 @@ void child_task(const void *args, size_t arglen, const void *userdata, size_t us
   }
 #endif // REALM_USE_CUDA
 
+#ifdef REALM_USE_HIP
+  Realm::Hip::HipModule *module =
+      Realm::Runtime::get_runtime().get_module<Realm::Hip::HipModule>("hip");
+  if(module != nullptr) {
+    launch_spin_kernel(10000, module->get_task_hip_stream());
+  }
+#endif // REALM_USE_HIP
+
 #ifdef REALM_USE_EXCEPTIONS
   bool inject_fault = *(const bool *)args;
   if(inject_fault) {

diff --git a/test/realm/test_profiling_gpu.cu b/test/realm/test_profiling_gpu.cu
@@ -1,3 +1,5 @@
+#include "realm.h"
+#ifdef REALM_USE_CUDA
 #include <cuda_runtime.h>
 #include <cuda.h>
 #include <cstdint>
@@ -16,4 +18,32 @@ void launch_spin_kernel(uint64_t t_ns, CUstream stream)
   void *args[] = {&t_ns};
   cudaError_t err = cudaLaunchKernel(reinterpret_cast<void *>(spin_kernel), dim3(1),
                                      dim3(1), args, 0, static_cast<cudaStream_t>(stream));
-}
+}
+#endif
+
+#ifdef REALM_USE_HIP
+#include <hip/hip_runtime.h>
+
+__global__ void spin_kernel(uint64_t t_ns)
+{
+  uint64_t start = clock64();
+  uint64_t current = start;
+
+  const uint64_t clock_frequency = 1000000; // MHz -> ns
+  uint64_t target_cycles = (t_ns * clock_frequency) / 1000000000;
+
+  while ((current - start) < target_cycles) {
+    current = clock64();
+  }
+}
+
+void launch_spin_kernel(uint64_t t_ns, hipStream_t stream)
+{
+  void *args[] = {&t_ns};
+  hipError_t err = hipLaunchKernel(reinterpret_cast<void*>(spin_kernel), dim3(1), 
+                                   dim3(1), args, 0, stream);
+  if (err != hipSuccess) {
+    printf("Error launching spin kernel: %s\n", hipGetErrorString(err));
+  }
+}
+#endif