diff --git a/CMakeLists.txt b/CMakeLists.txt index f305e11..e6d9379 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project (RGD) # Define version information set(RGD_MAJOR_VERSION 1) -set(RGD_MINOR_VERSION 2) +set(RGD_MINOR_VERSION 3) set(RGD_PATCH_NUMBER 0) if (NOT RGD_BUILD_NUMBER) set(RGD_BUILD_NUMBER 0) diff --git a/RGD_RELEASE_NOTES.txt b/RGD_RELEASE_NOTES.txt index 7e351d9..2339952 100644 --- a/RGD_RELEASE_NOTES.txt +++ b/RGD_RELEASE_NOTES.txt @@ -1,27 +1,16 @@ -Radeon™ GPU Detective v1.2 Release Notes -======================================= +Radeon™ GPU Detective v1.3 Release Notes +======================================== Radeon GPU Detective (RGD) is a tool for post-mortem analysis of GPU crashes (TDRs). Using the tool you can capture and analyze AMD GPU crash dumps and produce information that can help narrow down the search for the crash's root cause. Such information includes page fault details, resource details and execution markers reflecting the GPU work that was in progress at the moments leading to the crash Highlights ========== -This release improves the default execution markers which are baked into the AMD drivers and provide additional information even without inserting user markers. Examples of the information added: +This release adds support for Driver Experiments, a powerful new feature that lets you change the behavior and performance characteristics of your application without modifying its source code or its configuration and can be useful in debugging crashes. -* Index, vertex and instance counts for draw calls -* Thread group count for compute dispatches -* Improved default raytracing and mesh shader markers -* Barriers -* Queue type (Direct for graphics, Compute for compute) +If an AMD GPU crash dump (.rgd file) was captured while Driver Experiments were active, the crash dump records which experiments were active, and RGD presents that list in its output as part of the Driver Info section. 
-Example output (with a single "Frame 429" user marker): - -Command Buffer ID: 0x883 (Queue type: Direct) -============================================= -[>] "Frame 429" - ├─[X] Draw(VertextCount=3, InstanceCount=1) - ├─[X] ----------Barrier---------- - └─[>] Dispatch(ThreadGroupCount=[16,16,1]) +For more details about the Driver Experiments feature see the "RGD documentation" in the documentation subfolder of this repository. Explicit exclusions =================== @@ -33,19 +22,17 @@ Known Issues message saying: "Summary generation failed” and clicking on "Show error" will display a text description that ends with "execution marker information missing [UMD]". As a workaround, restart RDP. A fix for this issue will be included in an upcoming driver update - there is no need to update the tool. * In certain cases, trying to capture a GPU crash dump of an app that has Microsoft® DRED enabled can lead to a system crash. -* In Radeon Developer Panel (RDP), it may happen that generated .rgd crash dump files appear with a wrong file size of 0 bytes. * Attempting to capture GPU crash dumps on a system with a Ryzen CPU that includes integrated graphics (with no connected discrete Radeon GPU) may result in a BSOD. -* A system reboot is recommended after the driver installation. An invalid crash dump file may get generated when RGD workflow is executed after a fresh driver installation without a system reboot. System Requirements =================== * Operating system: Windows 10 or 11. -* Latest Adrenalin Software driver (minimum version 23.12.1). +* Latest Adrenalin Software driver (minimum version 24.9.1). A system reboot is recommended after the driver installation. * GPU: Radeon™ RX 6000 series (RDNA™2) or RX 7000 series (RDNA™3) card. -* Latest RDP (Radeon Developer Panel) version, which is available as part of the Radeon Developer Tool Suite and can be downloaded from GPUOpen.com. Make sure you are using RDP v2.12.0.7 or later. 
+* Latest RDP (Radeon Developer Panel) version, which is available as part of the Radeon Developer Tool Suite and can be downloaded from GPUOpen.com. Make sure you are using RDP v3.2 or later. Note that this version of RGD supports DirectX® 12 and Vulkan® applications, so you will need either DX12 or vulkan application that crashes. For the best experience, it is recommended to: * Use string markers around render passes using the AMD GPU Services (AGS) library, as these will appear in the command line tool's output and will help identifying the code that was executing during the crash. diff --git a/documentation/source/conf.py b/documentation/source/conf.py index 350b17d..ee10188 100644 --- a/documentation/source/conf.py +++ b/documentation/source/conf.py @@ -20,6 +20,8 @@ #import os #import sys + + # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -53,9 +55,9 @@ # built documents. # # The short X.Y version. -version = u'1.2' +version = u'1.3' # The full version, including alpha/beta/rc tags. -release = u'1.2' +release = u'1.3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/documentation/source/help_manual.rst b/documentation/source/help_manual.rst index 5802384..38326d6 100644 --- a/documentation/source/help_manual.rst +++ b/documentation/source/help_manual.rst @@ -75,9 +75,36 @@ This section is titled ``SYSTEM INFO`` and includes information about the system * **Operating system** information * **Graphics driver** information +* List of active Driver Experiments * Details about the installed **CPUs** * Details about the installed **GPUs** +Driver Experiments for Crash Analysis +""""""""""""""""""""""""""""""""""""" + +RGD v1.3 supports a powerful new feature called **Driver Experiments** which lets you toggle certain driver features and optimizations that can change the behavior of your application without modifying its source code or configuration. This is done using Driver Experiments that control the low-level behavior of the Radeon Adrenalin driver. These experiments control features like raytracing or mesh shader support, compiler optimizations and more and can be useful in debugging GPU crashes. + +AMD GPU crash dumps (.rgd files) record the list of Driver Experiments that were active during the crash analysis session, so that you always have an accurate picture of the driver configuration with which your app crashed. RGD's crash analysis output summary text file will display the list of Driver Experiments that were active as part of the System Info section. This information will also be available in RGD's machine-readable JSON output file. +For more details about this feature, please refer to the :ref:`quickstart-guide`. + +For a detailed description of each supported experiment, please refer to the Driver Experiments section of the `RDP documentation `_. + +Here is an example of active Driver Experiments:: + + =========== + SYSTEM INFO + =========== + + Driver info + =========== + ... + Experiments : total of 4 Driver Experiments were active while capturing the AMD GPU crash dump: + 1. 
Disable sampler feedback support + 2. Disable raytracing support + 3. Disable variable rate shading + 4. Hull shader wave size: Force 32 threads per wave + + Markers in progress """"""""""""""""""" @@ -172,7 +199,7 @@ The tree structure and contents are also configurable through the RDP options (o Note that RGD will collapse nodes which have all of their subnodes in finished state to remove noise and improve the tree's readability. -.. image:: images/image2024-06-19-advanced-options.png +.. image:: images/rgd-advanced-options.png Page fault summary """""""""""""""""" @@ -282,7 +309,7 @@ Let's elaborate: but a different other type of problem, e.g. a shader hang due to timeout (too long execution) or an infinite loop. -Scope of v1.2 +Scope of v1.3 ------------- RGD is designed to capture **GPU crashes** on Windows. If a GPU fault (such as memory page fault or infinite loop in a shader) causes the GPU driver to not respond to the OS for some pre-determined time period (the default on Windows is 2 seconds), the OS will detect that and attempt to restart or remove the device. This mechanism is also known as "TDR" (Timeout Detection and Recovery) and is what we @@ -300,7 +327,6 @@ Please use CPU debugging mechanisms like Microsoft Visual Studio to investigate Rendering code which **incorrectly uses D3D12 or Vulkan** may also fail purely on the CPU and not reach the graphics driver or the GPU. Therefore, such crashes are not captured by RGD. They usually result in ``DXGI_ERROR_INVALID_CALL`` error code returned, and are usually detected by the D3D12 Debug Layer. - .. note:: When debugging a problem in any D3D12 application, first **enable the D3D12 Debug Layer** and @@ -331,6 +357,9 @@ Usage tips for RGD * In Vulkan, the old device extension VK_EXT_debug_marker is also supported by RGD, but it is now deprecated in favor of the VK_EXT_debug_utils instance extension. 
+* **Try Crash Analysis with Driver Experiments**: If you suspect that certain optimizations or features enabled by the driver might be causing the crash, + you can try to disable them using Driver Experiments. This can help you narrow down the search for the cause of the crash. + Known issues and workarounds ---------------------------- diff --git a/documentation/source/images/driver-experiments-select-api.png b/documentation/source/images/driver-experiments-select-api.png new file mode 100644 index 0000000..48a1626 Binary files /dev/null and b/documentation/source/images/driver-experiments-select-api.png differ diff --git a/documentation/source/images/driver-experiments-select-experiment.png b/documentation/source/images/driver-experiments-select-experiment.png new file mode 100644 index 0000000..77a169d Binary files /dev/null and b/documentation/source/images/driver-experiments-select-experiment.png differ diff --git a/documentation/source/images/image2024-enable-ca.png b/documentation/source/images/enable-crash-analysis.png similarity index 100% rename from documentation/source/images/image2024-enable-ca.png rename to documentation/source/images/enable-crash-analysis.png diff --git a/documentation/source/images/enable-driver-experiments.png b/documentation/source/images/enable-driver-experiments.png new file mode 100644 index 0000000..37d984a Binary files /dev/null and b/documentation/source/images/enable-driver-experiments.png differ diff --git a/documentation/source/images/image2024-open-text-summary.png b/documentation/source/images/open-text-summary.png similarity index 100% rename from documentation/source/images/image2024-open-text-summary.png rename to documentation/source/images/open-text-summary.png diff --git a/documentation/source/images/image2024-06-19-advanced-options.png b/documentation/source/images/rgd-advanced-options.png similarity index 100% rename from documentation/source/images/image2024-06-19-advanced-options.png rename to 
documentation/source/images/rgd-advanced-options.png diff --git a/documentation/source/images/image2024-select-text.png b/documentation/source/images/select-text-output-format.png similarity index 100% rename from documentation/source/images/image2024-select-text.png rename to documentation/source/images/select-text-output-format.png diff --git a/documentation/source/index.rst b/documentation/source/index.rst index 9799962..a9f488a 100644 --- a/documentation/source/index.rst +++ b/documentation/source/index.rst @@ -18,7 +18,7 @@ This guide will get you up and running with RGD, a tool for post-mortem GPU cras .. note:: Review these requirements to make sure that this tool is relevant for your use case: - * RGD v1.1 supports **DirectX12** and **Vulkan**. + * RGD v1.3 supports **DirectX12** and **Vulkan**. * **Windows 10 or 11**. * **RDNA™2** (RX 6000 series) **or RDNA™3** (RX 7000 series) card. * Must **TDR** (we don't catch it if there is no TDR). @@ -33,19 +33,36 @@ Capture GPU crash dump 1. Before you start, if you ever changed the TdrLevel registry setting, make sure it is set to TdrLevelRecover(3). 2. Run RDP GUI app (RadeonDeveloperPanel.exe). -3. Under CAPTURE -> "Available features", enable "Crash Analysis". +3. Under CAPTURE -> "Available features", enable "Crash Analysis". -.. image:: images/image2024-enable-ca.png +.. image:: images/enable-crash-analysis.png 4. Under the "Crash Analysis" tab, make sure that the Text checkbox is checked for the automatic crash summary generation. -.. image:: images/image2024-select-text.png +.. image:: images/select-text-output-format.png 5. Run the crashing app and reproduce the TDR. .. note:: You can always generate the text or JSON summary files from an .rgd file after has been captured. This can be done either by right-clicking the .rgd file entry in RDP and using the context menu or by invoking the rgd command line tool directly (run ``rgd -h`` to see the help manual). 
+Capture GPU crash dump with Driver Experiments enabled +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Under CAPTURE -> "Available features", enable "Driver Experiments". + +.. image:: images/enable-driver-experiments.png + +2. Under the "Driver Experiments" tab, select the API you want to enable the experiments for (DirectX12 or Vulkan). + +.. image:: images/driver-experiments-select-api.png + +3. Under the "Driver Experiments" tab, enable/select the experiments you want to activate. + +.. image:: images/driver-experiments-select-experiment.png + +4. Follow the steps in the previous section to capture the GPU crash dump. + Crash analysis ^^^^^^^^^^^^^^ @@ -55,7 +72,7 @@ RGD doesn't offer a GUI tool to open these files. Instead, you can convert them to a report in text or JSON format directly from RDP. To do it, right-click and select “Open text summary”: -.. image:: images/image2024-open-text-summary.png +.. image:: images/open-text-summary.png This will open the .txt crash analysis file which includes information that can help narrow down the search for the crash's root cause:: @@ -112,4 +129,4 @@ of Sale. AMD, the AMD Arrow logo, Radeon, Ryzen, CrossFire, RDNA and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. -© 2023 Advanced Micro Devices, Inc. All rights reserved. \ No newline at end of file +© 2024 Advanced Micro Devices, Inc. All rights reserved. 
\ No newline at end of file diff --git a/samples/sample_crash_dump.rgd b/samples/sample_crash_dump.rgd index b64b64e..a4ed4fc 100644 Binary files a/samples/sample_crash_dump.rgd and b/samples/sample_crash_dump.rgd differ diff --git a/source/radeon_gpu_detective_backend/rgd_data_types.h b/source/radeon_gpu_detective_backend/rgd_data_types.h index 35000f2..a2e3bd6 100644 --- a/source/radeon_gpu_detective_backend/rgd_data_types.h +++ b/source/radeon_gpu_detective_backend/rgd_data_types.h @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief global data types. @@ -13,6 +13,9 @@ #include #include +// JSON. +#include "json/single_include/nlohmann/json.hpp" + // RDF. #include "rdf/rdf/inc/amdrdf.h" @@ -61,6 +64,24 @@ static const std::unordered_set kBarrierMarkerStrings = { kBarrierS static const char* kChunkIdTraceProcessInfo = "TraceProcessInfo"; static const uint32_t kChunkMaxSupportedVersionTraceProcessInfo = 1; +static const char* kChunkIdDriverOverrides = "DriverOverrides"; + +// DriverOverrides chunk version constants. +static const uint32_t kChunkMaxSupportedVersionDriverOverrides = 3; + +// DriverOverrides chunk JSON element name constants. 
+static const char* kJsonElemComponentsDriverOverridesChunk = "Components"; +static const char* kJsonElemComponentDriverOverridesChunk = "Component"; +static const char* kJsonElemStructuresDriverOverridesChunk = "Structures"; +static const char* kJsonElemExperimentsDriverOverridesChunk = "Experiments"; +static const char* kJsonElemSettingNameDriverOverridesChunk = "SettingName"; +static const char* kJsonElemUserOverrideDriverOverridesChunk = "UserOverride"; +static const char* kJsonElemWasSupportedDriverOverridesChunk = "Supported"; +static const char* kJsonElemCurrentDriverOverridesChunk = "Current"; +static const char* kJsonElemIsDriverExperimentsDriverOverridesChunk = "IsDriverExperiments"; +static const char* kErrorMsgInvalidDriverOverridesJson = "invalid DriverOverrides JSON"; +static const char* kErrorMsgFailedToParseDriverExperimentsInfo = "failed to parse Driver Experiments info"; + // Represents the execution status of an execution marker. // A marker can be in a one of 3 states: // 1. Hasn't started executing @@ -184,6 +205,9 @@ struct RgdCrashDumpContents TraceProcessInfo crashing_app_process_info; // Mapping between command buffer ID and the indices for umd_crash_data.events array of its relevant execution marker events. std::unordered_map> cmd_buffer_mapping; + + // Driver Experiments JSON + nlohmann::json driver_experiments_json; }; #endif // RADEON_GPU_DETECTIVE_SOURCE_RGD_DATA_TYPES_H_ diff --git a/source/radeon_gpu_detective_backend/rgd_exec_marker_tree_serializer.cpp b/source/radeon_gpu_detective_backend/rgd_exec_marker_tree_serializer.cpp index e8100a8..a7ad7ef 100644 --- a/source/radeon_gpu_detective_backend/rgd_exec_marker_tree_serializer.cpp +++ b/source/radeon_gpu_detective_backend/rgd_exec_marker_tree_serializer.cpp @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. 
All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief execution marker tree serialization. diff --git a/source/radeon_gpu_detective_backend/rgd_exec_marker_tree_serializer.h b/source/radeon_gpu_detective_backend/rgd_exec_marker_tree_serializer.h index e630dcc..c68bedc 100644 --- a/source/radeon_gpu_detective_backend/rgd_exec_marker_tree_serializer.h +++ b/source/radeon_gpu_detective_backend/rgd_exec_marker_tree_serializer.h @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief execution marker tree serialization. diff --git a/source/radeon_gpu_detective_backend/rgd_marker_data_serializer.cpp b/source/radeon_gpu_detective_backend/rgd_marker_data_serializer.cpp index 8017430..038336d 100644 --- a/source/radeon_gpu_detective_backend/rgd_marker_data_serializer.cpp +++ b/source/radeon_gpu_detective_backend/rgd_marker_data_serializer.cpp @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief execution marker serialization. diff --git a/source/radeon_gpu_detective_backend/rgd_marker_data_serializer.h b/source/radeon_gpu_detective_backend/rgd_marker_data_serializer.h index 9cd8010..26757d9 100644 --- a/source/radeon_gpu_detective_backend/rgd_marker_data_serializer.h +++ b/source/radeon_gpu_detective_backend/rgd_marker_data_serializer.h @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. 
All rights reserved.
 /// @author AMD Developer Tools Team
 /// @file
 /// @brief execution marker serialization.
diff --git a/source/radeon_gpu_detective_backend/rgd_parsing_utils.cpp b/source/radeon_gpu_detective_backend/rgd_parsing_utils.cpp
index 28d671e..fa57262 100644
--- a/source/radeon_gpu_detective_backend/rgd_parsing_utils.cpp
+++ b/source/radeon_gpu_detective_backend/rgd_parsing_utils.cpp
@@ -1,5 +1,5 @@
 //=============================================================================
-// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 /// @author AMD Developer Tools Team
 /// @file
 /// @brief utilities for parsing raw data.
@@ -613,4 +613,59 @@ bool RgdParsingUtils::ParseTraceProcessInfoChunk(rdf::ChunkFile& chunk_file, con
     ret = error_txt.str().empty();
 
     return ret;
-}
\ No newline at end of file
+}
+
+// Extracts the Driver Experiments information stored in the DriverOverrides chunk of the given chunk file.
+// On success, the parsed chunk payload is stored in driver_experiments_json and true is returned.
+// On any failure (chunk missing, unsupported chunk version, empty payload, or a payload that fails to
+// parse as JSON) an error message is printed and false is returned.
+bool RgdParsingUtils::ParseDriverOverridesChunk(rdf::ChunkFile& chunk_file, const char* chunk_identifier, nlohmann::json& driver_experiments_json)
+{
+    const char* kFailurePrefix = "failed to extract the list of enabled Driver Experiments";
+    const int64_t kFirstChunk = 0;
+    std::stringstream failure_txt;
+
+    // Files captured with RDP 3.0 and before do not contain a DriverOverrides chunk.
+    const int64_t num_chunks = chunk_file.GetChunkCount(chunk_identifier);
+    if (num_chunks <= 0)
+    {
+        failure_txt << kFailurePrefix << " (Driver Experiments information missing [" << kChunkIdDriverOverrides << "])";
+    }
+    else if (const uint32_t version = chunk_file.GetChunkVersion(chunk_identifier); version > kChunkMaxSupportedVersionDriverOverrides)
+    {
+        failure_txt << kFailurePrefix << " (unsupported chunk version: " << version << " [" << kChunkIdDriverOverrides << "])";
+    }
+    else
+    {
+        // A single DriverOverrides chunk is expected, hence only the first chunk is read.
+        assert(num_chunks == 1);
+        const uint64_t num_payload_bytes = chunk_file.GetChunkDataSize(chunk_identifier, kFirstChunk);
+        if (num_payload_bytes == 0)
+        {
+            failure_txt << kFailurePrefix << " (invalid chunk payload size [" << kChunkIdDriverOverrides << "])";
+        }
+        else
+        {
+            // Read the raw chunk payload and parse it as JSON text.
+            std::string payload(num_payload_bytes, '\0');
+            chunk_file.ReadChunkDataToBuffer(chunk_identifier, kFirstChunk, payload.data());
+            try
+            {
+                driver_experiments_json = nlohmann::json::parse(payload.data());
+            }
+            catch (const std::exception& e)
+            {
+                failure_txt << kFailurePrefix << " (" << e.what() << ")";
+            }
+        }
+    }
+
+    // At most one failure is recorded above; report it (if any) exactly once.
+    const bool is_parsed = failure_txt.str().empty();
+    if (!is_parsed)
+    {
+        RgdUtils::PrintMessage(failure_txt.str().c_str(), RgdMessageType::kError, true);
+    }
+
+    return is_parsed;
+}
diff --git a/source/radeon_gpu_detective_backend/rgd_parsing_utils.h b/source/radeon_gpu_detective_backend/rgd_parsing_utils.h
index e6a8b54..f4739fa 100644
--- a/source/radeon_gpu_detective_backend/rgd_parsing_utils.h
+++ b/source/radeon_gpu_detective_backend/rgd_parsing_utils.h
@@ -1,5 +1,5 @@
 //=============================================================================
-// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 /// @author AMD Developer Tools Team
 /// @file
 /// @brief utilities for parsing raw data.
@@ -12,6 +12,9 @@
 #include
 #include
 
+// JSON.
+#include "json/single_include/nlohmann/json.hpp"
+
 // RDF.
 #include "rdf/rdf/inc/amdrdf.h"
 
@@ -44,6 +47,9 @@ class RgdParsingUtils
     // Parses a TraceProcessInfo chunk.
static bool ParseTraceProcessInfoChunk(rdf::ChunkFile& chunk_file, const char* chunk_identifier, TraceProcessInfo& process_info); + // Parse a 'DriverOverrides' chunk from the given chunk file. + static bool ParseDriverOverridesChunk(rdf::ChunkFile& chunk_file, const char* chunk_identifier, nlohmann::json& driver_experiments_json); + private: RgdParsingUtils() = delete; ~RgdParsingUtils() = delete; diff --git a/source/radeon_gpu_detective_backend/rgd_resource_info_serializer.cpp b/source/radeon_gpu_detective_backend/rgd_resource_info_serializer.cpp index 0711cca..b5572ec 100644 --- a/source/radeon_gpu_detective_backend/rgd_resource_info_serializer.cpp +++ b/source/radeon_gpu_detective_backend/rgd_resource_info_serializer.cpp @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief serializer for memory resource information. diff --git a/source/radeon_gpu_detective_backend/rgd_resource_info_serializer.h b/source/radeon_gpu_detective_backend/rgd_resource_info_serializer.h index 1ffa95a..aa01200 100644 --- a/source/radeon_gpu_detective_backend/rgd_resource_info_serializer.h +++ b/source/radeon_gpu_detective_backend/rgd_resource_info_serializer.h @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief serializer for memory resource information. 
diff --git a/source/radeon_gpu_detective_backend/rgd_serializer.cpp b/source/radeon_gpu_detective_backend/rgd_serializer.cpp
index 3c2d74f..8b58591 100644
--- a/source/radeon_gpu_detective_backend/rgd_serializer.cpp
+++ b/source/radeon_gpu_detective_backend/rgd_serializer.cpp
@@ -1,5 +1,5 @@
 //=============================================================================
-// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 /// @author AMD Developer Tools Team
 /// @file
 /// @brief serializer different data elements.
@@ -35,7 +35,108 @@ static double GetValueInHz(double val, ClockSpeedUnit val_unit = ClockSpeedUnit:
     return result;
 }
 
-bool RgdSerializer::ToString(const Config& user_config, const system_info_utils::SystemInfo& system_info, std::string& system_info_txt)
+// Builds the "Experiments" entry of the driver info text from the DriverOverrides chunk JSON.
+// A supported experiment is listed by name when its "UserOverride" and "Current" values are both
+// boolean true, or as "<name>: <current value>" when at least one of the two is not a boolean.
+// If the expected top-level elements are absent, or a lookup/conversion throws, an error message
+// is printed and only the experiments collected up to that point (if any) are reported.
+static std::string GetDriverExperimentsString(const nlohmann::json& driver_experiments_json)
+{
+    std::stringstream txt;
+    size_t active_experiments_count = 0;
+
+    try
+    {
+        if (driver_experiments_json.find(kJsonElemComponentsDriverOverridesChunk) != driver_experiments_json.end()
+            && driver_experiments_json[kJsonElemComponentsDriverOverridesChunk].is_array()
+            && driver_experiments_json.find(kJsonElemIsDriverExperimentsDriverOverridesChunk) != driver_experiments_json.end()
+            && driver_experiments_json[kJsonElemIsDriverExperimentsDriverOverridesChunk].is_boolean())
+        {
+            const bool is_driver_experiments = driver_experiments_json[kJsonElemIsDriverExperimentsDriverOverridesChunk].get<bool>();
+
+            if (is_driver_experiments)
+            {
+                const nlohmann::json& components_json = driver_experiments_json[kJsonElemComponentsDriverOverridesChunk];
+
+                for (const auto& component : components_json)
+                {
+                    // Process "Experiments" component.
+                    // Elements are looked up with at() rather than operator[]: on a const JSON object,
+                    // operator[] has undefined behavior for a missing key, while at() throws an exception
+                    // that is reported by the catch block below.
+                    if (component.at(kJsonElemComponentDriverOverridesChunk) == kJsonElemExperimentsDriverOverridesChunk)
+                    {
+                        const nlohmann::json& structures = component.at(kJsonElemStructuresDriverOverridesChunk);
+                        for (auto it = structures.begin(); it != structures.end(); ++it)
+                        {
+                            const nlohmann::json& experiments = it.value();
+                            for (const auto& experiment : experiments)
+                            {
+                                // Check if the experiment was supported by the driver at the time of the crash.
+                                const nlohmann::json& supported = experiment.at(kJsonElemWasSupportedDriverOverridesChunk);
+                                const bool is_supported_experiment = (supported.is_boolean() && supported.get<bool>());
+                                if (is_supported_experiment)
+                                {
+                                    const nlohmann::json& user_override = experiment.at(kJsonElemUserOverrideDriverOverridesChunk);
+                                    const nlohmann::json& current = experiment.at(kJsonElemCurrentDriverOverridesChunk);
+                                    if (user_override.is_boolean() && current.is_boolean())
+                                    {
+                                        // Experiment is active only when both user override and current values are true.
+                                        // "Current" is the value in the driver at the time of the crash.
+                                        if (user_override.get<bool>() && current.get<bool>())
+                                        {
+                                            txt << "\t" << ++active_experiments_count << ". "
+                                                << experiment.at(kJsonElemSettingNameDriverOverridesChunk).get<std::string>() << std::endl;
+                                        }
+                                    }
+                                    else
+                                    {
+                                        txt << "\t" << ++active_experiments_count << ". "
+                                            << experiment.at(kJsonElemSettingNameDriverOverridesChunk).get<std::string>() << ": "
+                                            << current.get<std::string>() << std::endl;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            assert(false);
+            RgdUtils::PrintMessage(kErrorMsgInvalidDriverOverridesJson, RgdMessageType::kError, true);
+        }
+    }
+    catch (const nlohmann::json::exception& e)
+    {
+        assert(false);
+        std::stringstream error_msg;
+        error_msg << kErrorMsgFailedToParseDriverExperimentsInfo << " (" << e.what() << ")";
+        RgdUtils::PrintMessage(error_msg.str().c_str(), RgdMessageType::kError, true);
+    }
+
+    const char* kDriverExperimentsSectionStr = "Experiments : ";
+    const char* kDriverExperimentsActiveMsgPart1 = "total of ";
+    const char* kDriverExperimentsActiveMsgPart2 = " Driver Experiments were active while capturing the AMD GPU crash dump:";
+    const char* kDriverExperimentsNotActiveMsg = "no driver experiments were enabled.";
+
+    std::stringstream driver_experiments_txt;
+    if (active_experiments_count > 0)
+    {
+        driver_experiments_txt << kDriverExperimentsSectionStr << kDriverExperimentsActiveMsgPart1 << active_experiments_count << kDriverExperimentsActiveMsgPart2
+                               << std::endl;
+        driver_experiments_txt << txt.str();
+    }
+    else
+    {
+        driver_experiments_txt << kDriverExperimentsSectionStr << kDriverExperimentsNotActiveMsg << std::endl;
+    }
+
+    return driver_experiments_txt.str();
+}
+
+bool RgdSerializer::ToString(const Config& user_config, const system_info_utils::SystemInfo& system_info, const nlohmann::json& driver_experiments_json, std::string& system_info_txt)
 {
     bool ret = true;
     std::stringstream txt;
@@ -56,6 +157,7 @@ bool RgdSerializer::ToString(const Config& user_config, const system_info_utils:
     txt << "Driver packaging version: " << system_info.driver.packaging_version << std::endl;
     txt << "Driver software version: " << system_info.driver.software_version << std::endl;
     txt << "Dev driver version: " << system_info.devdriver.tag << std::endl;
+    txt <<
GetDriverExperimentsString(driver_experiments_json); txt << std::endl; // Operating system info. diff --git a/source/radeon_gpu_detective_backend/rgd_serializer.h b/source/radeon_gpu_detective_backend/rgd_serializer.h index c66382f..2087610 100644 --- a/source/radeon_gpu_detective_backend/rgd_serializer.h +++ b/source/radeon_gpu_detective_backend/rgd_serializer.h @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief serializer different data elements. @@ -23,7 +23,7 @@ class RgdSerializer public: // Serializes the given SystemInfo structure into a string. // Returns true if the file exists and false otherwise. - static bool ToString(const Config& user_config, const system_info_utils::SystemInfo& system_info, std::string& system_info_txt); + static bool ToString(const Config& user_config, const system_info_utils::SystemInfo& system_info, const nlohmann::json& driver_experiments_json, std::string& system_info_txt); // Serialize the input parameters information into a string. static void InputInfoToString(const Config& user_config, diff --git a/source/radeon_gpu_detective_backend/rgd_serializer_json.cpp b/source/radeon_gpu_detective_backend/rgd_serializer_json.cpp index 68e3d24..6a62b9f 100644 --- a/source/radeon_gpu_detective_backend/rgd_serializer_json.cpp +++ b/source/radeon_gpu_detective_backend/rgd_serializer_json.cpp @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief serializer to JSON format. 
@@ -18,6 +18,8 @@ // *** INTERNALLY-LINKED AUXILIARY CONSTANTS - BEGIN *** static const char* kJsonElemTimestampElement = "timestamp"; +static const char* kJsonElemSystemInfo = "system_info"; +static const char* kJsonElemDriverInfo = "driver_info"; // *** INTERNALLY-LINKED AUXILIARY CONSTANTS - ENDS *** @@ -55,36 +57,36 @@ void RgdSerializerJson::SetSystemInfoData(const Config& user_config, const syste if (user_config.is_extended_sysinfo) { // Version. - json_["system_info"]["system_info_version"]["major"] = system_info.version.major; - json_["system_info"]["system_info_version"]["minor"] = system_info.version.minor; - json_["system_info"]["system_info_version"]["patch"] = system_info.version.patch; - json_["system_info"]["system_info_version"]["build"] = system_info.version.build; + json_[kJsonElemSystemInfo]["system_info_version"]["major"] = system_info.version.major; + json_[kJsonElemSystemInfo]["system_info_version"]["minor"] = system_info.version.minor; + json_[kJsonElemSystemInfo]["system_info_version"]["patch"] = system_info.version.patch; + json_[kJsonElemSystemInfo]["system_info_version"]["build"] = system_info.version.build; } // Driver info. - json_["system_info"]["driver_info"]["packaging_version"] = system_info.driver.packaging_version; - json_["system_info"]["driver_info"]["software_version"] = system_info.driver.software_version; - json_["system_info"]["driver_info"]["dev_driver_version"] = system_info.devdriver.tag; + json_[kJsonElemSystemInfo][kJsonElemDriverInfo]["packaging_version"] = system_info.driver.packaging_version; + json_[kJsonElemSystemInfo][kJsonElemDriverInfo]["software_version"] = system_info.driver.software_version; + json_[kJsonElemSystemInfo][kJsonElemDriverInfo]["dev_driver_version"] = system_info.devdriver.tag; // Operating system info. 
- json_["system_info"]["os"]["name"] = system_info.os.name; - json_["system_info"]["os"]["description"] = system_info.os.desc; - json_["system_info"]["os"]["hostname"] = system_info.os.hostname; - json_["system_info"]["os"]["memory"] = nlohmann::json::array(); - json_["system_info"]["os"]["memory"].push_back({ + json_[kJsonElemSystemInfo]["os"]["name"] = system_info.os.name; + json_[kJsonElemSystemInfo]["os"]["description"] = system_info.os.desc; + json_[kJsonElemSystemInfo]["os"]["hostname"] = system_info.os.hostname; + json_[kJsonElemSystemInfo]["os"]["memory"] = nlohmann::json::array(); + json_[kJsonElemSystemInfo]["os"]["memory"].push_back({ {"physical_bytes", system_info.os.memory.physical}, {"swap_bytes", system_info.os.memory.swap} }); // CPU info. - json_["system_info"]["cpu"] = nlohmann::json::array(); + json_[kJsonElemSystemInfo]["cpu"] = nlohmann::json::array(); for (uint32_t i = 0; i < system_info.cpus.size(); i++) { std::string cpu_name; RgdUtils::TrimLeadingAndTrailingWhitespace(system_info.cpus[i].name, cpu_name); if (user_config.is_extended_sysinfo) { - json_["system_info"]["cpu"].push_back({ + json_[kJsonElemSystemInfo]["cpu"].push_back({ {"name", cpu_name }, {"architecture", system_info.cpus[i].architecture }, {"cpu_id", system_info.cpus[i].cpu_id }, @@ -98,7 +100,7 @@ void RgdSerializerJson::SetSystemInfoData(const Config& user_config, const syste } else { - json_["system_info"]["cpu"].push_back({ + json_[kJsonElemSystemInfo]["cpu"].push_back({ {"name", cpu_name }, {"architecture", system_info.cpus[i].architecture }, {"cpu_id", system_info.cpus[i].cpu_id }, @@ -108,7 +110,7 @@ void RgdSerializerJson::SetSystemInfoData(const Config& user_config, const syste } // GPU info. - json_["system_info"]["gpu"] = nlohmann::json::array(); + json_[kJsonElemSystemInfo]["gpu"] = nlohmann::json::array(); for (uint32_t g = 0; g < system_info.gpus.size(); g++) { // Memory heaps. 
@@ -149,7 +151,7 @@ void RgdSerializerJson::SetSystemInfoData(const Config& user_config, const syste }); } - json_["system_info"]["gpu"].push_back({ + json_[kJsonElemSystemInfo]["gpu"].push_back({ {"name", system_info.gpus[g].name }, {"engine_clock_max_hz", system_info.gpus[g].asic.engine_clock_hz.max }, {"engine_clock_min_hz", system_info.gpus[g].asic.engine_clock_hz.min }, @@ -174,7 +176,7 @@ void RgdSerializerJson::SetSystemInfoData(const Config& user_config, const syste } else { - json_["system_info"]["gpu"].push_back({ + json_[kJsonElemSystemInfo]["gpu"].push_back({ {"name", system_info.gpus[g].name }, {"device_id", system_info.gpus[g].asic.id_info.device }, {"device_revision_id", system_info.gpus[g].asic.id_info.e_rev }, @@ -381,6 +383,92 @@ void RgdSerializerJson::SetExecutionMarkerSummaryList(const Config& user_config, } } +void RgdSerializerJson::SetDriverExperimentsInfoData(const nlohmann::json& driver_experiments_json) +{ + const char* kJsonElemExperimentsRgdOutputJson = "experiments"; + const char* kJsonElemSettingNameRgdOutputJson = "setting_name"; + const char* kJsonElemUserOverrideRgdOutputJson = "user_override"; + std::stringstream txt; + + // Add "experiments" array under the system_info -> driver_info. 
+ json_[kJsonElemSystemInfo][kJsonElemDriverInfo][kJsonElemExperimentsRgdOutputJson] = nlohmann::json::array(); + + try + { + if (driver_experiments_json.find(kJsonElemComponentsDriverOverridesChunk) != driver_experiments_json.end() && + driver_experiments_json[kJsonElemComponentsDriverOverridesChunk].is_array() && + driver_experiments_json.find(kJsonElemIsDriverExperimentsDriverOverridesChunk) != driver_experiments_json.end() && + driver_experiments_json[kJsonElemIsDriverExperimentsDriverOverridesChunk].is_boolean()) + { + const bool is_driver_experiments = driver_experiments_json[kJsonElemIsDriverExperimentsDriverOverridesChunk].get(); + + if (is_driver_experiments) + { + const nlohmann::json& components_json = driver_experiments_json[kJsonElemComponentsDriverOverridesChunk]; + + for (const auto& component : components_json) + { + // Process "Experiments" component. + if (component[kJsonElemComponentDriverOverridesChunk] == kJsonElemExperimentsDriverOverridesChunk) + { + const nlohmann::json& structures = component[kJsonElemStructuresDriverOverridesChunk]; + size_t exp_seq_no = 1; + for (auto it = structures.begin(); it != structures.end(); ++it) + { + const nlohmann::json& experiments = it.value(); + for (const auto& experiment : experiments) + { + // Check if the experiment was supported by the driver at the time of the crash. + bool is_supported_experiment = (experiment[kJsonElemWasSupportedDriverOverridesChunk].is_boolean() && + experiment[kJsonElemWasSupportedDriverOverridesChunk]); + if (is_supported_experiment) + { + if (experiment[kJsonElemUserOverrideDriverOverridesChunk].is_boolean() && + experiment[kJsonElemCurrentDriverOverridesChunk].is_boolean()) + { + // The user override value. + bool is_user_override = experiment[kJsonElemUserOverrideDriverOverridesChunk].get(); + + // The value in the driver at the time of the crash. 
+ bool is_current = experiment[kJsonElemCurrentDriverOverridesChunk].get(); + + if (is_user_override && is_current) + { + // Experiment is active only when both user override and current values are true. + json_[kJsonElemSystemInfo][kJsonElemDriverInfo][kJsonElemExperimentsRgdOutputJson].push_back( + {{kJsonElemSettingNameRgdOutputJson, experiment[kJsonElemSettingNameDriverOverridesChunk]}, + {kJsonElemUserOverrideRgdOutputJson, experiment[kJsonElemCurrentDriverOverridesChunk]}}); + } + } + else + { + json_[kJsonElemSystemInfo][kJsonElemDriverInfo][kJsonElemExperimentsRgdOutputJson].push_back( + {{kJsonElemSettingNameRgdOutputJson, experiment[kJsonElemSettingNameDriverOverridesChunk]}, + {kJsonElemUserOverrideRgdOutputJson, experiment[kJsonElemCurrentDriverOverridesChunk]}}); + } + } + } + } + } + } + } + } + else + { + assert(false); + RgdUtils::PrintMessage(kErrorMsgInvalidDriverOverridesJson, RgdMessageType::kError, true); + } + } + catch (nlohmann::json::exception e) + { + assert(false); + std::stringstream error_msg; + error_msg << kErrorMsgFailedToParseDriverExperimentsInfo << " (" << e.what() << ")"; + RgdUtils::PrintMessage(error_msg.str().c_str(), RgdMessageType::kError, true); + } + +} + bool RgdSerializerJson::SaveToFile(const Config& user_config) const { std::string contents; diff --git a/source/radeon_gpu_detective_backend/rgd_serializer_json.h b/source/radeon_gpu_detective_backend/rgd_serializer_json.h index e5eda92..5191228 100644 --- a/source/radeon_gpu_detective_backend/rgd_serializer_json.h +++ b/source/radeon_gpu_detective_backend/rgd_serializer_json.h @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief serializer to JSON format. 
@@ -24,7 +24,7 @@ // JSON Schema version #define STRINGIFY_JSON_SCHEMA_VERSION(major, minor) STRINGIFY_MACRO(major) "." STRINGIFY_MACRO(minor) #define RGD_JSON_SCHEMA_VERSION_MAJOR 1 -#define RGD_JSON_SCHEMA_VERSION_MINOR 0 +#define RGD_JSON_SCHEMA_VERSION_MINOR 1 #define RGD_JSON_SCHEMA_VERSION STRINGIFY_JSON_SCHEMA_VERSION(RGD_JSON_SCHEMA_VERSION_MAJOR, RGD_JSON_SCHEMA_VERSION_MINOR) // *** INTERNALLY-LINKED AUXILIARY CONSTANTS - BEGIN *** @@ -69,6 +69,9 @@ class RgdSerializerJson const std::unordered_map >& cmd_buffer_events, ExecMarkerDataSerializer& exec_marker_serializer); + // Set Driver Experiments info. + void SetDriverExperimentsInfoData(const nlohmann::json& driver_experiments_json); + // Saves the JSON contents to a file. bool SaveToFile(const Config& user_config) const; diff --git a/source/radeon_gpu_detective_backend/rgd_utils.cpp b/source/radeon_gpu_detective_backend/rgd_utils.cpp index e9f6d1b..2fda445 100644 --- a/source/radeon_gpu_detective_backend/rgd_utils.cpp +++ b/source/radeon_gpu_detective_backend/rgd_utils.cpp @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief general utilities. diff --git a/source/radeon_gpu_detective_backend/rgd_utils.h b/source/radeon_gpu_detective_backend/rgd_utils.h index 143b8e3..db2f431 100644 --- a/source/radeon_gpu_detective_backend/rgd_utils.h +++ b/source/radeon_gpu_detective_backend/rgd_utils.h @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief general utilities. 
diff --git a/source/radeon_gpu_detective_cli/main.cpp b/source/radeon_gpu_detective_cli/main.cpp index 94c69d2..5afd1e0 100644 --- a/source/radeon_gpu_detective_cli/main.cpp +++ b/source/radeon_gpu_detective_cli/main.cpp @@ -1,5 +1,5 @@ //============================================================================= -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. /// @author AMD Developer Tools Team /// @file /// @brief main entry point of RGD CLI. @@ -102,6 +102,7 @@ static bool ParseCrashDump(const Config& user_config, RgdCrashDumpContents& cont std::string error_msg; bool ret = false; bool is_system_info_parsed = false; + bool is_driveroverrides_parsed = false; try { rdf::ChunkFile chunk_file = rdf::ChunkFile(file); @@ -118,6 +119,8 @@ static bool ParseCrashDump(const Config& user_config, RgdCrashDumpContents& cont // Parse TraceProcessInfo chunk. RgdParsingUtils::ParseTraceProcessInfoChunk(chunk_file, kChunkIdTraceProcessInfo, contents.crashing_app_process_info); + + is_driveroverrides_parsed = RgdParsingUtils::ParseDriverOverridesChunk(chunk_file, kChunkIdDriverOverrides, contents.driver_experiments_json); } catch (const std::exception& e) { @@ -161,6 +164,16 @@ static bool ParseCrashDump(const Config& user_config, RgdCrashDumpContents& cont std::cerr << "ERROR: failed to parse system information contents in crash dump file." << std::endl; } + assert(is_driveroverrides_parsed); + if (is_driveroverrides_parsed) + { + RgdUtils::PrintMessage("driver experiments information parsed successfully.", RgdMessageType::kInfo, user_config.is_verbose); + } + else + { + std::cerr << "ERROR: failed to parse DriverOverrides chunk in crash dump file." << std::endl; + } + // Done parsing the file here. 
file.Close(); @@ -187,7 +200,7 @@ static void SerializeTextOutput(const RgdCrashDumpContents& contents, const Conf txt << input_info_str; std::string system_info_str; - RgdSerializer::ToString(user_config, contents.system_info, system_info_str); + RgdSerializer::ToString(user_config, contents.system_info, contents.driver_experiments_json, system_info_str); txt << system_info_str << std::endl; std::cout << "Generating text representation of the execution marker information..." << std::endl; @@ -408,6 +421,7 @@ static bool PerformCrashAnalysis(const Config& user_config) serializer_json.SetInputInfo(user_config, contents.crashing_app_process_info, contents.system_info, contents.api_info); serializer_json.SetSystemInfoData(user_config, contents.system_info); + serializer_json.SetDriverExperimentsInfoData(contents.driver_experiments_json); ExecMarkerDataSerializer exec_marker_serializer;