diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/dyad_torch_data_loader.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/dyad_torch_data_loader.py index 21f652a..a42e765 100644 --- a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/dyad_torch_data_loader.py +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/dyad_torch_data_loader.py @@ -138,12 +138,10 @@ def read(self): prefetch_factor = math.ceil(self._args.prefetch_size / self._args.read_threads) else: prefetch_factor = self._args.prefetch_size - if prefetch_factor > 0: - if self._args.my_rank == 0: - else: + if prefetch_factor <= 0: prefetch_factor = 2 if self._args.my_rank == 0: - logging.debug(f"{utcnow()} Setup dataloader with {self._args.read_threads} workers {torch.__version__}") + logging.debug(f"{utcnow()} Setup dataloader with {self._args.read_threads} workers {torch.__version__}") if self._args.read_threads==0: kwargs={} else: diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_dlio.ipynb b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_dlio.ipynb index 8d5f583..f562697 100644 --- a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_dlio.ipynb +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_dlio.ipynb @@ -9,7 +9,7 @@ "source": [ "# Using DYAD to accelerate distributed Deep Learning (DL) training\n", "\n", - "Now that we have seen how Flux enables the management and deployment of services, let's look at an example of using DYAD, an advanced Flux service for runtime data movement, in a real world application. Specifically, we will show how DYAD speeds up distributed Deep Learning (DL) training. In this module, we cover these topics:\n", + "Now that we have seen how Flux enables the management and deployment of services, let's look at an example of using DYAD, an advanced Flux service for runtime data movement, in a real-world application. Specifically, we will show how DYAD speeds up distributed Deep Learning (DL) training. In this module, we cover these topics:\n", "1. Design of DYAD\n", "2. Distributed DL training\n", "3. Deep Learning I/O (DLIO) benchmark\n", @@ -17,12 +17,12 @@ "\n", "## Design of DYAD\n", "\n", - "DYAD provides transparent, locality-aware, write-once, read-many file caching that runs on top of local NVMe and other burst buffer-style technologies (e.g., El Capitan Rabbit nodes). Figure X shows the components of DYAD, including the DYAD service (implemented as a Flux broker module), the DYAD client, and DYAD's data transport layer. DYAD uses the Flux KVS to store metadata about tracked files, and it uses Flux's remote proceedure call capabilities to communicate between client and service. DYAD also uses UCX to perform RDMA-based data transfer to move files.\n", + "DYAD provides transparent, locality-aware, write-once, read-many file caching that runs on top of local NVMe and other burst buffer-style technologies (e.g., El Capitan Rabbit nodes). Figure 1 shows the components of DYAD, including the DYAD service (implemented as a Flux broker module), the DYAD client, and DYAD's data transport layer. DYAD uses the Flux KVS to store metadata about tracked files, and it uses Flux's remote procedure call capabilities to communicate between client and service. DYAD also uses UCX to perform RDMA-based data transfer to move files.\n", "\n", "
\n", "\n", "
\n", - "Image created by Ian Lumsden for a poster at SC'23
\n", + "Figure 1: Image created by Ian Lumsden for a poster at SC'23\n", "
\n", "\n", "DYAD is designed to accelerate large, distributed workloads, such as distributed Deep Learning (DL) training and scientific computing workflows, on HPC systems. It is also designed be transparent, which allows users to leverage DYAD with little to no code refactoring. Unlike similar tools (e.g., DataSpaces and UnifyFS), which tend to optimize for write performance, DYAD aims to provide good write **and read** performance. To optimize read performance, DYAD uses a locality-aware \"Hierarchical Data Locator,\" which prioritizes node-local metadata and data retrieval to minimize the amount of network communications. When moving data from another node, DYAD also uses a streaming RPC over RDMA protocol, which uses preallocated buffers and connection caching to maximize network bandwidth. This process is shown in the figure below:\n", @@ -30,7 +30,7 @@ "
\n", "\n", "
\n", - "Image created by Hari Devarajan for a paper submitted to SC'24
\n", + "Figure 2: Image created by Hari Devarajan for a paper submitted to SC'24\n", "
" ] }, @@ -41,20 +41,20 @@ "source": [ "## Distributed DL Training\n", "\n", - "Distributed DL training is an approach to speed up the training of large Deep Learning models by performing multiple epochs of training in parallel across multiple GPUs and, oftentimes, multiple nodes. This approach is supported by most major DL libraries, such as PyTorch and Tensorflow. In this module, we focus on PyTorch. When running training across multiple nodes and GPUs, PyTorch starts by spawning one process per GPU, called the worker. Each worker performs three major tasks:\n", + "Distributed DL training is an approach to speed up the training of large Deep Learning models by performing multiple epochs of training in parallel across multiple GPUs and, oftentimes, multiple nodes. Most major DL libraries, such as PyTorch and TensorFlow support this approach. In this module, we focus on PyTorch. When running training across multiple nodes and GPUs, PyTorch starts by spawning one process per GPU, called the worker. Each worker performs three major tasks:\n", "1. Determining which samples from the dataset will comprise the batch for the next epoch of training (i.e., epoch *N+1*)\n", "2. Reading these samples from the filesystem\n", "3. Building a batch from these samples and moving the batch to the GPU\n", "\n", - "To assist with reading the samples from the filesystem, each worker also spawns additional I/O processes. Each of these processes reads data and, optionally, transforms the data based on the configuration of the training pipeline. Figure X shows this process for a single GPU, a single worker, and a single spawned I/O process. In this figure, \"I/O\" indicates data being read from the filesystem, and \"Map\" indicates the optional transformation of data. \"Batch\" indicates the building of a batch from the read samples.\n", + "To assist with reading the samples from the filesystem, each worker also spawns additional I/O processes. Each of these processes reads data and, optionally, transforms the data based on the configuration of the training pipeline. Figure 3 shows this process for a single GPU, a single worker, and a single spawned I/O process. In this figure, \"I/O\" indicates data being read from the filesystem, and \"Map\" indicates the optional transformation of data. \"Batch\" indicates the building of a batch from the read samples.\n", "\n", "
\n", "\n", "
\n", - "Image created by Ian Lumsden based on an image from this article
\n", + "Figure 3: Image created by Ian Lumsden based on an image from this article\n", "
\n", "\n", - "One key difference between distributed DL training and many conventional HPC applications (e.g., MPI-based simulations) is the asynchronous loading of data by workers during training. In many conventional HPC applications, data loading and computation are performed one after the one. On the other hand, as shown in Figure X, the loading of data in distributed DL training is asynchronous. In other words, while the GPU is training the DL model for epoch *N*, the worker reading and creating the batch for epoch *N+1*. This asynchronous loading of data can lead to imbalance between data loading and training. For example, Figure X shows a scenario where the data loading takes longer than training, resulting in idle time on the GPU, wasted resources, and, overall, an I/O bound application.\n", + "One key difference between distributed DL training and many conventional HPC applications (e.g., MPI-based simulations) is the asynchronous loading of data by workers during training. In many conventional HPC applications, data loading and computation are performed one after the other. On the other hand, as shown in Figure 3, the loading of data in distributed DL training is asynchronous. In other words, while the GPU is training the DL model for epoch *N*, the worker is reading and creating the batch for epoch *N+1*. This asynchronous loading of data can lead to an imbalance between data loading and training. For example, Figure 3 shows a scenario where the data loading takes longer than training, resulting in idle time on the GPU, wasted resources, and, overall, an I/O bound application.\n", "\n", "At the end of each epoch of training, all workers and GPUs are synchronized so that the DL models from each GPU can be merged together. This synchronization and merging usually consists of an allreduce-style operation. This synchronization makes the effects of any imbalance between data loading and training more pronounced because, if even one worker and GPU become imbalanced, the performance of the entire distributed training will suffer." ] @@ -112,7 +112,7 @@ "from pygments.formatters import HtmlFormatter\n", "from IPython.display import display, HTML\n", "\n", - "sys.path.insert(0, os.path.abspath(\"../dlio_extensions/\"))\n", + "sys.path.insert(0, os.path.abspath(\"dlio_extensions/\"))\n", "\n", "from dyad_torch_data_loader import DYADTorchDataset" ] @@ -226,7 +226,7 @@ "workers_per_node = 1\n", "dyad_install_prefix = \"/usr/local\"\n", "num_nodes = 2\n", - "dlio_extensions_dir = \"/home/jovyan/flux-tutorial-2024/dlio_extensions\"\n", + "dlio_extensions_dir = \"/home/jovyan/supplementary/dyad/dlio_extensions\"\n", "workload = \"dyad_unet3d_demo\"" ] }, @@ -465,16 +465,16 @@ "
\n", "\n", "
\n", - "
\n", + "Figure 4: DYAD improves the epoch time of Unet3D by up to 10.82x due to locality-aware caching as compared to UnifyFS.\n", "
\n", "\n", - "Figure X shows the performance of Lustre, [UnifyFS](https://ieeexplore.ieee.org/document/10177390), and DYAD in terms of runtime and I/O bandwidth for the full version of the 3D U-Net training. As explained on the [webpage for the KiTS19 Challenge](https://kits19.grand-challenge.org/), the dataset for the full version of this application consists of 10,240, NPZ-formatted image files, resulting in a total dataset size of 1.36 TB. Within each epoch of PyTorch-based training, the model processes batches of 4 images using 6 I/O processes per GPU. The model trains for 20 epochs without checkpointing. The model scales from 8 to 64 nodes of LLNL's [Corona](https://hpc.llnl.gov/hardware/compute-platforms/corona) supercomputer, with 8 GPUs per node.\n", + "Figure 4 shows the performance of Lustre, [UnifyFS](https://ieeexplore.ieee.org/document/10177390), and DYAD in terms of runtime and I/O bandwidth for the full version of the 3D U-Net training. As explained on the [webpage for the KiTS19 Challenge](https://kits19.grand-challenge.org/), the dataset for the full version of this application consists of 10,240, NPZ-formatted image files, resulting in a total dataset size of 1.36 TB. Within each epoch of PyTorch-based training, the model processes batches of 4 images using 6 I/O processes per GPU. The model trains for 20 epochs without checkpointing. The model scales from 8 to 64 nodes of LLNL's [Corona](https://hpc.llnl.gov/hardware/compute-platforms/corona) supercomputer, with 8 GPUs per node.\n", "\n", - "In the leftmost plot of Figure X, we show the runtime of the training for Lustre, UnifyFS, and DYAD at 8, 16, 32, and 64 nodes. This plot shows that DYAD provides significant runtime improvement compared to Lustre and UnifyFS for the 3D U-Net, mainly due to locality optimizations. DYAD runs up to 7.5 times faster than Lustre and 1.88 times faster than UnifyFS, with less performance variability due to DYAD's use of node-local storage.\n", + "In the leftmost plot of Figure 4, we show the runtime of the training for Lustre, UnifyFS, and DYAD at 8, 16, 32, and 64 nodes. This plot shows that DYAD provides significant runtime improvement compared to Lustre and UnifyFS for the 3D U-Net, mainly due to locality optimizations. DYAD runs up to 7.5 times faster than Lustre and 1.88 times faster than UnifyFS, with less performance variability due to DYAD's use of node-local storage.\n", "\n", - "In the middle plot of Figure X, we show the bandwidth per epoch of training across 512 GPUs (64 nodes). Because DYAD's capabilities allow for on-the-fly caching of data, its performance starts similar to that of Lustre. As more data is cached into DYAD, its bandwidth increases to 140 GB/s due to DYAD's streaming RPC over RDMA protocol. Finally, as even more data is cached, DYAD's bandwidth reaches 1409 GB/s because DYAD's locality-aware caching allows almost all sample reads to be performed directly on node-local NVMe. In comparison, both Lustre and Unify maintain consistent bandwidths well under those of DYAD. By the 20th epoch, DYAD speeds up training by 10.62 times compared to UnifyFS.\n", + "In the middle plot of Figure 4, we show the bandwidth per epoch of training across 512 GPUs (64 nodes). Because DYAD's capabilities allow for on-the-fly caching of data, its performance starts similar to that of Lustre. As more data is cached into DYAD, its bandwidth increases to 140 GB/s due to DYAD's streaming RPC over RDMA protocol. 
Finally, as even more data is cached, DYAD's bandwidth reaches 1409 GB/s because DYAD's locality-aware caching allows almost all sample reads to be performed directly on node-local NVMe. In comparison, both Lustre and Unify maintain consistent bandwidths well under those of DYAD. By the 20th epoch, DYAD speeds up training by 10.62 times compared to UnifyFS.\n", "\n", - "Finally, in the rightmost plot of Figure X, we show how often DYAD retrieved data from node-local storage versus retrieving data from storage on a remote node in terms of percentage of data access requests. Initially, DYAD mostly performs remote requests. As training continues, more and more data is replicated with DYAD's locality-aware caching, resulting in a larger percentage of local requests. By epoch 13, almost all data is accessed through local requests. This transition from mostly remote requests to mostly local requests corresponds with the increase in bandwidth shown in the middle plot of Figure X." + "Finally, in the rightmost plot of Figure 4, we show how often DYAD retrieved data from node-local storage versus retrieving data from storage on a remote node in terms of the percentage of data access requests. Initially, DYAD mostly performs remote requests. As training continues, more and more data is replicated with DYAD's locality-aware caching, resulting in a larger percentage of local requests. By epoch 13, almost all data is accessed through local requests. This transition from mostly remote requests to mostly local requests corresponds with the increase in bandwidth shown in the middle plot of Figure 4." ] }, { @@ -482,7 +482,7 @@ "id": "81d7d87f-1e09-42c8-b165-8902551f6847", "metadata": {}, "source": [ - "# This concludes Module 3.\n", + "# This concludes Suppliment Module 1.\n", "\n", "In this module, we covered:\n", "1. Design of DYAD\n", @@ -492,6 +492,14 @@ "\n", "To continue with the tutorial, open [Module 4](./04_flux_tutorial_conclusions.ipynb)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37fc415f-7972-4f44-ac1e-f3d1258345be", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -510,7 +518,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dl-training-io.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dl-training-io.png new file mode 100755 index 0000000..6129d0e Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dl-training-io.png differ