[Feature] Enable activation checkpoint offloading (#722)

* Add act offloading option
* Add changelog
* Turn off offloading when ac is not enabled
* formatting

---------

Co-authored-by: Mohammad Amin Nabian <[email protected]>
NVIDIA · Nov 25, 2024 · a5d3b5b · a5d3b5b
1 parent 7f739f7
commit a5d3b5b
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - The XAeroNet model.
 - Incorporated CorrDiff-GEFS-HRRR model into CorrDiff, with lead-time aware SongUNet and
   cross entropy loss.
+- Option to offload MeshGraphNet activation checkpoints to the CPU to further reduce GPU memory usage
 - Added StormCast model training and simple inference to examples
 
 ### Changed

diff --git a/modulus/models/meshgraphnet/meshgraphnet.py b/modulus/models/meshgraphnet/meshgraphnet.py
@@ -14,6 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from contextlib import nullcontext
+
 import torch
 import torch.nn as nn
 from torch import Tensor
@@ -98,6 +100,8 @@ class MeshGraphNet(Module):
         Whether to replace concat+MLP with MLP+idx+sum
     num_processor_checkpoint_segments: int, optional
         Number of processor segments for gradient checkpointing, by default 0 (checkpointing disabled)
+    checkpoint_offloading: bool, optional
+        Whether to offload checkpointed activations to the CPU, by default False.
+        Ignored (treated as False) when num_processor_checkpoint_segments is 0.
 
     Example
     -------
@@ -138,6 +142,7 @@ def __init__(
         aggregation: str = "sum",
         do_concat_trick: bool = False,
         num_processor_checkpoint_segments: int = 0,
+        checkpoint_offloading: bool = False,
         recompute_activation: bool = False,
     ):
         super().__init__(meta=MetaData())
@@ -184,6 +189,7 @@ def __init__(
             activation_fn=activation_fn,
             do_concat_trick=do_concat_trick,
             num_processor_checkpoint_segments=num_processor_checkpoint_segments,
+            checkpoint_offloading=checkpoint_offloading,
         )
 
     def forward(
@@ -215,10 +221,14 @@ def __init__(
         activation_fn: nn.Module = nn.ReLU(),
         do_concat_trick: bool = False,
         num_processor_checkpoint_segments: int = 0,
+        checkpoint_offloading: bool = False,
     ):
         super().__init__()
         self.processor_size = processor_size
         self.num_processor_checkpoint_segments = num_processor_checkpoint_segments
+        self.checkpoint_offloading = (
+            checkpoint_offloading if (num_processor_checkpoint_segments > 0) else False
+        )
 
         edge_block_invars = (
             input_dim_node,
@@ -254,6 +264,23 @@ def __init__(
         self.processor_layers = nn.ModuleList(layers)
         self.num_processor_layers = len(self.processor_layers)
         self.set_checkpoint_segments(self.num_processor_checkpoint_segments)
+        self.set_checkpoint_offload_ctx(self.checkpoint_offloading)
+
+    def set_checkpoint_offload_ctx(self, enabled: bool):
+        """
+        Set the context for CPU offloading of checkpoints
+
+        Parameters
+        ----------
+        enabled : bool
+            Whether to offload checkpointed activations to the CPU
+        """
+        if enabled:
+            self.checkpoint_offload_ctx = torch.autograd.graph.save_on_cpu(
+                pin_memory=True
+            )
+        else:
+            self.checkpoint_offload_ctx = nullcontext()
 
     def set_checkpoint_segments(self, checkpoint_segments: int):
         """
@@ -326,14 +353,15 @@ def forward(
         edge_features: Tensor,
         graph: Union[DGLGraph, List[DGLGraph], CuGraphCSC],
     ) -> Tensor:
-        for segment_start, segment_end in self.checkpoint_segments:
-            edge_features, node_features = self.checkpoint_fn(
-                self.run_function(segment_start, segment_end),
-                node_features,
-                edge_features,
-                graph,
-                use_reentrant=False,
-                preserve_rng_state=False,
-            )
+        with self.checkpoint_offload_ctx:
+            for segment_start, segment_end in self.checkpoint_segments:
+                edge_features, node_features = self.checkpoint_fn(
+                    self.run_function(segment_start, segment_end),
+                    node_features,
+                    edge_features,
+                    graph,
+                    use_reentrant=False,
+                    preserve_rng_state=False,
+                )
 
         return node_features