diff --git a/examples/weather/dlwp_healpix/README.md b/examples/weather/dlwp_healpix/README.md index dbbf64ed90..f773ef5d39 100644 --- a/examples/weather/dlwp_healpix/README.md +++ b/examples/weather/dlwp_healpix/README.md @@ -5,7 +5,8 @@ This example is an implementation of the model. The DLWP model can be used to predict the state of the atmosphere given a previous atmospheric state. You can infer a 320-member ensemble set of six-week forecasts at 1.4° resolution within a couple of minutes, demonstrating the potential of AI in developing -near real-time digital twins for weather prediction +near real-time digital twins for weather prediction. This example also contains an +implementation of the coupled Ocean-Atmosphere DLWP model. ## Problem overview @@ -13,4 +14,28 @@ The goal is to train an AI model that can emulate the state of the atmosphere an global weather over a certain time span. The Deep Learning Weather Prediction (DLWP) model uses deep CNNs for globally gridded weather prediction. DLWP CNNs directly map u(t) to its future state u(t+Δt) by learning from historical observations of the weather, -with Δt set to 6 hr +with Δt set to 6 hr. The Deep Learning Ocean Model (DLOM) that is designed to couple with +deep learning weather prediction (DLWP) model. The DLOM forecasts sea surface +temperature (SST). DLOMs use deep learning techniques as in DLWP models but are +configured with different architectures and slower time stepping. DLOMs and DLWP models +are trained to learn atmosphere-ocean coupling. + +## Getting Started + +To train the DLWP HEALPix model, run + +```bash +python train.py +``` + +To train the coupled DLWP model, run + +```bash +python train.py --config-name config_hpx32_coupled_dlwp +``` + +To train the coupled DLOM model, run + +```bash +python train.py --config-name config_hpx32_coupled_dlom +``` diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/hpx_rec_unet.yaml b/examples/weather/dlwp_healpix/config_hpx32_coupled_dlom.yaml similarity index 58% rename from examples/weather/dlwp_healpix_coupled/configs/model/hpx_rec_unet.yaml rename to examples/weather/dlwp_healpix/config_hpx32_coupled_dlom.yaml index 1bebf83a47..de324a48f7 100644 --- a/examples/weather/dlwp_healpix_coupled/configs/model/hpx_rec_unet.yaml +++ b/examples/weather/dlwp_healpix/config_hpx32_coupled_dlom.yaml @@ -15,22 +15,28 @@ # limitations under the License. defaults: - - modules/encoder@encoder: rec_unet_enc - - modules/decoder@decoder: rec_unet_dec + - _self_ + - data: era5_hpx32_dlom_sst-z1000-ws_48H-dt + - model: coupled_hpx_unet_dlom + - trainer: dlom -_target_: modulus.models.dlwp_healpix.HEALPixRecUNet -_recursive_: false -presteps: 1 -input_time_dim: ${data.input_time_dim} -output_time_dim: ${data.output_time_dim} -delta_time: ${data.time_step} +experiment_name: ${now:%Y-%m-%d}/${now:%H-%M-%S} +output_dir: outputs/${experiment_name} +# checkpoints names are in the form training-state-.mdlus +checkpoint_name: last +load_weights_only: false +seed: 0 -# Parameters automatically overridden in train code -input_channels: 7 -output_channels: 7 -n_constants: 2 -decoder_input_channels: 1 +# Training specifications +batch_size: 64 +learning_rate: 1e-4 +num_workers: 8 -# some perf parameters -enable_nhwc: false -enable_healpixpad: false +# Distributed setup (multi GPU) +port: 29450 +master_address: localhost + +hydra: + verbose: true + run: + dir: ${output_dir} diff --git a/examples/weather/dlwp_healpix_coupled/configs/config_hpx32_coupled_dlwp.yaml b/examples/weather/dlwp_healpix/config_hpx32_coupled_dlwp.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/config_hpx32_coupled_dlwp.yaml rename to examples/weather/dlwp_healpix/config_hpx32_coupled_dlwp.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/callbacks/early_stopping.yaml b/examples/weather/dlwp_healpix/configs/callbacks/early_stopping.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/callbacks/early_stopping.yaml rename to examples/weather/dlwp_healpix/configs/callbacks/early_stopping.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/config_hpx32_coupled_dlom.yaml b/examples/weather/dlwp_healpix/configs/config_hpx32_coupled_dlom.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/config_hpx32_coupled_dlom.yaml rename to examples/weather/dlwp_healpix/configs/config_hpx32_coupled_dlom.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/config.yaml b/examples/weather/dlwp_healpix/configs/config_hpx32_coupled_dlwp.yaml similarity index 92% rename from examples/weather/dlwp_healpix_coupled/configs/config.yaml rename to examples/weather/dlwp_healpix/configs/config_hpx32_coupled_dlwp.yaml index 6c9591c36d..3e2538f75d 100644 --- a/examples/weather/dlwp_healpix_coupled/configs/config.yaml +++ b/examples/weather/dlwp_healpix/configs/config_hpx32_coupled_dlwp.yaml @@ -16,9 +16,9 @@ defaults: - _self_ - - data: era5_hpx64_7var_6h_24h - - model: hpx_rec_unet - - trainer: default + - data: era5_hpx32_8var-coupled_6h_24h + - model: coupled_hpx_rec_unet + - trainer: dlwp experiment_name: ${now:%Y-%m-%d}/${now:%H-%M-%S} output_dir: outputs/${experiment_name} diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx32_8var-coupled_6h_24h.yaml b/examples/weather/dlwp_healpix/configs/data/era5_hpx32_8var-coupled_6h_24h.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx32_8var-coupled_6h_24h.yaml rename to examples/weather/dlwp_healpix/configs/data/era5_hpx32_8var-coupled_6h_24h.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx32_dlom_sst-z1000-ws_48H-dt.yaml b/examples/weather/dlwp_healpix/configs/data/era5_hpx32_dlom_sst-z1000-ws_48H-dt.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx32_dlom_sst-z1000-ws_48H-dt.yaml rename to examples/weather/dlwp_healpix/configs/data/era5_hpx32_dlom_sst-z1000-ws_48H-dt.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/module/atmos_ConstantCoupling.yaml b/examples/weather/dlwp_healpix/configs/data/module/atmos_ConstantCoupling.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/data/module/atmos_ConstantCoupling.yaml rename to examples/weather/dlwp_healpix/configs/data/module/atmos_ConstantCoupling.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/module/sst-z1000-ws.yaml b/examples/weather/dlwp_healpix/configs/data/module/sst-z1000-ws.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/data/module/sst-z1000-ws.yaml rename to examples/weather/dlwp_healpix/configs/data/module/sst-z1000-ws.yaml diff --git a/examples/weather/dlwp_healpix/configs/data/scaling/hpx32.yaml b/examples/weather/dlwp_healpix/configs/data/scaling/hpx32.yaml index 6c4b37475a..65b7fd2d98 100644 --- a/examples/weather/dlwp_healpix/configs/data/scaling/hpx32.yaml +++ b/examples/weather/dlwp_healpix/configs/data/scaling/hpx32.yaml @@ -49,4 +49,19 @@ sst: tp6: mean: 0. std: 1. +z1000-24H: + mean: 936.7376098632812 + std: 883.0859375 +ws10-24H: + mean: 6.15248966217041 + std: 3.321399688720703 +ws10: + mean: 6.1497307 + std: 3.583117 +ws10-48H: + mean: 6.081215 + std: 3.1224248 +adt: + mean: 0.40309871564035454 + std: 0.6165622319307328 diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/scaling/hpx64.yaml b/examples/weather/dlwp_healpix/configs/data/scaling/hpx64.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/data/scaling/hpx64.yaml rename to examples/weather/dlwp_healpix/configs/data/scaling/hpx64.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/splits/large_test.yaml b/examples/weather/dlwp_healpix/configs/data/splits/large_test.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/data/splits/large_test.yaml rename to examples/weather/dlwp_healpix/configs/data/splits/large_test.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/splits/large_test_1950-2022.yaml b/examples/weather/dlwp_healpix/configs/data/splits/large_test_1950-2022.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/data/splits/large_test_1950-2022.yaml rename to examples/weather/dlwp_healpix/configs/data/splits/large_test_1950-2022.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/coupled_hpx_rec_unet.yaml b/examples/weather/dlwp_healpix/configs/model/coupled_hpx_rec_unet.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/model/coupled_hpx_rec_unet.yaml rename to examples/weather/dlwp_healpix/configs/model/coupled_hpx_rec_unet.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/coupled_hpx_rec_unet_dlom.yaml b/examples/weather/dlwp_healpix/configs/model/coupled_hpx_unet_dlom.yaml similarity index 88% rename from examples/weather/dlwp_healpix_coupled/configs/model/coupled_hpx_rec_unet_dlom.yaml rename to examples/weather/dlwp_healpix/configs/model/coupled_hpx_unet_dlom.yaml index 133baaf6da..0fd797b9cd 100644 --- a/examples/weather/dlwp_healpix_coupled/configs/model/coupled_hpx_rec_unet_dlom.yaml +++ b/examples/weather/dlwp_healpix/configs/model/coupled_hpx_unet_dlom.yaml @@ -15,10 +15,10 @@ # limitations under the License. defaults: - - modules/encoder@encoder: rec_unet_enc - - modules/decoder@decoder: rec_unet_dec + - modules/encoder@encoder: unet_enc + - modules/decoder@decoder: unet_dec -_target_: modulus.models.dlwp_healpix.HEALPixRecUNet +_target_: modulus.models.dlwp_healpix.HEALPixUNet _recursive_: false presteps: 0 input_time_dim: ${data.input_time_dim} diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/symmetric_conv_next_block.yaml b/examples/weather/dlwp_healpix/configs/model/modules/blocks/symmetric_conv_next_block.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/symmetric_conv_next_block.yaml rename to examples/weather/dlwp_healpix/configs/model/modules/blocks/symmetric_conv_next_block.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/decoder/decoder_symmetric-conv_90-90-180.yaml b/examples/weather/dlwp_healpix/configs/model/modules/decoder/decoder_symmetric-conv_90-90-180.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/model/modules/decoder/decoder_symmetric-conv_90-90-180.yaml rename to examples/weather/dlwp_healpix/configs/model/modules/decoder/decoder_symmetric-conv_90-90-180.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/decoder/unet_dec.yaml b/examples/weather/dlwp_healpix/configs/model/modules/decoder/unet_dec.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/model/modules/decoder/unet_dec.yaml rename to examples/weather/dlwp_healpix/configs/model/modules/decoder/unet_dec.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/encoder/encoder_symmetric-conv_180-90-90.yaml b/examples/weather/dlwp_healpix/configs/model/modules/encoder/encoder_symmetric-conv_180-90-90.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/model/modules/encoder/encoder_symmetric-conv_180-90-90.yaml rename to examples/weather/dlwp_healpix/configs/model/modules/encoder/encoder_symmetric-conv_180-90-90.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/encoder/unet_enc.yaml b/examples/weather/dlwp_healpix/configs/model/modules/encoder/unet_enc.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/model/modules/encoder/unet_enc.yaml rename to examples/weather/dlwp_healpix/configs/model/modules/encoder/unet_enc.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/hpx32_coupled-atmos.yaml b/examples/weather/dlwp_healpix/configs/trainer/criterion/hpx32_coupled-atmos.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/hpx32_coupled-atmos.yaml rename to examples/weather/dlwp_healpix/configs/trainer/criterion/hpx32_coupled-atmos.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/hpx64_7var.yaml b/examples/weather/dlwp_healpix/configs/trainer/criterion/hpx64_7var.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/hpx64_7var.yaml rename to examples/weather/dlwp_healpix/configs/trainer/criterion/hpx64_7var.yaml diff --git a/examples/weather/dlwp_healpix/configs/trainer/criterion/ocean_mse.yaml b/examples/weather/dlwp_healpix/configs/trainer/criterion/ocean_mse.yaml index ab4cd1a1bf..eb5a908ee9 100644 --- a/examples/weather/dlwp_healpix/configs/trainer/criterion/ocean_mse.yaml +++ b/examples/weather/dlwp_healpix/configs/trainer/criterion/ocean_mse.yaml @@ -16,5 +16,5 @@ _target_: modulus.metrics.climate.healpix_loss.OceanMSE -lsm_file: /invariants/hpx32_1950-2022_3h_sst-only.zarr +lsm_file: /datasets/healpix/HPX32/hpx32_1950-2022_3h_sst_coupled.zarr diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/dlom.yaml b/examples/weather/dlwp_healpix/configs/trainer/dlom.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/trainer/dlom.yaml rename to examples/weather/dlwp_healpix/configs/trainer/dlom.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/dlwp.yaml b/examples/weather/dlwp_healpix/configs/trainer/dlwp.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/trainer/dlwp.yaml rename to examples/weather/dlwp_healpix/configs/trainer/dlwp.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/lr_scheduler/constant.yaml b/examples/weather/dlwp_healpix/configs/trainer/lr_scheduler/constant.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/trainer/lr_scheduler/constant.yaml rename to examples/weather/dlwp_healpix/configs/trainer/lr_scheduler/constant.yaml diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/lr_scheduler/plateau.yaml b/examples/weather/dlwp_healpix/configs/trainer/lr_scheduler/plateau.yaml similarity index 100% rename from examples/weather/dlwp_healpix_coupled/configs/trainer/lr_scheduler/plateau.yaml rename to examples/weather/dlwp_healpix/configs/trainer/lr_scheduler/plateau.yaml diff --git a/examples/weather/dlwp_healpix_coupled/README.md b/examples/weather/dlwp_healpix_coupled/README.md deleted file mode 100644 index 61f188bf20..0000000000 --- a/examples/weather/dlwp_healpix_coupled/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Deep Learning Weather Prediction (DLWP) model for weather forecasting - -This example is an implementation of the coupled Ocean-Atmosphere DLWP model. - -## Problem overview - -The goal is to train an AI model that can emulate the state of the atmosphere and predict -global weather over a certain time span. The Deep Learning Weather Prediction (DLWP) model -uses deep CNNs for globally gridded weather prediction. DLWP CNNs directly map u(t) to -its future state u(t+Δt) by learning from historical observations of the weather, -with Δt set to 6 hr. The Deep Learning Ocean Model (DLOM) that is designed to couple with -deep learning weather prediction (DLWP) model. The DLOM forecasts sea surface -temperature (SST). DLOMs use deep learning techniques as in DLWP models but are -configured with different architectures and slower time stepping. DLOMs and DLWP models -are trained to learn atmosphere-ocean coupling. - -## Getting Started - -To train the coupled DLWP model, run - -```bash -python train.py --config-name config_hpx32_coupled_dlwp -``` - -To train the coupled DLOM model, run - -```bash -python train.py --config-name config_hpx32_coupled_dlom -``` diff --git a/examples/weather/dlwp_healpix_coupled/configs/callbacks/learning_rate_monitor.yaml b/examples/weather/dlwp_healpix_coupled/configs/callbacks/learning_rate_monitor.yaml deleted file mode 100644 index 5133d4b0df..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/callbacks/learning_rate_monitor.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -learning_rate_monitor: - _target_: pytorch_lightning.callbacks.LearningRateMonitor - logging_interval: epoch \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/callbacks/model_checkpoint.yaml b/examples/weather/dlwp_healpix_coupled/configs/callbacks/model_checkpoint.yaml deleted file mode 100644 index cff4cce20f..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/callbacks/model_checkpoint.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -model_checkpoint: - _target_: pytorch_lightning.callbacks.ModelCheckpoint - filename: '{epoch:03d}-{val_loss:.4E}' - monitor: 'val_loss' - mode: 'min' - save_top_k: 10 - save_last: True \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/callbacks/swa.yaml b/examples/weather/dlwp_healpix_coupled/configs/callbacks/swa.yaml deleted file mode 100644 index 9b13af08be..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/callbacks/swa.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -stochastic_weight_avg: - _target_: pytorch_lightning.callbacks.StochasticWeightAveraging - swa_epoch_start: 5 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx32_7var_6h_24h.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx32_7var_6h_24h.yaml deleted file mode 100644 index 0cf59d59ab..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx32_7var_6h_24h.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - module: time_series - - scaling: classic - - splits: default - -src_directory: /datasets/healpix/HPX32 -dst_directory: /datasets/healpix/HPX32 -dataset_name: era5_hpx32_7var_6h_24h -prefix: era5_1deg_3h_HPX32_1979-2021_ -suffix: '' -data_format: classic -input_variables: - - z500 - - tau300-700 - - z1000 - - t2m0 - - tcwv0 - - t850 - - z250 -output_variables: null -constants: - land_sea_mask: lsm - topography: z -input_time_dim: 2 -output_time_dim: 4 -data_time_step: 3h -time_step: 6h -gap: 6h -add_insolation: true -nside: 32 -cube_dim: ${data.nside} -prebuilt_dataset: true diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx64_7var_6h_24h.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx64_7var_6h_24h.yaml deleted file mode 100644 index dd6f4b133a..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/era5_hpx64_7var_6h_24h.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - module: time_series - - scaling: classic - - splits: default - -src_directory: /datasets/healpix/HPX64 -dst_directory: /datasets/healpix/HPX64 -dataset_name: era5_hpx64_7var_6h_24h -prefix: era5_0.25deg_3h_HPX64_1979-2021_ -suffix: '' -data_format: classic -input_variables: - - z500 - - tau300-700 - - z1000 - - t2m0 - - tcwv0 - - t850 - - z250 -output_variables: null -constants: - land_sea_mask: lsm - topography: z -input_time_dim: 2 -output_time_dim: 4 -data_time_step: 3h -time_step: 6h -gap: 6h -add_insolation: true -nside: 64 -cube_dim: ${data.nside} -prebuilt_dataset: true diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/module/time_series.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/module/time_series.yaml deleted file mode 100644 index f2ee59b2a2..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/module/time_series.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.datapipes.healpix.data_modules.TimeSeriesDataModule -src_directory: ${data.src_directory} -dst_directory: ${data.dst_directory} -dataset_name: ${data.dataset_name} -prefix: ${data.prefix} -suffix: ${data.suffix} -data_format: ${data.data_format} -batch_size: ${batch_size} -drop_last: true -input_variables: ${data.input_variables} -output_variables: ${data.output_variables} -constants: ${data.constants} -scaling: ${data.scaling} -splits: ${data.splits} -presteps: ${model.presteps} -input_time_dim: ${data.input_time_dim} -output_time_dim: ${data.output_time_dim} -data_time_step: ${data.data_time_step} -time_step: ${data.time_step} -gap: ${data.gap} -shuffle: true -add_insolation: ${data.add_insolation} -cube_dim: ${data.cube_dim} -num_workers: ${num_workers} -pin_memory: true -prebuilt_dataset: ${data.prebuilt_dataset} \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/scaling/classic.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/scaling/classic.yaml deleted file mode 100644 index 65b86cedd0..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/scaling/classic.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -t2m0: - mean: 287.8665771484375 - std: 14.86227798461914 -t850: - mean: 281.2710266113281 - std: 12.04991626739502 -tau300-700: - mean: 61902.72265625 - std: 2559.8408203125 -tcwv0: - mean: 24.034976959228516 - std: 16.411935806274414 -z1000: - mean: 952.1435546875 - std: 895.7516479492188 -z250: - mean: 101186.28125 - std: 5551.77978515625 -z500: - mean: 55625.9609375 - std: 2681.712890625 -tp6: - mean: 0. - std: 1. - log_epsilon: 1e-6 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/scaling/hpx32.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/scaling/hpx32.yaml deleted file mode 100644 index adc3b0e03c..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/scaling/hpx32.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -t2m0: - mean: 287.8665771484375 - std: 14.86227798461914 -t850: - mean: 281.2710266113281 - std: 12.04991626739502 -tau300-700: - mean: 61902.72265625 - std: 2559.8408203125 -tcwv0: - mean: 24.034976959228516 - std: 16.411935806274414 -z1000: - mean: 952.1435546875 - std: 895.7516479492188 -z1000-48H: - mean: 934.4945 - std: 842.1188 -z250: - mean: 101186.28125 - std: 5551.77978515625 -z500: - mean: 55625.9609375 - std: 2681.712890625 -# calculated with data from 1979-2018 -sst-ti: - mean: 290.53864 - std: 10.5835 -# calculated with data from 1950-2022 -sst: - mean: 290.64487 - std: 10.5792 -tp6: - mean: 0. - std: 1. -z1000-24H: - mean: 936.7376098632812 - std: 883.0859375 -ws10-24H: - mean: 6.15248966217041 - std: 3.321399688720703 -ws10: - mean: 6.1497307 - std: 3.583117 -ws10-48H: - mean: 6.081215 - std: 3.1224248 -adt: - mean: 0.40309871564035454 - std: 0.6165622319307328 diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/scaling/zeros.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/scaling/zeros.yaml deleted file mode 100644 index 5b12f9935d..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/scaling/zeros.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -z500: - mean: 0. - std: 1. -z1000: - mean: 0. - std: 1. -tau300-700: - mean: 0. - std: 1. -t2m0: - mean: 0. - std: 1. -tcwv0: - mean: 0. - std: 1. -t850: - mean: 0. - std: 1. -z250: - mean: 0. - std: 1. -tp6: - mean: 0. - std: 1. - log_epsilon: 1e-6 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/splits/1959-1998.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/splits/1959-1998.yaml deleted file mode 100644 index 81c8129435..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/splits/1959-1998.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -train_date_start: 1959-01-01 -train_date_end: 1998-12-31T18:00 -val_date_start: 1999-01-01 -val_date_end: 2000-12-31T18:00 -test_date_start: 2017-01-01 -test_date_end: 2018-12-31T18:00 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/splits/1964-2003.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/splits/1964-2003.yaml deleted file mode 100644 index ad0e4aa0d4..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/splits/1964-2003.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -train_date_start: 1964-01-01 -train_date_end: 2003-12-31T18:00 -val_date_start: 2004-01-01 -val_date_end: 2005-12-31T18:00 -test_date_start: 2017-01-01 -test_date_end: 2018-12-31T18:00 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/data/splits/default.yaml b/examples/weather/dlwp_healpix_coupled/configs/data/splits/default.yaml deleted file mode 100644 index da8ad7eaa6..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/data/splits/default.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -train_date_start: 1979-01-01 -train_date_end: 2012-12-31T18:00 -val_date_start: 2013-01-01 -val_date_end: 2016-12-31T18:00 -test_date_start: 2017-01-01 -test_date_end: 2018-12-31T18:00 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/activations/capped_gelu.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/activations/capped_gelu.yaml deleted file mode 100644 index c27f1f3953..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/activations/capped_gelu.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.models.layers.activations.CappedGELU -cap_value: 10 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/activations/capped_leaky_relu.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/activations/capped_leaky_relu.yaml deleted file mode 100644 index 18020f4357..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/activations/capped_leaky_relu.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.models.layers.activations.CappedLeakyReLU -cap_value: 10 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/avg_pool.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/avg_pool.yaml deleted file mode 100644 index 40a07bc208..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/avg_pool.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.models.dlwp_healpix_layers.healpix_blocks.AvgPool -pooling: 2 diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/basic_conv_block.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/basic_conv_block.yaml deleted file mode 100644 index 9e57d46877..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/basic_conv_block.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - /model/modules/activations@activation: capped_gelu - -_target_: modulus.models.dlwp_healpix_layers.healpix_blocks.BasicConvBlock -_recursive_: true -in_channels: 3 -out_channels: 1 -kernel_size: 3 -dilation: 1 -n_layers: 1 diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/conv_gru_block.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/conv_gru_block.yaml deleted file mode 100644 index 64670db177..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/conv_gru_block.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.models.dlwp_healpix_layers.healpix_blocks.ConvGRUBlock -_recursive_: false -in_channels: 3 -kernel_size: 1 diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/conv_next_block.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/conv_next_block.yaml deleted file mode 100644 index 96eacea0df..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/conv_next_block.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - /model/modules/activations@activation: capped_gelu - -_target_: modulus.models.dlwp_healpix_layers.healpix_blocks.ConvNeXtBlock -_recursive_: true -in_channels: 3 -out_channels: 1 -kernel_size: 3 -dilation: 1 -upscale_factor: 4 diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/output_layer.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/output_layer.yaml deleted file mode 100644 index 001d399204..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/output_layer.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - activation: null - -_target_: modulus.models.dlwp_healpix_layers.healpix_blocks.BasicConvBlock -in_channels: 3 -out_channels: 2 -kernel_size: 1 -dilation: 1 -n_layers: 1 diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/transposed_conv_upsample.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/transposed_conv_upsample.yaml deleted file mode 100644 index 7cf1e7e330..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/blocks/transposed_conv_upsample.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - /model/modules/activations@activation: capped_gelu - -_target_: modulus.models.dlwp_healpix_layers.healpix_blocks.TransposedConvUpsample -in_channels: 3 -out_channels: 1 -upsampling: 2 diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/decoder/rec_unet_dec.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/decoder/rec_unet_dec.yaml deleted file mode 100644 index e816b7f9ca..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/decoder/rec_unet_dec.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - /model/modules/blocks@conv_block: conv_next_block - - /model/modules/blocks@up_sampling_block: transposed_conv_upsample - - /model/modules/blocks@recurrent_block: conv_gru_block - - /model/modules/blocks@output_layer: output_layer - -_target_: modulus.models.dlwp_healpix_layers.healpix_decoder.UNetDecoder -_recursive_: false -n_channels: - - 34 - - 68 - - 136 -dilations: - - 4 - - 2 - - 1 diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/encoder/rec_unet_enc.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/encoder/rec_unet_enc.yaml deleted file mode 100644 index e92a2f98ee..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/encoder/rec_unet_enc.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - /model/modules/blocks@conv_block: conv_next_block - - /model/modules/blocks@down_sampling_block: avg_pool - - /model/modules/blocks@recurrent_block: conv_gru_block - -_target_: modulus.models.dlwp_healpix_layers.healpix_encoder.UNetEncoder -_recursive_: false -n_channels: - - 136 - - 68 - - 34 -dilations: - - 1 - - 2 - - 4 \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/loss/mse.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/loss/mse.yaml deleted file mode 100644 index cee82c7ca0..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/loss/mse.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: torch.nn.MSELoss \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/model/modules/loss/mse_ssim.yaml b/examples/weather/dlwp_healpix_coupled/configs/model/modules/loss/mse_ssim.yaml deleted file mode 100644 index 00f0d87f01..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/model/modules/loss/mse_ssim.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.metrics.climate.loss.MSE_SSIM - -mse_params: -ssim_params: - window_size: 11 - time_series_forecasting: True -# variables over which to calculate SSIM-MSE weighted average loss -ssim_variables: - - ttr1h - - tcwv0 -# used to calculated the weighted average between MSE and DSSIM -weights: - - 0. - - 1. diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/mse.yaml b/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/mse.yaml deleted file mode 100644 index a6b617f3b1..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/mse.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.metrics.climate.healpix_loss.BaseMSE diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/ocean_mse.yaml b/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/ocean_mse.yaml deleted file mode 100644 index eb5a908ee9..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/ocean_mse.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.metrics.climate.healpix_loss.OceanMSE - -lsm_file: /datasets/healpix/HPX32/hpx32_1950-2022_3h_sst_coupled.zarr - diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/weighted_mse.yaml b/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/weighted_mse.yaml deleted file mode 100644 index ff9df3a269..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/trainer/criterion/weighted_mse.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: modulus.metrics.climate.healpix_loss.WeightedMSE - -weights: - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 1.0 diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/default.yaml b/examples/weather/dlwp_healpix_coupled/configs/trainer/default.yaml deleted file mode 100644 index 205501702f..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/trainer/default.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - criterion: mse - - optimizer: adam - - lr_scheduler: cosine - -_target_: trainer.Trainer -_recursive_: true -max_epochs: 300 -early_stopping_patience: null -amp_mode: "fp16" -graph_mode: "train_eval" -output_dir: ${output_dir} diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/lr_scheduler/cosine.yaml b/examples/weather/dlwp_healpix_coupled/configs/trainer/lr_scheduler/cosine.yaml deleted file mode 100644 index 20d290951c..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/trainer/lr_scheduler/cosine.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: torch.optim.lr_scheduler.CosineAnnealingLR -optimizer: ${model.optimizer} -T_max: ${trainer.max_epochs} -eta_min: 4e-5 -last_epoch: -1 -verbose: false \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/configs/trainer/optimizer/adam.yaml b/examples/weather/dlwp_healpix_coupled/configs/trainer/optimizer/adam.yaml deleted file mode 100644 index fc09755d67..0000000000 --- a/examples/weather/dlwp_healpix_coupled/configs/trainer/optimizer/adam.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_target_: torch.optim.Adam -lr: ${learning_rate} \ No newline at end of file diff --git a/examples/weather/dlwp_healpix_coupled/train.py b/examples/weather/dlwp_healpix_coupled/train.py deleted file mode 100644 index 28ab2445ee..0000000000 --- a/examples/weather/dlwp_healpix_coupled/train.py +++ /dev/null @@ -1,175 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import sys - -import hydra -import numpy as np -import torch as th -from modulus.distributed import DistributedManager -from hydra.utils import instantiate - -from modulus import Module -from modulus.launch.logging import PythonLogger, RankZeroLoggingWrapper - -from pathlib import Path - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - - -@hydra.main(config_path="./configs", config_name="config", version_base=None) -def train(cfg): - """Train DLWP HEALPix weather model using the techniques described in the - paper "Advancing Parsimonious Deep Learning Weather Prediction using the HEALPix Mesh". - """ - # Initialize distributed - DistributedManager.initialize() - dist = DistributedManager() - - # set device globally to be sure that no spurious context are created on gpu 0: - th.cuda.set_device(dist.device) - - # Initialize logger. - os.makedirs(".logs", exist_ok=True) - logger = PythonLogger(name="train") # General python logger - logger0 = RankZeroLoggingWrapper(logger, dist) - logger.file_logging(file_name=f".logs/train_{dist.rank}.log") - logger0.info(f"experiment working directory: {os.getcwd()}") - - # Seed - if cfg.seed is not None: - th.manual_seed(cfg.seed) - if th.cuda.is_available(): - th.cuda.manual_seed(cfg.seed) - np.random.seed(cfg.seed) - - # Data module - data_module = instantiate(cfg.data.module) - - # Model - input_channels = len(cfg.data.input_variables) - output_channels = ( - len(cfg.data.output_variables) - if cfg.data.output_variables is not None - else input_channels - ) - constants_arr = data_module.constants - n_constants = ( - 0 if constants_arr is None else len(constants_arr.keys()) - ) # previously was 0 but with new format it is 1 - - decoder_input_channels = int(cfg.data.get("add_insolation", 0)) - cfg.model["input_channels"] = input_channels - cfg.model["output_channels"] = output_channels - cfg.model["n_constants"] = n_constants - cfg.model["decoder_input_channels"] = decoder_input_channels - - # convert Hydra cfg to pure dicts so they can be saved using modulus - model = instantiate(cfg.model, _convert_="all") - model.batch_size = cfg.batch_size - model.learning_rate = cfg.learning_rate - - # Instantiate PyTorch modules (with state dictionaries from checkpoint if given) - criterion = instantiate(cfg.trainer.criterion) - optimizer = instantiate(cfg.trainer.optimizer, params=model.parameters()) - lr_scheduler = ( - instantiate(cfg.trainer.lr_scheduler, optimizer=optimizer) - if cfg.trainer.lr_scheduler is not None - else None - ) - - # setup startup values - epoch = 1 - val_error = th.inf - iteration = 0 - epochs_since_improved = 0 - - # Prepare training under consideration of checkpoint if given - if cfg.get("checkpoint_name", None) is not None: - checkpoint_path = Path( - cfg.get("output_dir"), - "tensorboard", - "checkpoints", - "training-state-" + cfg.get("checkpoint_name") + ".mdlus", - ) - optimizer_path = Path( - cfg.get("output_dir"), - "tensorboard", - "checkpoints", - "optimizer-state-" + cfg.get("checkpoint_name") + ".ckpt", - ) - if checkpoint_path.exists(): - logger0.info(f"Loading checkpoint: {checkpoint_path}") - model = Module.from_checkpoint(str(checkpoint_path)) - checkpoint = th.load(optimizer_path, map_location=dist.device) - if not cfg.get("load_weights_only"): - # Load optimizer - optimizer = instantiate( - cfg.trainer.optimizer, params=model.parameters() - ) - optimizer_state_dict = checkpoint["optimizer_state_dict"] - optimizer.load_state_dict(optimizer_state_dict) - # Move tensors to the appropriate device as in https://github.com/pytorch/pytorch/issues/2830 - for state in optimizer.state.values(): - for k, v in state.items(): - if th.is_tensor(v): - state[k] = v.to(device=dist.device) - # Optionally load scheduler - if cfg.trainer.lr_scheduler is not None: - lr_scheduler = instantiate( - cfg.trainer.lr_scheduler, optimizer=optimizer - ) - lr_scheduler.load_state_dict(checkpoint["scheduler_state_dict"]) - else: - lr_scheduler = None - epoch = checkpoint["epoch"] - val_error = checkpoint["val_error"] - iteration = checkpoint["iteration"] - epochs_since_improved = ( - checkpoint["epochs_since_improved"] - if "epochs_since_improved" in checkpoint.keys() - else 0 - ) - else: - logger0.info( - f"Checkpoint not found, weights not loaded. Requested path: {checkpoint_path}" - ) - - # Instantiate the trainer and fit the model - logger0.info("Model initialized") - trainer = instantiate( - cfg.trainer, - model=model, - data_module=data_module, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - device=dist.device, - ) - logger0.info(f"starting training") - trainer.fit( - epoch=epoch, - validation_error=val_error, - iteration=iteration, - epochs_since_improved=epochs_since_improved, - ) - - -if __name__ == "__main__": - train() - print("Done.") diff --git a/examples/weather/dlwp_healpix_coupled/trainer.py b/examples/weather/dlwp_healpix_coupled/trainer.py deleted file mode 100644 index ad67a1b802..0000000000 --- a/examples/weather/dlwp_healpix_coupled/trainer.py +++ /dev/null @@ -1,560 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env python3 -import gc -import os -import threading - -import torch - -# distributed stuff -from modulus.distributed import DistributedManager -from torch.cuda import amp -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm - -# custom -from utils import write_checkpoint - -from modulus.launch.logging import PythonLogger, RankZeroLoggingWrapper - - -class Trainer: - """ - A class for DLWP model training - """ - - def __init__( - self, - model: torch.nn.Module, # Specify... (import) - data_module: torch.nn.Module, # Specify... (import) - criterion: torch.nn.Module, # Specify... (import) - optimizer: torch.nn.Module, # Specify... (import) - lr_scheduler: torch.nn.Module, # Specify... (import) - max_epochs: int = 500, - early_stopping_patience: int = None, - amp_mode: str = "none", - graph_mode: str = "none", - device: torch.device = torch.device("cpu"), - output_dir: str = "/outputs/", - max_norm: float = None, - ): - """ - Constructor. - - Parameters: - model: torch.nn.Module - The model to train - data_module: torch.nn.Module - The Pytorch module used for dataloading - criterion: torch.nn.Module - The PyTorch loss module to use - optimizer: torch.nn.Module - The PyTorch optimizer module to use - lr_scheduler: torch.nn.Module - The PyTorch learning rate scheduler module to use - max_epochs: int, optional - The maximum number of epochs to train for - early_stopping_patience: int, optional - amp_mode: str, optional - amp mode to use, valid options ["fp16", "bfloat16"] - graph_mode: str, optional - Where to use cudagraphs for training, valid options ["train", "train_eval"] - device: torch.device, optional - Device on which to run training on, can be any available torch.device - output_dir: str, optional - Where to store results - max_norm: float, optional - Maximum norm to use for training - """ - self.device = device - self.amp_enable = False if (amp_mode == "none") else True - self.amp_dtype = torch.float16 if (amp_mode == "fp16") else torch.bfloat16 - self.output_variables = data_module.output_variables - self.early_stopping_patience = early_stopping_patience - self.max_norm = max_norm - - self.model = model.to(device=self.device) - - self.dist = DistributedManager() - - # Initialize logger. - self.logger = PythonLogger(name="training_loop") # General python logger - self.logger0 = RankZeroLoggingWrapper(self.logger, self.dist) - self.logger.file_logging(file_name=f".logs/training_loop_{self.dist.rank}.log") - - self.dataloader_train, self.sampler_train = data_module.train_dataloader( - num_shards=self.dist.world_size, shard_id=self.dist.rank - ) - self.dataloader_valid, self.sampler_valid = data_module.val_dataloader( - num_shards=self.dist.world_size, shard_id=self.dist.rank - ) - self.output_dir_tb = os.path.join(output_dir, "tensorboard") - - # set the other parameters - self.optimizer = optimizer - # Set up criterion, pass metadata - self.criterion = criterion.to(device=self.device) - try: - self.criterion.setup(self) - except AttributeError: - raise NotImplementedError( - 'Attribute error encountered in call to criterio.setup(). \ - Could be that criterion is not compatable with custom loss dlwp training. See \ - "modulus/metrics/climate/healpix_loss.py" for proper criterion implementation examples.' - ) - - # opportunity for custom loss classes to get everything in order - self.lr_scheduler = lr_scheduler - self.max_epochs = max_epochs - - # add gradient scaler - self.gscaler = amp.GradScaler( - enabled=(self.amp_enable and self.amp_dtype == torch.float16) - ) - - # use distributed data parallel if requested: - self.print_to_screen = True - self.train_graph = None - self.eval_graph = None - - # for status bars - self.print_to_screen = self.dist.rank == 0 - - if self.dist.device.type == "cuda": - capture_stream = torch.cuda.Stream() - if torch.distributed.is_initialized(): - with torch.cuda.stream(capture_stream): - self.model = DDP( - self.model, - device_ids=[self.device.index], - output_device=[self.device.index], - broadcast_buffers=True, - find_unused_parameters=False, - gradient_as_bucket_view=True, - ) - capture_stream.synchronize() - - # capture graph if requested - if graph_mode in ["train", "train_eval"]: - self.logger0.info("Capturing model for training ...") - # get the shapes - inp, tar = next(iter(self.dataloader_train)) - - self._train_capture(capture_stream, [x.shape for x in inp], tar.shape) - - if graph_mode == "train_eval": - self.logger0.info("Capturing model for validation ...") - self._eval_capture(capture_stream) - - # Set up tensorboard summary_writer or try 'weights and biases' - # Initialize tensorbaord to track scalars - if self.dist.rank == 0: - self.writer = SummaryWriter(log_dir=self.output_dir_tb) - - def _train_capture( - self, capture_stream, inp_shapes, tar_shape, num_warmup_steps=20 - ): - # perform graph capture of the model - self.static_inp = [ - torch.zeros(x_shape, dtype=torch.float32, device=self.device) - for x_shape in inp_shapes - ] - self.static_tar = torch.zeros( - tar_shape, dtype=torch.float32, device=self.device - ) - - self.model.train() - capture_stream.wait_stream(torch.cuda.current_stream()) - with torch.cuda.stream(capture_stream): - for _ in range(num_warmup_steps): - self.model.zero_grad(set_to_none=True) - - # FW - with amp.autocast(enabled=self.amp_enable, dtype=self.amp_dtype): - self.static_gen_train = self.model.forward(self.static_inp) - - self.static_loss_train = self.criterion( - self.static_gen_train, self.static_tar - ) - - # BW - self.gscaler.scale(self.static_loss_train).backward() - - # sync here - capture_stream.synchronize() - - gc.collect() - torch.cuda.empty_cache() - - # create graph - self.train_graph = torch.cuda.CUDAGraph() - - # zero grads before capture: - self.model.zero_grad(set_to_none=True) - - # start capture - with torch.cuda.graph(self.train_graph): - # FW - with amp.autocast(enabled=self.amp_enable, dtype=self.amp_dtype): - # self.static_gen_train = self.model(self.static_inp) - self.static_gen_train = self.model.forward(self.static_inp) - - self.static_loss_train = self.criterion( - self.static_gen_train, self.static_tar - ) - - # BW - self.gscaler.scale(self.static_loss_train).backward() - - # wait for capture to finish - torch.cuda.current_stream().wait_stream(capture_stream) - - def _eval_capture(self, capture_stream, num_warmup_steps=20): - self.model.eval() - capture_stream.wait_stream(torch.cuda.current_stream()) - with torch.cuda.stream(capture_stream): - with torch.no_grad(): - for _ in range(num_warmup_steps): - # FW - with amp.autocast(enabled=self.amp_enable, dtype=self.amp_dtype): - # self.static_gen_eval = self.model(self.static_inp) - self.static_gen_eval = self.model.forward(self.static_inp) - - self.static_loss_eval = self.criterion( - self.static_gen_eval, self.static_tar - ) - # False flag for average channels ensures criterion will keep variable loss separated - self.static_losses_eval = self.criterion( - self.static_gen_eval, - self.static_tar, - average_channels=False, - ) - - # sync here - capture_stream.synchronize() - - gc.collect() - torch.cuda.empty_cache() - - # create graph - self.eval_graph = torch.cuda.CUDAGraph() - - # start capture: - with torch.cuda.graph(self.eval_graph, pool=self.train_graph.pool()): - # FW - with torch.no_grad(): - with amp.autocast(enabled=self.amp_enable, dtype=self.amp_dtype): - # self.static_gen_eval = self.model(self.static_inp) - self.static_gen_eval = self.model.forward(self.static_inp) - - self.static_loss_eval = self.criterion( - self.static_gen_eval, self.static_tar - ) - # False flag for average channels ensures criterion will keep variable loss separated - self.static_losses_eval = self.criterion( - self.static_gen_eval, - self.static_tar, - average_channels=False, - ) - - # wait for capture to finish - torch.cuda.current_stream().wait_stream(capture_stream) - - def fit( - self, - epoch: int = 0, - validation_error: torch.Tensor = torch.inf, - iteration: int = 0, - epochs_since_improved: int = 0, - ): - """ - Perform training by iterating over all epochs - - Parameters - epoch: int, optional - Current epoch number - validation_error: torch.Tensor, optional - Current best validation error - iteration: int, optional - Current iteration number - epochs_since_improved: int, optional - Number of epochs that have seen improvement in validation error - """ - best_validation_error = validation_error - for epoch in range(epoch, self.max_epochs): - torch.cuda.nvtx.range_push(f"training epoch {epoch}") - - if self.sampler_train is not None: - self.sampler_train.set_epoch(epoch) - - # Train: iterate over all training samples - training_step = 0 - self.model.train() - for inputs, target in ( - pbar := tqdm(self.dataloader_train, disable=(not self.print_to_screen)) - ): - pbar.set_description(f"Training epoch {epoch}/{self.max_epochs}") - - # Trach epoch in tensorboard - if self.dist.rank == 0: - self.writer.add_scalar( - tag="epoch", scalar_value=epoch, global_step=iteration - ) - - torch.cuda.nvtx.range_push(f"training step {training_step}") - - inputs = [x.to(device=self.device) for x in inputs] - target = target.to(device=self.device) - - # do optimizer step - if self.train_graph is not None: - # copy data into entry nodes - for idx, inp in enumerate(inputs): - self.static_inp[idx].copy_(inp) - - self.static_tar.copy_(target) - - # replay - self.train_graph.replay() - - # extract loss - output = self.static_gen_train - train_loss = self.static_loss_train - else: - # zero grads - self.model.zero_grad(set_to_none=True) - - if self.amp_enable: - with amp.autocast( - enabled=self.amp_enable, dtype=self.amp_dtype - ): - output = self.model(inputs) - train_loss = self.criterion(output, target) - else: - output = self.model(inputs) - train_loss = self.criterion(output, target) - - self.gscaler.scale(train_loss).backward() - - # Gradient clipping - self.gscaler.unscale_(self.optimizer) - try: - curr_lr = ( - self.optimizer.param_groups[-1]["lr"] - if self.lr_scheduler is None - else self.lr_scheduler.get_last_lr()[0] - ) - except ( - AttributeError - ): # try loop required since LearnOnPlateau has no "get_last_lr" attribute - curr_lr = ( - self.optimizer.param_groups[-1]["lr"] - if self.lr_scheduler is None - else self.optimizer.param_groups[0]["lr"] - ) - # check that max norm was not given to trainer - if self.max_norm is None: - torch.nn.utils.clip_grad_norm_(self.model.parameters(), curr_lr) - else: - torch.nn.utils.clip_grad_norm_( - self.model.parameters(), self.max_norm - ) - - # Optimizer step - self.gscaler.step(self.optimizer) - self.gscaler.update() - - pbar.set_postfix({"Loss": train_loss.item()}) - - torch.cuda.nvtx.range_pop() - - if self.dist.rank == 0: - self.writer.add_scalar( - tag="loss", scalar_value=train_loss, global_step=iteration - ) - iteration += 1 - training_step += 1 - - torch.cuda.nvtx.range_pop() - torch.cuda.nvtx.range_push(f"validation epoch {epoch}") - - # Validate (without gradients) - if self.sampler_valid is not None: - self.sampler_valid.set_epoch(epoch) - - self.model.eval() - with torch.no_grad(): - validation_stats = torch.zeros( - (2 + len(self.output_variables)), - dtype=torch.float32, - device=self.device, - ) - for inputs, target in ( - pbar := tqdm( - self.dataloader_valid, disable=(not self.print_to_screen) - ) - ): - pbar.set_description(f"Validation epoch {epoch}/{self.max_epochs}") - inputs = [x.to(device=self.device) for x in inputs] - target = target.to(device=self.device) - bsize = float(target.shape[0]) - - # do eval step - if self.eval_graph is not None: - # copy data into entry nodes - for idx, inp in enumerate(inputs): - self.static_inp[idx].copy_(inp) - self.static_tar.copy_(target) - - # replay graph - self.eval_graph.replay() - - # increase the loss - validation_stats[0] += self.static_loss_eval * bsize - - # Same for the per-variable loss - for v_idx, v_name in enumerate(self.output_variables): - validation_stats[1 + v_idx] += ( - self.static_losses_eval[v_idx] * bsize - ) - else: - if self.amp_enable: - with amp.autocast( - enabled=self.amp_enable, dtype=self.amp_dtype - ): - output = self.model(inputs) - validation_stats[0] += ( - self.criterion(prediction=output, target=target) - * bsize - ) - # save per variable loss - eval_losses = self.criterion( - output, target, average_channels=False - ) - for v_idx, v_name in enumerate(self.output_variables): - validation_stats[1 + v_idx] += ( - eval_losses[v_idx] * bsize - ) - else: - output = self.model(inputs) - validation_stats[0] += ( - self.criterion(prediction=output, target=target) * bsize - ) - eval_losses = self.criterion( - output, target, average_channels=False - ) - for v_idx, v_name in enumerate(self.output_variables): - validation_stats[1 + v_idx] += ( - eval_losses[v_idx] * bsize - ) - - pbar.set_postfix( - {"Loss": (validation_stats[0] / validation_stats[-1]).item()} - ) - - # increment sample counter - validation_stats[-1] += bsize - - if torch.distributed.is_initialized(): - torch.distributed.all_reduce(validation_stats) - - validation_error = (validation_stats[0] / validation_stats[-1]).item() - - # Record error per variable - validation_errors = [] - for v_idx, v_name in enumerate(self.output_variables): - validation_errors.append( - (validation_stats[1 + v_idx] / validation_stats[-1]).item() - ) - - # Track validation improvement to later check early stopping criterion - if validation_error < best_validation_error: - best_validation_error = validation_error - epochs_since_improved = 0 - else: - epochs_since_improved += 1 - - torch.cuda.nvtx.range_pop() - - # Logging and checkpoint saving - if self.dist.rank == 0: - if self.lr_scheduler is not None: - self.writer.add_scalar( - tag="learning_rate", - scalar_value=self.optimizer.param_groups[0]["lr"], - global_step=iteration, - ) - self.writer.add_scalar( - tag="val_loss", scalar_value=validation_error, global_step=iteration - ) - - # Per-variable loss - for v_idx, v_name in enumerate(self.output_variables): - self.writer.add_scalar( - tag=f"val_loss/{v_name}", - scalar_value=validation_errors[v_idx], - global_step=iteration, - ) - - # Write model checkpoint to file, using a separate thread - self.logger0.info("Writing checkpoint") - thread = threading.Thread( - target=write_checkpoint, - args=( - self.model.module - if torch.distributed.is_initialized() - else self.model, - self.optimizer, - self.lr_scheduler, - epoch, - iteration, - validation_error, - epochs_since_improved, - self.output_dir_tb, - ), - ) - thread.start() - - # Update learning rate - try: - if self.lr_scheduler is not None: - self.lr_scheduler.step() - except TypeError: # Plateau Learning rate requires val loss - if self.lr_scheduler is not None: - self.lr_scheduler.step(validation_error) - - # Check early stopping criterium - if ( - self.early_stopping_patience is not None - and epochs_since_improved >= self.early_stopping_patience - ): - self.logger0.info( - f"Hit early stopping criterium by not improving the validation error for {epochs_since_improved}" - " epochs. Finishing training." - ) - break - - # Wrap up - if self.dist.rank == 0: - try: - thread.join() - except UnboundLocalError: - pass - self.writer.flush() - self.writer.close() diff --git a/examples/weather/dlwp_healpix_coupled/utils.py b/examples/weather/dlwp_healpix_coupled/utils.py deleted file mode 100644 index 30ceb9cfbb..0000000000 --- a/examples/weather/dlwp_healpix_coupled/utils.py +++ /dev/null @@ -1,125 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-FileCopyrightText: All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob -import logging -import os -import re - -import numpy as np -import torch as th - -logger = logging.getLogger(__name__) - - -# TODO switch over to modulus checkpoint system -def write_checkpoint( - model, - optimizer, - scheduler, - epoch: int, - iteration: int, - val_error: float, - epochs_since_improved: int, - dst_path: str, - keep_n_checkpoints: int = 5, -): - """ - Writes a checkpoint including model, optimizer, and scheduler state dictionaries along with current epoch, - iteration, and validation error to file. - - :param model: The network model - :param optimizer: The pytorch optimizer - :param scheduler: The pytorch learning rate scheduler - :param epoch: Current training epoch - :param iteration: Current training iteration - :param val_error: The validation error of the current training - :param epochs_since_improved: The number of epochs since the validation error improved - :param dst_path: Path where the checkpoint is written to - :param keep_n_checkpoints: Number of best checkpoints that will be saved (worse checkpoints are overwritten) - """ - root_path = os.path.join( - dst_path, - "checkpoints", - ) - # root_path = os.path.dirname(ckpt_dst_path) - ckpt_dst_path = os.path.join( - root_path, - f"training-state-epoch-{str(epoch).zfill(4)}-val_loss=" - + "{:.4E}".format(val_error) - + ".mdlus", - ) - os.makedirs(root_path, exist_ok=True) - - model.save(ckpt_dst_path) - model.save(os.path.join(root_path, "training-state-last.mdlus")) - - opt_dst_path = os.path.join( - root_path, - f"optimizer-state-epoch-{str(epoch).zfill(4)}-val_loss=" - + "{:.4E}".format(val_error) - + ".ckpt", - ) - th.save( - obj={ - "optimizer_state_dict": optimizer.state_dict(), - "scheduler_state_dict": scheduler.state_dict(), - "epoch": epoch + 1, - "iteration": iteration, - "val_error": val_error, - "epochs_since_improved": epochs_since_improved, - }, - f=opt_dst_path, - ) - th.save( - obj={ - "optimizer_state_dict": optimizer.state_dict(), - "scheduler_state_dict": scheduler.state_dict(), - "epoch": epoch + 1, - "iteration": iteration, - "val_error": val_error, - "epochs_since_improved": epochs_since_improved, - }, - f=os.path.join(root_path, "optimizer-state-last.ckpt"), - ) - - # Only keep top n checkpoints - ckpt_paths = np.array(glob.glob(root_path + "/training-state-epoch-*.mdlus")) - if len(ckpt_paths) > keep_n_checkpoints + 1: - worst_path = "" - worst_error = -np.infty - for ckpt_path in ckpt_paths: - if "NAN" in ckpt_path: - os.remove(ckpt_path) - try: - os.remove(ckpt_path.replace("training", "optimizer")) - except FileNotFoundError: - pass - continue - # Read the scientific number from the checkpoint name and perform update if appropriate - curr_error = float( - re.findall("-?\d*\.?\d+E[+-]?\d+", os.path.basename(ckpt_path))[0] - ) - if curr_error > worst_error: - worst_path = ckpt_path - worst_error = curr_error - os.remove(worst_path) - try: - os.remove( - worst_path.replace("training", "optimizer").replace("mdlus", "ckpt") - ) - except FileNotFoundError: - pass