From 3a50cf2b7942104388cb231d2e3ea9726cfec6be Mon Sep 17 00:00:00 2001 From: XiangAn Date: Wed, 8 Feb 2023 23:21:02 +0800 Subject: [PATCH] Resolved torchrun Bug: Fixed issue #2163 Updated torch.distributed.launch to torchrun. --- recognition/arcface_torch/README.md | 4 ++-- recognition/arcface_torch/dist.sh | 4 ++-- recognition/arcface_torch/train.py | 4 ++-- recognition/arcface_torch/train_v2.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/recognition/arcface_torch/README.md b/recognition/arcface_torch/README.md index 390c27fae..166749111 100644 --- a/recognition/arcface_torch/README.md +++ b/recognition/arcface_torch/README.md @@ -9,9 +9,9 @@ The "arcface_torch" repository is the official implementation of the ArcFace alg ## Requirements -To avail the latest features of PyTorch, we have upgraded to version 1.9.0. +To avail the latest features of PyTorch, we have upgraded to version 1.12.0. -- Install [PyTorch](http://pytorch.org) (torch>=1.9.0), our doc for [install.md](docs/install.md). +- Install [PyTorch](https://pytorch.org/get-started/previous-versions/) (torch>=1.12.0). - (Optional) Install [DALI](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/), our doc for [install_dali.md](docs/install_dali.md). - `pip install -r requirement.txt`. 
diff --git a/recognition/arcface_torch/dist.sh b/recognition/arcface_torch/dist.sh index 7215ca078..9f3c6a527 100644 --- a/recognition/arcface_torch/dist.sh +++ b/recognition/arcface_torch/dist.sh @@ -4,9 +4,9 @@ config=wf42m_pfc03_32gpu_r100 for((node_rank=0;node_rank<${#ip_list[*]};node_rank++)); do - ssh face@${ip_list[node_rank]} "cd `pwd`;PATH=$PATH \ + ssh ubuntu@${ip_list[node_rank]} "cd `pwd`;PATH=$PATH \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ - python -m torch.distributed.launch \ + torchrun \ --nproc_per_node=8 \ --nnodes=${#ip_list[*]} \ --node_rank=$node_rank \ diff --git a/recognition/arcface_torch/train.py b/recognition/arcface_torch/train.py index ba8a33a61..9cbc56340 100755 --- a/recognition/arcface_torch/train.py +++ b/recognition/arcface_torch/train.py @@ -18,8 +18,8 @@ from utils.utils_distributed_sampler import setup_seed from utils.utils_logging import AverageMeter, init_logging -assert torch.__version__ >= "1.9.0", "In order to enjoy the features of the new torch, \ -we have upgraded the torch to 1.9.0. torch before than 1.9.0 may not work in the future." +assert torch.__version__ >= "1.12.0", "In order to enjoy the features of the new torch, \ +we have upgraded the torch to 1.12.0. torch earlier than 1.12.0 may not work in the future." try: rank = int(os.environ["RANK"]) diff --git a/recognition/arcface_torch/train_v2.py b/recognition/arcface_torch/train_v2.py index 643d717dd..b3dc3e321 100755 --- a/recognition/arcface_torch/train_v2.py +++ b/recognition/arcface_torch/train_v2.py @@ -18,8 +18,8 @@ from utils.utils_distributed_sampler import setup_seed from utils.utils_logging import AverageMeter, init_logging -assert torch.__version__ >= "1.9.0", "In order to enjoy the features of the new torch, \ -we have upgraded the torch to 1.9.0. torch before than 1.9.0 may not work in the future." +assert torch.__version__ >= "1.12.0", "In order to enjoy the features of the new torch, \ +we have upgraded the torch to 1.12.0. 
torch earlier than 1.12.0 may not work in the future." try: rank = int(os.environ["RANK"])