Skip to content

Commit

Permalink
[Feature]Add Ascend NPU support
Browse files Browse the repository at this point in the history
  • Loading branch information
xuuyangg committed Aug 7, 2023
1 parent 5b4ef8d commit d349e2c
Show file tree
Hide file tree
Showing 14 changed files with 335 additions and 4 deletions.
13 changes: 13 additions & 0 deletions examples/cifar10_dist_npu_eval/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# CIFAR-10 Evaluation Example

## Single-process evaluation

```bash
python cifar10_npu_eval.py
```

## Multi-process evaluation with torch.distributed

```bash
python cifar10_eval_torch_npu_dist.py
```
63 changes: 63 additions & 0 deletions examples/cifar10_dist_npu_eval/cifar10_eval_torch_npu_dist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import os
import torch
import torchvision as tv
import tqdm
from torch.utils.data import DataLoader, DistributedSampler

from mmeval import Accuracy


def get_eval_dataloader(rank=0, num_replicas=1):
    """Build the CIFAR-10 test-split loader used by one evaluation process.

    Args:
        rank (int): Index of the current process among all replicas.
            Defaults to 0.
        num_replicas (int): Number of processes sharing the dataset.
            Defaults to 1.

    Returns:
        tuple: The ``DataLoader`` driven by a ``DistributedSampler`` and the
        number of samples in the whole dataset.
    """
    to_tensor = tv.transforms.ToTensor()
    test_set = tv.datasets.CIFAR10(
        root='./', train=False, download=True, transform=to_tensor)
    # The sampler decides which indices this rank iterates over.
    shard_sampler = DistributedSampler(
        test_set, rank=rank, num_replicas=num_replicas)
    loader = DataLoader(test_set, sampler=shard_sampler, batch_size=1)
    return loader, len(test_set)


def get_model(pretrained_model_fpath=None):
    """Create a 10-class ResNet-18 switched to evaluation mode.

    Args:
        pretrained_model_fpath (str, optional): Path of a saved state dict to
            load into the model. When None, no weights are loaded.
            Defaults to None.

    Returns:
        torch.nn.Module: The model returned by ``model.eval()``.
    """
    model = tv.models.resnet18(num_classes=10)
    if pretrained_model_fpath is not None:
        # Deserialize onto the CPU so that a checkpoint saved from a device
        # type that is absent on this host can still be restored; the caller
        # moves the model to its target device afterwards.
        state_dict = torch.load(pretrained_model_fpath, map_location='cpu')
        model.load_state_dict(state_dict)
    return model.eval()


def eval_fn(rank, process_num):
    """Evaluate CIFAR-10 accuracy on an NPU inside one spawned process.

    Joins an HCCL process group, binds the process to an NPU, runs the model
    over this rank's data loader and prints the computed accuracy. The
    process group is torn down when the function exits, including when the
    evaluation raises.

    Args:
        rank (int): Rank of this process in the group; also used to choose
            the NPU (``rank % device_count``).
        process_num (int): World size of the process group.
    """
    master_addr = 'localhost'
    master_port = 12345

    # ``init_method='env://'`` reads the rendezvous address from these
    # environment variables.
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = str(master_port)

    torch.distributed.init_process_group(
        backend='hccl',
        init_method='env://',
        world_size=process_num,
        rank=rank)

    try:
        # Wrap around when there are more processes than NPUs.
        num_npus = torch.npu.device_count()
        torch.npu.set_device(rank % num_npus)

        eval_dataloader, total_num_samples = get_eval_dataloader(
            rank, process_num)
        model = get_model().npu()
        accuracy = Accuracy(topk=(1, 3), dist_backend='npu_dist')

        with torch.no_grad():
            # Only rank 0 shows a progress bar to keep the output readable.
            for images, labels in tqdm.tqdm(
                    eval_dataloader, disable=(rank != 0)):
                images = images.npu()
                labels = labels.npu()
                predicted_score = model(images)
                accuracy.add(predictions=predicted_score, labels=labels)

        # NOTE(review): ``size`` presumably lets the metric discard samples
        # the distributed sampler duplicated for padding -- confirm against
        # the mmeval ``compute`` documentation.
        print(accuracy.compute(size=total_num_samples))
        accuracy.reset()
    finally:
        # Release the communication resources held by the process group.
        torch.distributed.destroy_process_group()


if __name__ == '__main__':
    # Launch one evaluation process per rank; ``spawn`` supplies the rank as
    # the first argument of ``eval_fn`` and forwards ``args`` after it.
    process_num = 8
    worker_args = (process_num, )
    torch.multiprocessing.spawn(
        eval_fn, args=worker_args, nprocs=process_num)
37 changes: 37 additions & 0 deletions examples/cifar10_dist_npu_eval/cifar10_npu_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import torch
import torchvision as tv
import tqdm
from torch.utils.data import DataLoader

from mmeval import Accuracy


def get_eval_dataloader():
    """Return a ``DataLoader`` over the CIFAR-10 test split.

    The dataset is downloaded into the current directory if needed and each
    batch holds a single sample.
    """
    to_tensor = tv.transforms.ToTensor()
    cifar10_test = tv.datasets.CIFAR10(
        root='./', train=False, download=True, transform=to_tensor)
    loader = DataLoader(cifar10_test, batch_size=1)
    return loader


def get_model(pretrained_model_fpath=None):
    """Create a 10-class ResNet-18 switched to evaluation mode.

    Args:
        pretrained_model_fpath (str, optional): Path of a saved state dict to
            load into the model. When None, no weights are loaded.
            Defaults to None.

    Returns:
        torch.nn.Module: The model returned by ``model.eval()``.
    """
    model = tv.models.resnet18(num_classes=10)
    if pretrained_model_fpath is not None:
        # Deserialize onto the CPU so that a checkpoint saved from a device
        # type that is absent on this host can still be restored; the caller
        # moves the model to its target device afterwards.
        state_dict = torch.load(pretrained_model_fpath, map_location='cpu')
        model.load_state_dict(state_dict)
    return model.eval()


# Set up the data pipeline, the NPU-resident model and the metric.
eval_dataloader = get_eval_dataloader()
model = get_model().npu()
accuracy = Accuracy(topk=(1, 3))

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for images, labels in tqdm.tqdm(eval_dataloader):
        # Move the batch to the NPU before the forward pass.
        images, labels = images.npu(), labels.npu()
        predicted_score = model(images)
        accuracy.add(predictions=predicted_score, labels=labels)

# Report the accumulated result, then clear the metric state.
print(accuracy.compute())
accuracy.reset()
7 changes: 4 additions & 3 deletions mmeval/core/dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from typing import List, Optional, no_type_check

from .dist_backends import (BaseDistBackend, MPI4PyDist, NonDist, OneFlowDist,
PaddleDist, TFHorovodDist, TorchCPUDist,
TorchCUDADist)
from .dist_backends import (BaseDistBackend, MPI4PyDist, NonDist, NPUDist,
OneFlowDist, PaddleDist, TFHorovodDist,
TorchCPUDist, TorchCUDADist)

_DIST_BACKENDS = {
'non_dist': NonDist,
Expand All @@ -14,6 +14,7 @@
'torch_cpu': TorchCPUDist,
'torch_cuda': TorchCUDADist,
'paddle_dist': PaddleDist,
'npu_dist': NPUDist
}

_DEFAULT_BACKEND = 'non_dist'
Expand Down
3 changes: 2 additions & 1 deletion mmeval/core/dist_backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .base_backend import BaseDistBackend, TensorBaseDistBackend
from .mpi4py import MPI4PyDist
from .non_dist import NonDist
from .npu_dist import NPUDist
from .oneflow_dist import OneFlowDist
from .paddle_dist import PaddleDist
from .tf_horovod import TFHorovodDist
Expand All @@ -12,5 +13,5 @@
__all__ = [
'BaseDistBackend', 'TensorBaseDistBackend', 'MPI4PyDist', 'NonDist',
'OneFlowDist', 'TFHorovodDist', 'TorchCPUDist', 'TorchCUDADist',
'PaddleDist'
'PaddleDist', 'NPUDist'
]
59 changes: 59 additions & 0 deletions mmeval/core/dist_backends/npu_dist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright (c) OpenMMLab. All rights reserved.

from typing import TYPE_CHECKING, Any, Tuple, TypeVar, Union

from mmeval.utils import try_import
from .torch_cpu import TorchCPUDist

if TYPE_CHECKING:
import torch
import torch_npu
else:
torch = try_import('torch')
torch_npu = try_import('torch_npu')

Tensor = TypeVar('Tensor', bound='torch.Tensor')


class NPUDist(TorchCPUDist):
    """Distributed communication backend for Ascend NPU devices.

    The object/tensor conversion is delegated to ``TorchCPUDist``; this
    subclass moves the converted tensors onto the NPU and brings received
    tensors back to the CPU before they are converted to objects.
    """

    def __init__(self) -> None:
        """Check that ``torch_npu`` and HCCL are usable.

        Raises:
            ImportError: If ``torch_npu`` could not be imported.
            RuntimeError: If ``torch.distributed.is_hccl_available()``
                returns a falsy value.
        """
        super().__init__()

        if torch_npu is None:
            raise ImportError(f'For availability of {self.__class__.__name__},'
                              ' please install ascend pytorch first.')

        hccl_available = torch.distributed.is_hccl_available()
        if not hccl_available:
            raise RuntimeError(
                f'For availability of {self.__class__.__name__},'
                ' make sure torch.distributed.is_hccl_available().')

    def _object_to_tensor(self, obj: Any) -> Tuple[Tensor, Tensor]:
        """Convert the given object to NPU tensors via `pickle.dumps`.

        Args:
            obj (any): Any pickle-able python object.

        Returns:
            tuple: The tensor converted from the given object and the tensor
            holding its size, both moved to the NPU.
        """
        # Explicit annotations keep mypy happy about the parent's return.
        cpu_tensor: Tensor
        cpu_size_tensor: Tensor
        cpu_tensor, cpu_size_tensor = super()._object_to_tensor(obj)
        npu_tensor = cpu_tensor.npu()
        npu_size_tensor = cpu_size_tensor.npu()
        return npu_tensor, npu_size_tensor

    def _tensor_to_object(self, tensor: Tensor,
                          tensor_size: Union[int, Tensor]) -> Any:
        """Convert the given NPU tensor to an object via `pickle.loads`.

        Args:
            tensor (Tensor): An NPU tensor.
            tensor_size (int or Tensor): The size of the given tensor that is
                to be converted to an object.

        Returns:
            Any: The object converted from the given NPU tensor.
        """
        # The parent implementation receives a detached CPU copy.
        host_tensor = tensor.detach().cpu()
        return super()._tensor_to_object(host_tensor, tensor_size)
2 changes: 2 additions & 0 deletions mmeval/metrics/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
jax = try_import('jax')
flow = try_import('oneflow')

torch = try_import('torch')


@overload
@dispatch
Expand Down
2 changes: 2 additions & 0 deletions mmeval/metrics/average_precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
torch = try_import('torch')
flow = try_import('oneflow')

torch_npu = try_import('torch_npu')

NUMPY_IMPL_HINTS = Tuple[Union[np.ndarray, np.number], Union[np.ndarray,
np.number]]
TORCH_IMPL_HINTS = Tuple['torch.Tensor', 'torch.Tensor']
Expand Down
2 changes: 2 additions & 0 deletions mmeval/metrics/end_point_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
torch = try_import('torch')
flow = try_import('oneflow')

torch_npu = try_import('torch_npu')


class EndPointError(BaseMetric):
"""EndPointError evaluation metric.
Expand Down
2 changes: 2 additions & 0 deletions mmeval/metrics/f1_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
torch = try_import('torch')
flow = try_import('oneflow')

torch_npu = try_import('torch_npu')


class F1Score(BaseMetric):
"""Compute F1 scores.
Expand Down
2 changes: 2 additions & 0 deletions mmeval/metrics/mean_iou.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
tf = try_import('tensorflow')
flow = try_import('oneflow')

torch_npu = try_import('torch_npu')


class MeanIoU(BaseMetric):
"""MeanIoU evaluation metric.
Expand Down
2 changes: 2 additions & 0 deletions mmeval/metrics/perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
tf = try_import('tensorflow')
flow = try_import('oneflow')

torch_npu = try_import('torch_npu')


def softmax(x: np.ndarray) -> np.ndarray:
"""Compute the softmax function.
Expand Down
2 changes: 2 additions & 0 deletions mmeval/metrics/precision_recall_f1score.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
flow = try_import('oneflow')
of_F = try_import('oneflow.nn.functional')

torch_npu = try_import('torch_npu')

NUMPY_IMPL_HINTS = Tuple[Union[np.ndarray, np.number], Union[np.ndarray,
np.number]]
TORCH_IMPL_HINTS = Tuple['torch.Tensor', 'torch.Tensor']
Expand Down
Loading

0 comments on commit d349e2c

Please sign in to comment.