From 650542a05bb9bec64becb138d9d3aef0de52cb9e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 25 Jan 2024 15:29:27 -0800 Subject: [PATCH 01/41] Improvements to multinode papers100m default hyperparams and adding eval on all ranks --- .../multi_gpu/papers100m_gcn_multinode.py | 87 +++++++++---------- 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index f827700ac73c..f00df44667be 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -1,6 +1,6 @@ """Multi-node multi-GPU example on ogbn-papers100m. -To run: +Example way to run using srun: srun -l -N --ntasks-per-node= \ --container-name=cont --container-image= \ --container-mounts=/ogb-papers100m/:/workspace/dataset @@ -16,7 +16,8 @@ from torch.nn.parallel import DistributedDataParallel from torch_geometric.loader import NeighborLoader -from torch_geometric.nn import GCNConv +from torch_geometric.nn.models import GCN +from torchmetrics import Accuracy def get_num_workers() -> int: @@ -31,21 +32,7 @@ def get_num_workers() -> int: return num_workers -class GCN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels): - super().__init__() - self.conv1 = GCNConv(in_channels, hidden_channels) - self.conv2 = GCNConv(hidden_channels, out_channels) - - def forward(self, x, edge_index): - x = F.dropout(x, p=0.5, training=self.training) - x = self.conv1(x, edge_index).relu() - x = F.dropout(x, p=0.5, training=self.training) - x = self.conv2(x, edge_index) - return x - - -def run(world_size, data, split_idx, model): +def run(world_size, data, split_idx, model, acc): local_id = int(os.environ['LOCAL_RANK']) rank = torch.distributed.get_rank() torch.cuda.set_device(local_id) @@ -54,18 +41,20 @@ def run(world_size, data, split_idx, model): print(f'Using {nprocs} GPUs...') split_idx['train'] = split_idx['train'].split( - split_idx['train'].size(0) // world_size, - dim=0, - )[rank].clone() - + split_idx['train'].size(0) // world_size, dim=0)[rank].clone() + split_idx['valid'] = split_idx['valid'].split( + split_idx['valid'].size(0) // world_size, dim=0)[rank].clone() + split_idx['test'] = split_idx['test'].split( + split_idx['test'].size(0) // world_size, dim=0)[rank].clone() + model = DistributedDataParallel(model.to(device), device_ids=[local_id]) - optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) kwargs = dict( data=data, - batch_size=128, + batch_size=1024, num_workers=get_num_workers(), - num_neighbors=[50, 50], + num_neighbors=[16, 16], ) train_loader = NeighborLoader( @@ -73,15 +62,15 @@ def run(world_size, data, split_idx, model): shuffle=True, **kwargs, ) - if rank == 0: - val_loader = NeighborLoader(input_nodes=split_idx['valid'], **kwargs) - test_loader = NeighborLoader(input_nodes=split_idx['test'], **kwargs) + val_loader = NeighborLoader(input_nodes=split_idx['valid'], **kwargs) + test_loader = NeighborLoader(input_nodes=split_idx['test'], **kwargs) val_steps = 1000 warmup_steps = 100 + acc = acc.to(rank) if rank == 0: print("Beginning training...") - + for epoch in range(1, 4): model.train() for i, batch in enumerate(train_loader): @@ -103,7 +92,7 @@ def run(world_size, data, split_idx, model): print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") model.eval() - total_correct = total_examples = 0 + acc_sum = 0.0 for i, batch in enumerate(val_loader): if i >= val_steps: break @@ -113,29 +102,35 @@ def run(world_size, data, split_idx, model): batch = batch.to(device) with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch.batch_size] - pred = out.argmax(dim=-1) - y = batch.y[:batch.batch_size].view(-1).to(torch.long) - - total_correct += int((pred == y).sum()) - total_examples += y.size(0) - - print(f"Val Acc: {total_correct / total_examples:.4f}") + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) + dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) + num_batches = torch.tensor(float(i), dtype=torch.float32, + device=acc_sum.device) + dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) + + print(f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) sec_per_iter = (time.time() - start) / (i - warmup_steps) print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") if rank == 0: model.eval() - total_correct = total_examples = 0 + acc_sum = 0.0 for i, batch in enumerate(test_loader): batch = batch.to(device) with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch.batch_size] - pred = out.argmax(dim=-1) - y = batch.y[:batch.batch_size].view(-1).to(torch.long) - - total_correct += int((pred == y).sum()) - total_examples += y.size(0) - print(f"Test Acc: {total_correct / total_examples:.4f}") + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) + dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) + num_batches = torch.tensor(float(i), dtype=torch.float32, + device=acc_sum.device) + dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) + print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) if __name__ == '__main__': @@ -145,6 +140,6 @@ def run(world_size, data, split_idx, model): assert dist.is_initialized(), "Distributed cluster not initialized" dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() - model = GCN(dataset.num_features, 64, dataset.num_classes) - - run(nprocs, dataset[0], split_idx, model) + model = GCN(dataset.num_features, 128, 2, dataset.num_classes) + acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) + run(nprocs, dataset[0], split_idx, model, acc) From 98ba40ef2e8b3c232ae774320222c8b43aa71129 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jan 2024 23:31:19 +0000 Subject: [PATCH 02/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multi_gpu/papers100m_gcn_multinode.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index f00df44667be..b1c00f91e9aa 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -14,10 +14,10 @@ import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel +from torchmetrics import Accuracy from torch_geometric.loader import NeighborLoader from torch_geometric.nn.models import GCN -from torchmetrics import Accuracy def get_num_workers() -> int: @@ -41,12 +41,12 @@ def run(world_size, data, split_idx, model, acc): print(f'Using {nprocs} GPUs...') split_idx['train'] = split_idx['train'].split( - split_idx['train'].size(0) // world_size, dim=0)[rank].clone() + split_idx['train'].size(0) // world_size, dim=0)[rank].clone() split_idx['valid'] = split_idx['valid'].split( split_idx['valid'].size(0) // world_size, dim=0)[rank].clone() split_idx['test'] = split_idx['test'].split( split_idx['test'].size(0) // world_size, dim=0)[rank].clone() - + model = DistributedDataParallel(model.to(device), device_ids=[local_id]) optimizer = torch.optim.Adam(model.parameters(), lr=0.001) @@ -70,7 +70,7 @@ def run(world_size, data, split_idx, model, acc): acc = acc.to(rank) if rank == 0: print("Beginning training...") - + for epoch in range(1, 4): model.train() for i, batch in enumerate(train_loader): @@ -103,15 +103,16 @@ def run(world_size, data, split_idx, model, acc): with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch.batch_size] acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) num_batches = torch.tensor(float(i), dtype=torch.float32, - device=acc_sum.device) + device=acc_sum.device) dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) - print(f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) + print( + f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) sec_per_iter = (time.time() - start) / (i - warmup_steps) print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") @@ -123,12 +124,12 @@ def run(world_size, data, split_idx, model, acc): with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch.batch_size] acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) num_batches = torch.tensor(float(i), dtype=torch.float32, - device=acc_sum.device) + device=acc_sum.device) dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) From deec96979e7d843790866353fce4384808c38750 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 25 Jan 2024 15:33:10 -0800 Subject: [PATCH 03/41] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9adcbec6c288..8b90d12f6996 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## \[2.5.0\] - 2023-MM-DD ### Added - +- Improvements to multinode papers100m default hyperparams and adding eval on all ranks ([#8823](https://github.com/pyg-team/pytorch_geometric/pull/8823)) - Added support for graph partitioning for temporal data in `torch_geometric.distributed` ([#8718](https://github.com/pyg-team/pytorch_geometric/pull/8718), [#8815](https://github.com/pyg-team/pytorch_geometric/pull/8815)) - Added `TreeGraph` and `GridMotif` generators ([#8736](https://github.com/pyg-team/pytorch_geometric/pull/8736)) - Added an example for edge-level temporal sampling on a heterogenous graph ([#8383](https://github.com/pyg-team/pytorch_geometric/pull/8383)) From ab0fad418be948b1f659dea37cf7a6a08d1c2758 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jan 2024 23:34:42 +0000 Subject: [PATCH 04/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b90d12f6996..5ac92cf3f9b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## \[2.5.0\] - 2023-MM-DD ### Added + - Improvements to multinode papers100m default hyperparams and adding eval on all ranks ([#8823](https://github.com/pyg-team/pytorch_geometric/pull/8823)) - Added support for graph partitioning for temporal data in `torch_geometric.distributed` ([#8718](https://github.com/pyg-team/pytorch_geometric/pull/8718), [#8815](https://github.com/pyg-team/pytorch_geometric/pull/8815)) - Added `TreeGraph` and `GridMotif` generators ([#8736](https://github.com/pyg-team/pytorch_geometric/pull/8736)) From 729a524275415ccf2451a4159133318ae3f7f0cf Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 25 Jan 2024 16:02:26 -0800 Subject: [PATCH 05/41] cleanup --- examples/multi_gpu/papers100m_gcn_multinode.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index b1c00f91e9aa..d80bd4ad9f89 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -77,9 +77,10 @@ def run(world_size, data, split_idx, model, acc): if i == warmup_steps: start = time.time() batch = batch.to(device) + batch_size = batch.batch_size optimizer.zero_grad() - y = batch.y[:batch.batch_size].view(-1).to(torch.long) - out = model(batch.x, batch.edge_index)[:batch.batch_size] + y = batch.y[:batch_size].view(-1).to(torch.long) + out = model(batch.x, batch.edge_index)[:batch_size] loss = F.cross_entropy(out, y) loss.backward() optimizer.step() @@ -100,8 +101,9 @@ def run(world_size, data, split_idx, model, acc): start = time.time() batch = batch.to(device) + batch_size = batch.batch_size with torch.no_grad(): - out = model(batch.x, batch.edge_index)[:batch.batch_size] + out = model(batch.x, batch.edge_index)[:batch_size] acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, @@ -121,8 +123,9 @@ def run(world_size, data, split_idx, model, acc): acc_sum = 0.0 for i, batch in enumerate(test_loader): batch = batch.to(device) + batch_size = batch.batch_size with torch.no_grad(): - out = model(batch.x, batch.edge_index)[:batch.batch_size] + out = model(batch.x, batch.edge_index)[:batch_size] acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, @@ -131,7 +134,7 @@ def run(world_size, data, split_idx, model, acc): num_batches = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) - print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) + print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) if __name__ == '__main__': From a152835343f985bab2023d72722d708368147f5d Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 26 Jan 2024 09:21:19 -0800 Subject: [PATCH 06/41] fixing --- .../multi_gpu/papers100m_gcn_multinode.py | 61 ++++++++++--------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index d80bd4ad9f89..e16a8e08edbe 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -87,41 +87,19 @@ def run(world_size, data, split_idx, model, acc): if rank == 0 and i % 10 == 0: print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') - + + dist.barrier() if rank == 0: sec_per_iter = (time.time() - start) / (i - warmup_steps) print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") - - model.eval() - acc_sum = 0.0 - for i, batch in enumerate(val_loader): - if i >= val_steps: - break - if i == warmup_steps: - start = time.time() - - batch = batch.to(device) - batch_size = batch.batch_size - with torch.no_grad(): - out = model(batch.x, batch.edge_index)[:batch_size] - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) - dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - num_batches = torch.tensor(float(i), dtype=torch.float32, - device=acc_sum.device) - dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) - - print( - f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) - sec_per_iter = (time.time() - start) / (i - warmup_steps) - print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") - - if rank == 0: model.eval() acc_sum = 0.0 - for i, batch in enumerate(test_loader): + for i, batch in enumerate(val_loader): + if i >= val_steps: + break + if i == warmup_steps: + start = time.time() + batch = batch.to(device) batch_size = batch.batch_size with torch.no_grad(): @@ -134,7 +112,30 @@ def run(world_size, data, split_idx, model, acc): num_batches = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) + if rank == 0: + print(f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) + sec_per_iter = (time.time() - start) / (i - warmup_steps) + print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") + dist.barrier() + + model.eval() + acc_sum = 0.0 + for i, batch in enumerate(test_loader): + batch = batch.to(device) + batch_size = batch.batch_size + with torch.no_grad(): + out = model(batch.x, batch.edge_index)[:batch_size] + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) + dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) + num_batches = torch.tensor(float(i), dtype=torch.float32, + device=acc_sum.device) + dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) + if rank == 0: print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) + dist.barrier() if __name__ == '__main__': From 281de731a9dcba36035c2d76d24b578bd23eda72 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Jan 2024 17:25:52 +0000 Subject: [PATCH 07/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_multinode.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index e16a8e08edbe..7fff717c57f0 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -87,7 +87,7 @@ def run(world_size, data, split_idx, model, acc): if rank == 0 and i % 10 == 0: print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') - + dist.barrier() if rank == 0: sec_per_iter = (time.time() - start) / (i - warmup_steps) @@ -113,7 +113,8 @@ def run(world_size, data, split_idx, model, acc): device=acc_sum.device) dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) if rank == 0: - print(f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) + print( + f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) sec_per_iter = (time.time() - start) / (i - warmup_steps) print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") dist.barrier() @@ -125,10 +126,8 @@ def run(world_size, data, split_idx, model, acc): batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) + acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) num_batches = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) From acce4dee8f8ce0434fc50e7780cb1764ed630d89 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 26 Jan 2024 11:15:57 -0800 Subject: [PATCH 08/41] graphsage --- examples/multi_gpu/papers100m_gcn_multinode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 7fff717c57f0..7269acb56ae4 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -17,7 +17,7 @@ from torchmetrics import Accuracy from torch_geometric.loader import NeighborLoader -from torch_geometric.nn.models import GCN +from torch_geometric.nn.models import GraphSAGE def get_num_workers() -> int: @@ -144,6 +144,6 @@ def run(world_size, data, split_idx, model, acc): assert dist.is_initialized(), "Distributed cluster not initialized" dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() - model = GCN(dataset.num_features, 128, 2, dataset.num_classes) + model = GraphSAGE(dataset.num_features, 128, 2, dataset.num_classes) acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) run(nprocs, dataset[0], split_idx, model, acc) From 07c57817ce1b7688cd46c84ea1fff38769cd577f Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 26 Jan 2024 11:16:36 -0800 Subject: [PATCH 09/41] back to GCN --- examples/multi_gpu/papers100m_gcn_multinode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 7269acb56ae4..7fff717c57f0 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -17,7 +17,7 @@ from torchmetrics import Accuracy from torch_geometric.loader import NeighborLoader -from torch_geometric.nn.models import GraphSAGE +from torch_geometric.nn.models import GCN def get_num_workers() -> int: @@ -144,6 +144,6 @@ def run(world_size, data, split_idx, model, acc): assert dist.is_initialized(), "Distributed cluster not initialized" dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() - model = GraphSAGE(dataset.num_features, 128, 2, dataset.num_classes) + model = GCN(dataset.num_features, 128, 2, dataset.num_classes) acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) run(nprocs, dataset[0], split_idx, model, acc) From 2d37cc71289e4b5b4540a0f0cd111822a7115902 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 26 Jan 2024 11:38:24 -0800 Subject: [PATCH 10/41] specify download location --- examples/multi_gpu/papers100m_gcn_multinode.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 7fff717c57f0..692cdd1ad34c 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -142,7 +142,8 @@ def run(world_size, data, split_idx, model, acc): torch.distributed.init_process_group("nccl") nprocs = dist.get_world_size() assert dist.is_initialized(), "Distributed cluster not initialized" - dataset = PygNodePropPredDataset(name='ogbn-papers100M') + dataset = PygNodePropPredDataset(name='ogbn-papers100M', + root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() model = GCN(dataset.num_features, 128, 2, dataset.num_classes) acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) From d2c851b074e7ff082a048cdf30bf692fb142fd0f Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 12:50:45 -0800 Subject: [PATCH 11/41] better hyperparams --- examples/multi_gpu/papers100m_gcn_multinode.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 692cdd1ad34c..64da6dbe81cf 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -48,13 +48,13 @@ def run(world_size, data, split_idx, model, acc): split_idx['test'].size(0) // world_size, dim=0)[rank].clone() model = DistributedDataParallel(model.to(device), device_ids=[local_id]) - optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4) kwargs = dict( data=data, batch_size=1024, num_workers=get_num_workers(), - num_neighbors=[16, 16], + num_neighbors=[10, 10, 10], ) train_loader = NeighborLoader( @@ -71,7 +71,7 @@ def run(world_size, data, split_idx, model, acc): if rank == 0: print("Beginning training...") - for epoch in range(1, 4): + for epoch in range(1, 21): model.train() for i, batch in enumerate(train_loader): if i == warmup_steps: @@ -145,6 +145,6 @@ def run(world_size, data, split_idx, model, acc): dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() - model = GCN(dataset.num_features, 128, 2, dataset.num_classes) + model = GCN(dataset.num_features, 256, 3, dataset.num_classes) acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) run(nprocs, dataset[0], split_idx, model, acc) From f167890c41743348396dcf39242b8ec1b1fae1fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Jan 2024 22:24:53 +0000 Subject: [PATCH 12/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_multinode.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 64da6dbe81cf..46a7b8049426 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -48,7 +48,8 @@ def run(world_size, data, split_idx, model, acc): split_idx['test'].size(0) // world_size, dim=0)[rank].clone() model = DistributedDataParallel(model.to(device), device_ids=[local_id]) - optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4) + optimizer = torch.optim.Adam(model.parameters(), lr=0.001, + weight_decay=5e-4) kwargs = dict( data=data, From 7bc1ecc8099ac79e18fb6a90c717d3ace73e725d Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 14:26:32 -0800 Subject: [PATCH 13/41] adding cuda sync --- examples/multi_gpu/papers100m_gcn_multinode.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 46a7b8049426..d8cd577e7593 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -90,6 +90,7 @@ def run(world_size, data, split_idx, model, acc): print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') dist.barrier() + torch.cuda.synchronize() if rank == 0: sec_per_iter = (time.time() - start) / (i - warmup_steps) print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") From 87cd881a5ce36cd61d998727be2cde5e9436bdeb Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 14:33:17 -0800 Subject: [PATCH 14/41] cuda sync --- examples/multi_gpu/papers100m_gcn_multinode.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index d8cd577e7593..7f086010d81b 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -114,6 +114,7 @@ def run(world_size, data, split_idx, model, acc): num_batches = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) + torch.cuda.synchronize() if rank == 0: print( f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) From 0ffdccd8da1aa907ef48fcc03f8df9f0dd1e205e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 13:59:01 -0800 Subject: [PATCH 15/41] new hyperparams --- examples/multi_gpu/papers100m_gcn_multinode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 7f086010d81b..94325d8aae05 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -55,7 +55,7 @@ def run(world_size, data, split_idx, model, acc): data=data, batch_size=1024, num_workers=get_num_workers(), - num_neighbors=[10, 10, 10], + num_neighbors=[30, 30], ) train_loader = NeighborLoader( @@ -148,6 +148,6 @@ def run(world_size, data, split_idx, model, acc): dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() - model = GCN(dataset.num_features, 256, 3, dataset.num_classes) + model = GCN(dataset.num_features, 256, 2, dataset.num_classes) acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) run(nprocs, dataset[0], split_idx, model, acc) From 60b6db43ef7eaaf8992e092dbded0ad9342575d4 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 16:36:38 -0800 Subject: [PATCH 16/41] cuda syncs for timing --- examples/multi_gpu/papers100m_gcn_multinode.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 94325d8aae05..5826a7c8a5d3 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -76,6 +76,7 @@ def run(world_size, data, split_idx, model, acc): model.train() for i, batch in enumerate(train_loader): if i == warmup_steps: + torch.cuda.synchronize() start = time.time() batch = batch.to(device) batch_size = batch.batch_size @@ -100,6 +101,7 @@ def run(world_size, data, split_idx, model, acc): if i >= val_steps: break if i == warmup_steps: + torch.cuda.synchronize() start = time.time() batch = batch.to(device) From f1894a56b0f7d09872ac7c79db7481917e13a537 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 16:49:46 -0800 Subject: [PATCH 17/41] better timing --- examples/multi_gpu/papers100m_gcn_multinode.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 5826a7c8a5d3..3cd198d1f919 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -32,7 +32,7 @@ def get_num_workers() -> int: return num_workers -def run(world_size, data, split_idx, model, acc): +def run(world_size, data, split_idx, model, acc, wall_clock_start): local_id = int(os.environ['LOCAL_RANK']) rank = torch.distributed.get_rank() torch.cuda.set_device(local_id) @@ -69,7 +69,10 @@ def run(world_size, data, split_idx, model, acc): val_steps = 1000 warmup_steps = 100 acc = acc.to(rank) + dist.barrier() + torch.cuda.synchronize() if rank == 0: + print("Total time before training begins=", round(time.perf_counter() - wall_clock_start, 2), "seconds") print("Beginning training...") for epoch in range(1, 21): @@ -140,9 +143,11 @@ def run(world_size, data, split_idx, model, acc): if rank == 0: print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) dist.barrier() + print("Total Program Runtime=", round(time.perf_counter() - wall_clock_start, 2), "seconds") if __name__ == '__main__': + wall_clock_start = time.perf_counter() # Setup multi-node: torch.distributed.init_process_group("nccl") nprocs = dist.get_world_size() @@ -152,4 +157,4 @@ def run(world_size, data, split_idx, model, acc): split_idx = dataset.get_idx_split() model = GCN(dataset.num_features, 256, 2, dataset.num_classes) acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) - run(nprocs, dataset[0], split_idx, model, acc) + run(nprocs, dataset[0], split_idx, model, acc, wall_clock_start) From 2e609805bbc7407a3d4e4dab243af026fbb8fdb2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Jan 2024 00:50:48 +0000 Subject: [PATCH 18/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_multinode.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 3cd198d1f919..4150bab8bd18 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -72,7 +72,8 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): dist.barrier() torch.cuda.synchronize() if rank == 0: - print("Total time before training begins=", round(time.perf_counter() - wall_clock_start, 2), "seconds") + print("Total time before training begins=", + round(time.perf_counter() - wall_clock_start, 2), "seconds") print("Beginning training...") for epoch in range(1, 21): @@ -143,7 +144,8 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): if rank == 0: print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) dist.barrier() - print("Total Program Runtime=", round(time.perf_counter() - wall_clock_start, 2), "seconds") + print("Total Program Runtime=", + round(time.perf_counter() - wall_clock_start, 2), "seconds") if __name__ == '__main__': From b23f8a6545c0cb14c5d0e89061ac0d0e10a2bf50 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 5 Feb 2024 12:04:48 -0800 Subject: [PATCH 19/41] clean up --- examples/multi_gpu/papers100m_gcn_multinode.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 4150bab8bd18..55b1e0629b50 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -96,8 +96,9 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): dist.barrier() torch.cuda.synchronize() + num_batches = i + 1 if rank == 0: - sec_per_iter = (time.time() - start) / (i - warmup_steps) + sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") model.eval() acc_sum = 0.0 @@ -117,14 +118,14 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - num_batches = torch.tensor(float(i), dtype=torch.float32, + num_batches = torch.tensor(float(i + 1), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) torch.cuda.synchronize() if rank == 0: print( f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) - sec_per_iter = (time.time() - start) / (i - warmup_steps) + sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") dist.barrier() @@ -138,7 +139,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - num_batches = torch.tensor(float(i), dtype=torch.float32, + num_batches = torch.tensor(float(i + 1), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) if rank == 0: From 42dcac01a4c65d150b2824c963272690e26275db Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 5 Feb 2024 12:33:14 -0800 Subject: [PATCH 20/41] cleaning --- examples/multi_gpu/papers100m_gcn_multinode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 55b1e0629b50..4574a045e0d5 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -96,7 +96,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): dist.barrier() torch.cuda.synchronize() - num_batches = i + 1 + num_batches = i + 1.0 if rank == 0: sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") From 907af56b2889e79c3666589bfc7fe5199cd3bd27 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 5 Feb 2024 13:23:41 -0800 Subject: [PATCH 21/41] better timing --- examples/multi_gpu/papers100m_gcn_multinode.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 4574a045e0d5..eb4495972af5 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -72,8 +72,9 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): dist.barrier() torch.cuda.synchronize() if rank == 0: - print("Total time before training begins=", - round(time.perf_counter() - wall_clock_start, 2), "seconds") + prep_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total time before training begins (prep_time)=", + prep_time, "seconds") print("Beginning training...") for epoch in range(1, 21): @@ -145,8 +146,10 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): if rank == 0: print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) dist.barrier() - print("Total Program Runtime=", - round(time.perf_counter() - wall_clock_start, 2), "seconds") + total_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total Program Runtime (total_time) =", + total_time, "seconds") + print("total_time - prep_time =", total_time - prep_time, "seconds") if __name__ == '__main__': From db9c123fda262b75cd534431d5bba7a57d142107 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 19:25:19 +0000 Subject: [PATCH 22/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_multinode.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index eb4495972af5..fe61d696c2bd 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -73,8 +73,8 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): torch.cuda.synchronize() if rank == 0: prep_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total time before training begins (prep_time)=", - prep_time, "seconds") + print("Total time before training begins (prep_time)=", prep_time, + "seconds") print("Beginning training...") for epoch in range(1, 21): @@ -147,8 +147,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) dist.barrier() total_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total Program Runtime (total_time) =", - total_time, "seconds") + print("Total Program Runtime (total_time) =", total_time, "seconds") print("total_time - prep_time =", total_time - prep_time, "seconds") From 286832723c3716a55fad306100ee32b1d806951e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 15 Feb 2024 14:16:04 -0800 Subject: [PATCH 23/41] fix --- examples/multi_gpu/papers100m_gcn_multinode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index fe61d696c2bd..0058531cde90 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -68,7 +68,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): val_steps = 1000 warmup_steps = 100 - acc = acc.to(rank) + acc = acc.to(device) dist.barrier() torch.cuda.synchronize() if rank == 0: From 9c905047b186f21f094cef2a7b6158beffa290b6 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 16 Feb 2024 08:24:54 -0800 Subject: [PATCH 24/41] fix for eval --- examples/multi_gpu/papers100m_gcn_multinode.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 0058531cde90..992705d060f5 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -162,4 +162,6 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): split_idx = dataset.get_idx_split() model = GCN(dataset.num_features, 256, 2, dataset.num_classes) acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) - run(nprocs, dataset[0], split_idx, model, acc, wall_clock_start) + data = dataset[0] + data.y = data.y.reshape(-1) + run(nprocs, data, split_idx, model, acc, wall_clock_start) From 69c942ad5cb18cc89628c74081a4a088a081287c Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 16 Feb 2024 09:57:53 -0800 Subject: [PATCH 25/41] fixing copypaste from SNMG --- examples/multi_gpu/papers100m_gcn_multinode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 992705d060f5..c497d0a15104 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -117,7 +117,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) + device=device) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) num_batches = torch.tensor(float(i + 1), dtype=torch.float32, device=acc_sum.device) @@ -138,7 +138,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=device) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) num_batches = torch.tensor(float(i + 1), dtype=torch.float32, device=acc_sum.device) From e5bbd30d070001323f031048d71543f9b74e06aa Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 16 Feb 2024 11:58:26 -0800 Subject: [PATCH 26/41] final cleanup, its running well now --- examples/multi_gpu/papers100m_gcn_multinode.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index c497d0a15104..2af9d72530cc 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -146,9 +146,10 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): if rank == 0: print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) dist.barrier() - total_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total Program Runtime (total_time) =", total_time, "seconds") - print("total_time - prep_time =", total_time - prep_time, "seconds") + if rank == 0: + total_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total Program Runtime (total_time) =", total_time, "seconds") + print("total_time - prep_time =", total_time - prep_time, "seconds") if __name__ == '__main__': From 23179c3c4bbdba60d863f0c68fbdbe1b24291522 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 16:21:10 -0800 Subject: [PATCH 27/41] using acc.compute to align with mag240m and single node papers100m examples. --- .../multi_gpu/papers100m_gcn_multinode.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 2af9d72530cc..65bbdce7abdf 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -114,20 +114,16 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] - acc_sum += acc(out[:batch_size].softmax(dim=-1), + acc_i = acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=device) - dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - num_batches = torch.tensor(float(i + 1), dtype=torch.float32, - device=acc_sum.device) - dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) + acc.compute() torch.cuda.synchronize() if rank == 0: print( f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") + acc.reset() dist.barrier() model.eval() @@ -137,15 +133,12 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] - acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=device) - dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - num_batches = torch.tensor(float(i + 1), dtype=torch.float32, - device=acc_sum.device) - dist.all_reduce(num_batches, op=dist.ReduceOp.SUM) + acc_i = acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) + acc.compute() if rank == 0: print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) dist.barrier() + acc.reset() if rank == 0: total_time = round(time.perf_counter() - wall_clock_start, 2) print("Total Program Runtime (total_time) =", total_time, "seconds") From a36248fdedf63081beef395361b88deea99d4b2d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Mar 2024 00:22:15 +0000 Subject: [PATCH 28/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_multinode.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 65bbdce7abdf..a61c5b5f7775 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -114,8 +114,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] - acc_i = acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + acc_i = acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc.compute() torch.cuda.synchronize() if rank == 0: From e0b2301d52e4ca8300e6ef154dd32da8b44adfbf Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 16:36:11 -0800 Subject: [PATCH 29/41] Update papers100m_gcn_multinode.py --- .../multi_gpu/papers100m_gcn_multinode.py | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index a61c5b5f7775..d5b537b216f0 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -61,6 +61,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): train_loader = NeighborLoader( input_nodes=split_idx['train'], shuffle=True, + drop_last=True, **kwargs, ) val_loader = NeighborLoader(input_nodes=split_idx['valid'], **kwargs) @@ -101,41 +102,37 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): if rank == 0: sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") - model.eval() - acc_sum = 0.0 - for i, batch in enumerate(val_loader): - if i >= val_steps: - break - if i == warmup_steps: - torch.cuda.synchronize() - start = time.time() - - batch = batch.to(device) - batch_size = batch.batch_size - with torch.no_grad(): - out = model(batch.x, batch.edge_index)[:batch_size] - acc_i = acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - acc.compute() - torch.cuda.synchronize() + def eval(loader: NeighborLoader, val_steps: Optional[int] = None): + model.eval() + for i, batch in enumerate(loader): + if val_steps is not None and i >= val_steps: + break + if i == warmup_steps: + torch.cuda.synchronize() + start = time.time() + + batch = batch.to(device) + batch_size = batch.batch_size + with torch.no_grad(): + out = model(batch.x, batch.edge_index)[:batch_size] + acc_i = acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + acc_sum = acc.compute() + torch.cuda.synchronize() + return acc_sum, start, i + 1 + + eval_acc, eval_start_time, num_batches = eval(val_loader, val_steps) if rank == 0: print( - f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) - sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) + f"Validation Accuracy: {eval_acc * 100.0:.4f}%", ) + sec_per_iter = (time.time() - eval_start_time) / (num_batches - warmup_steps) print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") acc.reset() dist.barrier() - model.eval() - acc_sum = 0.0 - for i, batch in enumerate(test_loader): - batch = batch.to(device) - batch_size = batch.batch_size - with torch.no_grad(): - out = model(batch.x, batch.edge_index)[:batch_size] - acc_i = acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - acc.compute() + test_acc, _, _ = eval(test_loader) if rank == 0: - print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", ) + print(f"Test Accuracy: {test_acc * 100.0:.4f}%", ) dist.barrier() acc.reset() if rank == 0: From b7fe9285f4ada4a6c7668e8cf8aef80e7cc4d93c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Mar 2024 00:37:13 +0000 Subject: [PATCH 30/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_multinode.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index d5b537b216f0..2b8631349d50 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -102,6 +102,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): if rank == 0: sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") + def eval(loader: NeighborLoader, val_steps: Optional[int] = None): model.eval() for i, batch in enumerate(loader): @@ -110,22 +111,22 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): if i == warmup_steps: torch.cuda.synchronize() start = time.time() - + batch = batch.to(device) batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] acc_i = acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) acc_sum = acc.compute() torch.cuda.synchronize() return acc_sum, start, i + 1 eval_acc, eval_start_time, num_batches = eval(val_loader, val_steps) if rank == 0: - print( - f"Validation Accuracy: {eval_acc * 100.0:.4f}%", ) - sec_per_iter = (time.time() - eval_start_time) / (num_batches - warmup_steps) + print(f"Validation Accuracy: {eval_acc * 100.0:.4f}%", ) + sec_per_iter = (time.time() - eval_start_time) / (num_batches - + warmup_steps) print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") acc.reset() dist.barrier() From 7c38068ce78f92973d9283e3fb310ef2e9328f87 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 18:53:11 -0800 Subject: [PATCH 31/41] cleaning --- examples/multi_gpu/papers100m_gcn_multinode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 2b8631349d50..aa5c9fa9cc1a 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -15,7 +15,7 @@ from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy - +from typing import Optional from torch_geometric.loader import NeighborLoader from torch_geometric.nn.models import GCN @@ -116,7 +116,7 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] - acc_i = acc(out[:batch_size].softmax(dim=-1), + acc_i = acc(out[:batch_size].softmax(dim=-1), # noqa batch.y[:batch_size]) acc_sum = acc.compute() torch.cuda.synchronize() From 8eaa4b913e7fcce1d45ec4bd38be9999e5c8f667 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Mar 2024 02:54:10 +0000 Subject: [PATCH 32/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_multinode.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index aa5c9fa9cc1a..90509a9c4e2e 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -8,6 +8,7 @@ """ import os import time +from typing import Optional import torch import torch.distributed as dist @@ -15,7 +16,7 @@ from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy -from typing import Optional + from torch_geometric.loader import NeighborLoader from torch_geometric.nn.models import GCN @@ -116,8 +117,9 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] - acc_i = acc(out[:batch_size].softmax(dim=-1), # noqa - batch.y[:batch_size]) + acc_i = acc( + out[:batch_size].softmax(dim=-1), # noqa + batch.y[:batch_size]) acc_sum = acc.compute() torch.cuda.synchronize() return acc_sum, start, i + 1 From 55b4f713fb383b12825ec715c27aa8dfd17b4540 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 7 Mar 2024 09:05:49 -0800 Subject: [PATCH 33/41] Update papers100m_gcn_multinode.py --- examples/multi_gpu/papers100m_gcn_multinode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 90509a9c4e2e..583f4be10c75 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -117,8 +117,8 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] - acc_i = acc( - out[:batch_size].softmax(dim=-1), # noqa + acc_i = acc( # noqa + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() torch.cuda.synchronize() From 3e1d8799ab8f6cf1940647e66f1b530aa36032e9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Mar 2024 17:06:49 +0000 Subject: [PATCH 34/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_multinode.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 583f4be10c75..abb05ffdb9d3 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -117,9 +117,8 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): batch_size = batch.batch_size with torch.no_grad(): out = model(batch.x, batch.edge_index)[:batch_size] - acc_i = acc( # noqa - out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + acc_i = acc( # noqa + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() torch.cuda.synchronize() return acc_sum, start, i + 1 From a994b62d55e773158a92d10ccca9505392e89882 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 7 Mar 2024 14:52:08 -0800 Subject: [PATCH 35/41] Update papers100m_gcn_multinode.py --- examples/multi_gpu/papers100m_gcn_multinode.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index abb05ffdb9d3..b3acd3ab384f 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -106,9 +106,11 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): def eval(loader: NeighborLoader, val_steps: Optional[int] = None): model.eval() + start = None for i, batch in enumerate(loader): if val_steps is not None and i >= val_steps: break + if i == warmup_steps: torch.cuda.synchronize() start = time.time() From 3a91d0352d845caa2fa18c592fe7f2bf3d45321e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 7 Mar 2024 15:59:08 -0800 Subject: [PATCH 36/41] Update papers100m_gcn_multinode.py --- examples/multi_gpu/papers100m_gcn_multinode.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index b3acd3ab384f..22c89426bc7d 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -107,11 +107,11 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): def eval(loader: NeighborLoader, val_steps: Optional[int] = None): model.eval() start = None - for i, batch in enumerate(loader): - if val_steps is not None and i >= val_steps: + for j, batch in enumerate(loader): + if val_steps is not None and j >= val_steps: break - if i == warmup_steps: + if j == warmup_steps: torch.cuda.synchronize() start = time.time() From 8ebf76250391d57bb36d6aac24a5d2e1cad9158f Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 7 Mar 2024 16:16:05 -0800 Subject: [PATCH 37/41] Update papers100m_gcn_multinode.py --- examples/multi_gpu/papers100m_gcn_multinode.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 22c89426bc7d..50d0826ada7a 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -110,11 +110,6 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): for j, batch in enumerate(loader): if val_steps is not None and j >= val_steps: break - - if j == warmup_steps: - torch.cuda.synchronize() - start = time.time() - batch = batch.to(device) batch_size = batch.batch_size with torch.no_grad(): @@ -122,23 +117,20 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): acc_i = acc( # noqa out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() - torch.cuda.synchronize() - return acc_sum, start, i + 1 + return acc_sum - eval_acc, eval_start_time, num_batches = eval(val_loader, val_steps) + eval_acc = eval(val_loader, val_steps) if rank == 0: print(f"Validation Accuracy: {eval_acc * 100.0:.4f}%", ) - sec_per_iter = (time.time() - eval_start_time) / (num_batches - - warmup_steps) - print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") acc.reset() dist.barrier() - test_acc, _, _ = eval(test_loader) + test_acc = eval(test_loader) if rank == 0: print(f"Test Accuracy: {test_acc * 100.0:.4f}%", ) dist.barrier() acc.reset() + torch.cuda.synchronize() if rank == 0: total_time = round(time.perf_counter() - wall_clock_start, 2) print("Total Program Runtime (total_time) =", total_time, "seconds") From e10a6eecd3b0d2c2299cc4f54f0902e8b47b3b12 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 7 Mar 2024 16:40:21 -0800 Subject: [PATCH 38/41] Update papers100m_gcn_multinode.py --- examples/multi_gpu/papers100m_gcn_multinode.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 50d0826ada7a..9a99640c841d 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -106,7 +106,6 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): def eval(loader: NeighborLoader, val_steps: Optional[int] = None): model.eval() - start = None for j, batch in enumerate(loader): if val_steps is not None and j >= val_steps: break From 872352125dc4e437fe87d6ac8b8a025a8447a313 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Tue, 12 Mar 2024 14:43:30 +0000 Subject: [PATCH 39/41] update --- CHANGELOG.md | 2 +- examples/multi_gpu/papers100m_gcn_multinode.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d670faecad8d..0f44b9c6d3d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- Improvements to multinode papers100m default hyperparams and adding eval on all ranks ([#8823](https://github.com/pyg-team/pytorch_geometric/pull/8823)) - Added support for `EdgeIndex` in `MessagePassing` ([#9007](https://github.com/pyg-team/pytorch_geometric/pull/9007)) - Added support for `torch.compile` in combination with `EdgeIndex` ([#9007](https://github.com/pyg-team/pytorch_geometric/pull/9007)) - Added a `ogbn-mag240m` example ([#8249](https://github.com/pyg-team/pytorch_geometric/pull/8249)) @@ -16,6 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- Improvements to multi-node `ogbn-papers100m` default hyperparameters and adding evaluation on all ranks ([#8823](https://github.com/pyg-team/pytorch_geometric/pull/8823)) - Remove filtering of node/edge types in `trim_to_layer` functionality ([#9021](https://github.com/pyg-team/pytorch_geometric/pull/9021)) - Default to `scatter` operations in `MessagePassing` in case `torch.use_deterministic_algorithms` is not set ([#9009](https://github.com/pyg-team/pytorch_geometric/pull/9009)) - Made `MessagePassing` interface thread-safe ([#9001](https://github.com/pyg-team/pytorch_geometric/pull/9001)) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 9a99640c841d..8c8e803f1c44 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -18,7 +18,7 @@ from torchmetrics import Accuracy from torch_geometric.loader import NeighborLoader -from torch_geometric.nn.models import GCN +from torch_geometric.nn import GCN def get_num_workers() -> int: @@ -104,6 +104,7 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") + @torch.no_grad() def eval(loader: NeighborLoader, val_steps: Optional[int] = None): model.eval() for j, batch in enumerate(loader): @@ -111,22 +112,21 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): break batch = batch.to(device) batch_size = batch.batch_size - with torch.no_grad(): - out = model(batch.x, batch.edge_index)[:batch_size] - acc_i = acc( # noqa - out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) + out = model(batch.x, batch.edge_index) + acc(out[:batch_size], batch.y[:batch_size]) acc_sum = acc.compute() return acc_sum eval_acc = eval(val_loader, val_steps) if rank == 0: - print(f"Validation Accuracy: {eval_acc * 100.0:.4f}%", ) - acc.reset() - dist.barrier() + print(f"Val Accuracy: {eval_acc:.4f}%", ) + + acc.reset() + dist.barrier() test_acc = eval(test_loader) if rank == 0: - print(f"Test Accuracy: {test_acc * 100.0:.4f}%", ) + print(f"Test Accuracy: {test_acc:.4f}%", ) dist.barrier() acc.reset() torch.cuda.synchronize() From eff9ef6d6409a1a31079614f1d64e2fd8f81ea64 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Tue, 12 Mar 2024 14:45:54 +0000 Subject: [PATCH 40/41] update --- .../multi_gpu/papers100m_gcn_multinode.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 8c8e803f1c44..5b00184b26ae 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -86,10 +86,9 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): torch.cuda.synchronize() start = time.time() batch = batch.to(device) - batch_size = batch.batch_size optimizer.zero_grad() - y = batch.y[:batch_size].view(-1).to(torch.long) - out = model(batch.x, batch.edge_index)[:batch_size] + y = batch.y[:batch.batch_size].view(-1).to(torch.long) + out = model(batch.x, batch.edge_index)[:batch.batch_size] loss = F.cross_entropy(out, y) loss.backward() optimizer.step() @@ -99,9 +98,8 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): dist.barrier() torch.cuda.synchronize() - num_batches = i + 1.0 if rank == 0: - sec_per_iter = (time.time() - start) / (num_batches - warmup_steps) + sec_per_iter = (time.time() - start) / (i + 1 - warmup_steps) print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") @torch.no_grad() @@ -111,9 +109,9 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): if val_steps is not None and j >= val_steps: break batch = batch.to(device) - batch_size = batch.batch_size - out = model(batch.x, batch.edge_index) - acc(out[:batch_size], batch.y[:batch_size]) + out = model(batch.x, batch.edge_index)[:batch.batch_size] + y = batch.y[:batch.batch_size].view(-1).to(torch.long) + acc(out, y) acc_sum = acc.compute() return acc_sum @@ -127,9 +125,11 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): test_acc = eval(test_loader) if rank == 0: print(f"Test Accuracy: {test_acc:.4f}%", ) + dist.barrier() acc.reset() torch.cuda.synchronize() + if rank == 0: total_time = round(time.perf_counter() - wall_clock_start, 2) print("Total Program Runtime (total_time) =", total_time, "seconds") @@ -142,8 +142,7 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): torch.distributed.init_process_group("nccl") nprocs = dist.get_world_size() assert dist.is_initialized(), "Distributed cluster not initialized" - dataset = PygNodePropPredDataset(name='ogbn-papers100M', - root='/datasets/ogb_datasets') + dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() model = GCN(dataset.num_features, 256, 2, dataset.num_classes) acc = Accuracy(task="multiclass", num_classes=dataset.num_classes) From ede89ac9e5db3e2f2d7552a613d20dcf3a7c7758 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Tue, 12 Mar 2024 14:47:20 +0000 Subject: [PATCH 41/41] update --- examples/multi_gpu/papers100m_gcn_multinode.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 5b00184b26ae..af434b4d2ef7 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -103,10 +103,10 @@ def run(world_size, data, split_idx, model, acc, wall_clock_start): print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") @torch.no_grad() - def eval(loader: NeighborLoader, val_steps: Optional[int] = None): + def test(loader: NeighborLoader, num_steps: Optional[int] = None): model.eval() for j, batch in enumerate(loader): - if val_steps is not None and j >= val_steps: + if num_steps is not None and j >= num_steps: break batch = batch.to(device) out = model(batch.x, batch.edge_index)[:batch.batch_size] @@ -115,14 +115,14 @@ def eval(loader: NeighborLoader, val_steps: Optional[int] = None): acc_sum = acc.compute() return acc_sum - eval_acc = eval(val_loader, val_steps) + eval_acc = test(val_loader, num_steps=val_steps) if rank == 0: print(f"Val Accuracy: {eval_acc:.4f}%", ) acc.reset() dist.barrier() - test_acc = eval(test_loader) + test_acc = test(test_loader) if rank == 0: print(f"Test Accuracy: {test_acc:.4f}%", )