Improvements to multinode papers100m default hyperparams and adding eval on all ranks #8823

Merged: 61 commits from improve-multinode-example into master, Mar 12, 2024
Changes from 43 commits

Commits (61)
650542a
Improvements to multinode papers100m default hyperparams and adding e…
puririshi98 Jan 25, 2024
98ba40e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 25, 2024
deec969
Update CHANGELOG.md
puririshi98 Jan 25, 2024
ab0fad4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 25, 2024
729a524
cleanup
puririshi98 Jan 26, 2024
a152835
fixing
puririshi98 Jan 26, 2024
281de73
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2024
acce4de
graphsage
puririshi98 Jan 26, 2024
07c5781
back to GCN
puririshi98 Jan 26, 2024
2d37cc7
specify download location
puririshi98 Jan 26, 2024
d2c851b
better hyperparams
puririshi98 Jan 29, 2024
f39c0cc
Merge branch 'master' into improve-multinode-example
puririshi98 Jan 29, 2024
f167890
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 29, 2024
7bc1ecc
adding cuda sync
puririshi98 Jan 29, 2024
87cd881
cuda sync
puririshi98 Jan 29, 2024
0ffdccd
new hyperparams
puririshi98 Jan 30, 2024
60b6db4
cuda syncs for timing
puririshi98 Jan 31, 2024
f1894a5
better timing
puririshi98 Jan 31, 2024
2e60980
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2024
b23f8a6
clean up
puririshi98 Feb 5, 2024
42dcac0
cleaning
puririshi98 Feb 5, 2024
907af56
better timing
puririshi98 Feb 5, 2024
de555d5
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 6, 2024
db9c123
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 6, 2024
9e776eb
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 7, 2024
d98352c
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 8, 2024
2868327
fix
puririshi98 Feb 15, 2024
9c90504
fix for eval
puririshi98 Feb 16, 2024
69c942a
fixing copypaste from SNMG
puririshi98 Feb 16, 2024
4650c91
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 16, 2024
e5bbd30
final cleanup, its running well now
puririshi98 Feb 16, 2024
82cc227
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 20, 2024
c1ea4d6
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 21, 2024
10939f3
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 21, 2024
8aec67c
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 23, 2024
303338d
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 26, 2024
7b329f3
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 27, 2024
469295c
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 28, 2024
054eb5c
Merge branch 'master' into improve-multinode-example
puririshi98 Feb 29, 2024
983208d
Merge branch 'master' into improve-multinode-example
puririshi98 Mar 1, 2024
1228066
Merge branch 'master' into improve-multinode-example
puririshi98 Mar 1, 2024
b42d29a
Merge branch 'master' into improve-multinode-example
puririshi98 Mar 4, 2024
e9fdad1
Merge branch 'master' into improve-multinode-example
puririshi98 Mar 5, 2024
23179c3
using acc.compute to align with mag240m and single node papers100m ex…
puririshi98 Mar 7, 2024
a36248f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 7, 2024
e0b2301
Update papers100m_gcn_multinode.py
puririshi98 Mar 7, 2024
b7fe928
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 7, 2024
7c38068
cleaning
puririshi98 Mar 7, 2024
8eaa4b9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 7, 2024
55b4f71
Update papers100m_gcn_multinode.py
puririshi98 Mar 7, 2024
3e1d879
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 7, 2024
a994b62
Update papers100m_gcn_multinode.py
puririshi98 Mar 7, 2024
3a91d03
Update papers100m_gcn_multinode.py
puririshi98 Mar 7, 2024
8ebf762
Update papers100m_gcn_multinode.py
puririshi98 Mar 8, 2024
e10a6ee
Update papers100m_gcn_multinode.py
puririshi98 Mar 8, 2024
3b43632
Merge branch 'master' into improve-multinode-example
puririshi98 Mar 8, 2024
ad4497c
Merge branch 'master' into improve-multinode-example
puririshi98 Mar 11, 2024
8723521
update
rusty1s Mar 12, 2024
eff9ef6
update
rusty1s Mar 12, 2024
ede89ac
update
rusty1s Mar 12, 2024
e518abc
Merge branch 'master' into improve-multinode-example
rusty1s Mar 12, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added

- Improvements to multinode papers100m default hyperparams and adding eval on all ranks ([#8823](https://github.com/pyg-team/pytorch_geometric/pull/8823))
- Added support for `EdgeIndex` in `MessagePassing` ([#9007](https://github.com/pyg-team/pytorch_geometric/pull/9007))
- Added support for `torch.compile` in combination with `EdgeIndex` ([#9007](https://github.com/pyg-team/pytorch_geometric/pull/9007))
- Added a `ogbn-mag240m` example ([#8249](https://github.com/pyg-team/pytorch_geometric/pull/8249))
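The "adding eval on all ranks" item above replaces a rank-0-only evaluation loop with per-rank partial results that are combined via torch.distributed.all_reduce, as shown in the file diff below. A minimal sketch of that aggregation pattern, assuming the process group is already initialized and each rank has computed its own partial sums (the helper name is illustrative, not part of the PR):

    import torch
    import torch.distributed as dist

    def all_ranks_accuracy(local_acc_sum: float, local_num_batches: int,
                           device: torch.device) -> float:
        # Each rank contributes its own partial sums; the all-reduces make
        # the global totals identical on every rank.
        acc_sum = torch.tensor(local_acc_sum, dtype=torch.float32, device=device)
        num_batches = torch.tensor(float(local_num_batches), dtype=torch.float32,
                                   device=device)
        dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM)
        dist.all_reduce(num_batches, op=dist.ReduceOp.SUM)
        return (acc_sum / num_batches).item()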
150 changes: 84 additions & 66 deletions examples/multi_gpu/papers100m_gcn_multinode.py
@@ -1,6 +1,6 @@
"""Multi-node multi-GPU example on ogbn-papers100m.

To run:
Example way to run using srun:
srun -l -N<num_nodes> --ntasks-per-node=<ngpu_per_node> \
--container-name=cont --container-image=<image_url> \
--container-mounts=/ogb-papers100m/:/workspace/dataset
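Each task launched by the srun template above becomes one training process. Inside the script, every process reads its LOCAL_RANK from the environment (the launcher, or a wrapper such as torchrun, is assumed to export it), pins the matching GPU, and joins the NCCL process group. A minimal sketch of that per-process setup, with an illustrative helper name; the example itself does the equivalent in run() and __main__:

    import os

    import torch
    import torch.distributed as dist

    def setup_distributed() -> int:
        # LOCAL_RANK is assumed to be exported for every process by the launcher.
        local_id = int(os.environ['LOCAL_RANK'])
        torch.cuda.set_device(local_id)  # one GPU per process
        dist.init_process_group('nccl')  # rendezvous across all nodes
        return local_id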
@@ -14,9 +14,10 @@
import torch.nn.functional as F
from ogb.nodeproppred import PygNodePropPredDataset
from torch.nn.parallel import DistributedDataParallel
from torchmetrics import Accuracy

from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import GCNConv
from torch_geometric.nn.models import GCN


def get_num_workers() -> int:
@@ -31,21 +32,7 @@ def get_num_workers() -> int:
return num_workers


class GCN(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels):
super().__init__()
self.conv1 = GCNConv(in_channels, hidden_channels)
self.conv2 = GCNConv(hidden_channels, out_channels)

def forward(self, x, edge_index):
x = F.dropout(x, p=0.5, training=self.training)
x = self.conv1(x, edge_index).relu()
x = F.dropout(x, p=0.5, training=self.training)
x = self.conv2(x, edge_index)
return x


def run(world_size, data, split_idx, model):
def run(world_size, data, split_idx, model, acc, wall_clock_start):
local_id = int(os.environ['LOCAL_RANK'])
rank = torch.distributed.get_rank()
torch.cuda.set_device(local_id)
@@ -54,97 +41,128 @@ def run(world_size, data, split_idx, model):
print(f'Using {nprocs} GPUs...')

split_idx['train'] = split_idx['train'].split(
split_idx['train'].size(0) // world_size,
dim=0,
)[rank].clone()
split_idx['train'].size(0) // world_size, dim=0)[rank].clone()
split_idx['valid'] = split_idx['valid'].split(
split_idx['valid'].size(0) // world_size, dim=0)[rank].clone()
split_idx['test'] = split_idx['test'].split(
split_idx['test'].size(0) // world_size, dim=0)[rank].clone()

model = DistributedDataParallel(model.to(device), device_ids=[local_id])
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001,
weight_decay=5e-4)

kwargs = dict(
data=data,
batch_size=128,
batch_size=1024,
num_workers=get_num_workers(),
num_neighbors=[50, 50],
num_neighbors=[30, 30],
)

train_loader = NeighborLoader(
input_nodes=split_idx['train'],
shuffle=True,
**kwargs,
)
if rank == 0:
val_loader = NeighborLoader(input_nodes=split_idx['valid'], **kwargs)
test_loader = NeighborLoader(input_nodes=split_idx['test'], **kwargs)
val_loader = NeighborLoader(input_nodes=split_idx['valid'], **kwargs)
test_loader = NeighborLoader(input_nodes=split_idx['test'], **kwargs)

val_steps = 1000
warmup_steps = 100
acc = acc.to(device)
dist.barrier()
torch.cuda.synchronize()
if rank == 0:
prep_time = round(time.perf_counter() - wall_clock_start, 2)
print("Total time before training begins (prep_time)=", prep_time,
"seconds")
print("Beginning training...")

for epoch in range(1, 4):
for epoch in range(1, 21):
model.train()
for i, batch in enumerate(train_loader):
if i == warmup_steps:
torch.cuda.synchronize()
start = time.time()
batch = batch.to(device)
batch_size = batch.batch_size
optimizer.zero_grad()
y = batch.y[:batch.batch_size].view(-1).to(torch.long)
out = model(batch.x, batch.edge_index)[:batch.batch_size]
y = batch.y[:batch_size].view(-1).to(torch.long)
out = model(batch.x, batch.edge_index)[:batch_size]
loss = F.cross_entropy(out, y)
loss.backward()
optimizer.step()

if rank == 0 and i % 10 == 0:
print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}')

dist.barrier()
torch.cuda.synchronize()
num_batches = i + 1.0
if rank == 0:
sec_per_iter = (time.time() - start) / (i - warmup_steps)
sec_per_iter = (time.time() - start) / (num_batches - warmup_steps)
print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter")

model.eval()
total_correct = total_examples = 0
for i, batch in enumerate(val_loader):
if i >= val_steps:
break
if i == warmup_steps:
start = time.time()

batch = batch.to(device)
with torch.no_grad():
out = model(batch.x, batch.edge_index)[:batch.batch_size]
pred = out.argmax(dim=-1)
y = batch.y[:batch.batch_size].view(-1).to(torch.long)

total_correct += int((pred == y).sum())
total_examples += y.size(0)

print(f"Val Acc: {total_correct / total_examples:.4f}")
sec_per_iter = (time.time() - start) / (i - warmup_steps)
print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter")

if rank == 0:
model.eval()
total_correct = total_examples = 0
for i, batch in enumerate(test_loader):
acc_sum = 0.0
for i, batch in enumerate(val_loader):
if i >= val_steps:
break
if i == warmup_steps:
torch.cuda.synchronize()
start = time.time()

batch = batch.to(device)
batch_size = batch.batch_size
with torch.no_grad():
out = model(batch.x, batch.edge_index)[:batch.batch_size]
pred = out.argmax(dim=-1)
y = batch.y[:batch.batch_size].view(-1).to(torch.long)

total_correct += int((pred == y).sum())
total_examples += y.size(0)
print(f"Test Acc: {total_correct / total_examples:.4f}")
out = model(batch.x, batch.edge_index)[:batch_size]
acc_sum += acc(out[:batch_size].softmax(dim=-1),
batch.y[:batch_size])
acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32,
device=device)
dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM)
num_batches = torch.tensor(float(i + 1), dtype=torch.float32,
device=acc_sum.device)
dist.all_reduce(num_batches, op=dist.ReduceOp.SUM)
torch.cuda.synchronize()
if rank == 0:
print(
f"Validation Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", )
sec_per_iter = (time.time() - start) / (num_batches - warmup_steps)
print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter")
dist.barrier()

model.eval()
acc_sum = 0.0
for i, batch in enumerate(test_loader):
batch = batch.to(device)
batch_size = batch.batch_size
with torch.no_grad():
out = model(batch.x, batch.edge_index)[:batch_size]
acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size])
acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=device)
dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM)
num_batches = torch.tensor(float(i + 1), dtype=torch.float32,
device=acc_sum.device)
dist.all_reduce(num_batches, op=dist.ReduceOp.SUM)
if rank == 0:
print(f"Test Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", )
dist.barrier()
if rank == 0:
total_time = round(time.perf_counter() - wall_clock_start, 2)
print("Total Program Runtime (total_time) =", total_time, "seconds")
print("total_time - prep_time =", total_time - prep_time, "seconds")


if __name__ == '__main__':
wall_clock_start = time.perf_counter()
# Setup multi-node:
torch.distributed.init_process_group("nccl")
nprocs = dist.get_world_size()
assert dist.is_initialized(), "Distributed cluster not initialized"
dataset = PygNodePropPredDataset(name='ogbn-papers100M')
dataset = PygNodePropPredDataset(name='ogbn-papers100M',
root='/datasets/ogb_datasets')
split_idx = dataset.get_idx_split()
model = GCN(dataset.num_features, 64, dataset.num_classes)

run(nprocs, dataset[0], split_idx, model)
model = GCN(dataset.num_features, 256, 2, dataset.num_classes)
acc = Accuracy(task="multiclass", num_classes=dataset.num_classes)
data = dataset[0]
data.y = data.y.reshape(-1)
run(nprocs, data, split_idx, model, acc, wall_clock_start)
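For reference, the torchmetrics Accuracy object passed into run() accumulates state across calls; a later commit in this PR switches the reporting to acc.compute(), which reads out that accumulated state. A small standalone sketch of both usages, with an illustrative batch size and class count (the example itself uses dataset.num_classes):

    import torch
    from torchmetrics import Accuracy

    num_classes = 10  # illustrative only
    acc = Accuracy(task='multiclass', num_classes=num_classes)

    logits = torch.randn(8, num_classes)       # model outputs for one batch
    target = torch.randint(num_classes, (8,))  # ground-truth labels

    batch_acc = acc(logits.softmax(dim=-1), target)  # per-batch accuracy; also updates state
    overall = acc.compute()                          # accuracy over all batches seen so far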