From ec6264bd9cd39f8f47abcad1a899b53fa92ae6d6 Mon Sep 17 00:00:00 2001
From: Jintang Li
Date: Tue, 17 Oct 2023 07:01:44 -0500
Subject: [PATCH] Added an industrial-scale heterogeneous dataset `RCDD` (#8196)

A challenging dataset with 13,806,619 nodes, 157,814,864 edges,
consisting of 7 node types and 7 edge types.

---------

Co-authored-by: Akihiro Nitta
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: rusty1s
---
 CHANGELOG.md                         |   1 +
 torch_geometric/datasets/__init__.py |   2 +
 torch_geometric/datasets/rcdd.py     | 144 +++++++++++++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 torch_geometric/datasets/rcdd.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ce5e7da310f..b6fa753cc3d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added the `RCDD` dataset ([#8196](https://github.com/pyg-team/pytorch_geometric/pull/8196))
 - Added distributed `GAT + ogbn-products` example targeting XPU device ([#8032](https://github.com/pyg-team/pytorch_geometric/pull/8032))
 
 ### Changed
diff --git a/torch_geometric/datasets/__init__.py b/torch_geometric/datasets/__init__.py
index 0387aab31e4e..b95de825472c 100644
--- a/torch_geometric/datasets/__init__.py
+++ b/torch_geometric/datasets/__init__.py
@@ -92,6 +92,7 @@
 from .amazon_book import AmazonBook
 from .hm import HM
 from .ose_gvcs import OSE_GVCS
+from .rcdd import RCDD
 
 from .fake import FakeDataset, FakeHeteroDataset
 from .sbm_dataset import StochasticBlockModelDataset
@@ -202,6 +203,7 @@
     'AmazonBook',
     'HM',
     'OSE_GVCS',
+    'RCDD',
 ]
 synthetic_datasets = [
     'FakeDataset',
diff --git a/torch_geometric/datasets/rcdd.py b/torch_geometric/datasets/rcdd.py
new file mode 100644
index 000000000000..377cd71b7307
--- /dev/null
+++ b/torch_geometric/datasets/rcdd.py
@@ -0,0 +1,144 @@
+import os
+from typing import Callable, List, Optional
+
+import numpy as np
+import torch
+
+from torch_geometric.data import (
+    HeteroData,
+    InMemoryDataset,
+    download_url,
+    extract_zip,
+)
+from torch_geometric.utils import index_to_mask
+
+
+class RCDD(InMemoryDataset):
+    r"""The risk commodity detection dataset (RCDD) from the
+    "Datasets and Interfaces for Benchmarking Heterogeneous Graph
+    Neural Networks" paper.
+    RCDD is an industrial-scale heterogeneous graph dataset based on a
+    real risk detection scenario from Alibaba's e-commerce platform.
+    It consists of 13,806,619 nodes and 157,814,864 edges across 7 node types
+    and 7 edge types, respectively.
+
+    Args:
+        root (str): Root directory where the dataset should be saved.
+        transform (callable, optional): A function/transform that takes in an
+            :obj:`torch_geometric.data.HeteroData` object and returns a
+            transformed version. The data object will be transformed before
+            every access. (default: :obj:`None`)
+        pre_transform (callable, optional): A function/transform that takes in
+            an :obj:`torch_geometric.data.HeteroData` object and returns a
+            transformed version. The data object will be transformed before
+            being saved to disk. (default: :obj:`None`)
+    """
+    url = ('https://s3.cn-north-1.amazonaws.com.cn/dgl-data/dataset/'
+           'openhgnn/AliRCD_ICDM.zip')
+
+    def __init__(
+        self,
+        root: str,
+        transform: Optional[Callable] = None,
+        pre_transform: Optional[Callable] = None,
+    ) -> None:
+        super().__init__(root, transform, pre_transform)
+        self.load(self.processed_paths[0], data_cls=HeteroData)
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return [
+            'AliRCD_ICDM_nodes.csv',
+            'AliRCD_ICDM_edges.csv',
+            'AliRCD_ICDM_train_labels.csv',
+            'AliRCD_ICDM_test_labels.csv',
+        ]
+
+    @property
+    def processed_file_names(self) -> str:
+        return 'data.pt'
+
+    def download(self) -> None:
+        """Download the zip archive, extract it and delete the archive."""
+        path = download_url(self.url, self.raw_dir)
+        extract_zip(path, self.raw_dir)
+        os.unlink(path)
+
+    @property
+    def num_classes(self) -> int:
+        return 2
+
+    def process(self) -> None:
+        """Build the :class:`HeteroData` graph from the raw CSV files."""
+        import pandas as pd
+
+        data = HeteroData()
+
+        node_df = pd.read_csv(  # AliRCD_ICDM_nodes.csv:
+            self.raw_paths[0],
+            header=None,
+            names=['node_id', 'node_type', 'node_feat'],
+        )
+        # Map global node IDs to local ones for each node type. The table is
+        # indexed by the `node_id` column rather than by the row position, so
+        # the lookups below do not depend on the row order of the CSV file:
+        mapping = torch.empty(len(node_df), dtype=torch.long)
+        for node_type in node_df['node_type'].unique():
+            mask = node_df['node_type'] == node_type
+            node_id = torch.from_numpy(node_df['node_id'][mask].values)
+            num_nodes = int(mask.sum())
+            mapping[node_id] = torch.arange(num_nodes)
+            data[node_type].num_nodes = num_nodes
+            x = np.vstack([
+                np.asarray(f.split(':'), dtype=np.float32)
+                for f in node_df['node_feat'][mask]
+            ])
+            data[node_type].x = torch.from_numpy(x)
+
+        edge_df = pd.read_csv(  # AliRCD_ICDM_edges.csv:
+            self.raw_paths[1],
+            header=None,
+            names=['src_id', 'dst_id', 'src_type', 'dst_type', 'edge_type'],
+        )
+        for edge_type in edge_df['edge_type'].unique():
+            edge_type_df = edge_df[edge_df['edge_type'] == edge_type]
+            src_type = edge_type_df['src_type'].iloc[0]
+            dst_type = edge_type_df['dst_type'].iloc[0]
+            src = mapping[torch.from_numpy(edge_type_df['src_id'].values)]
+            dst = mapping[torch.from_numpy(edge_type_df['dst_id'].values)]
+            edge_index = torch.stack([src, dst], dim=0)
+            data[src_type, edge_type, dst_type].edge_index = edge_index
+
+        train_df = pd.read_csv(  # AliRCD_ICDM_train_labels.csv:
+            self.raw_paths[2],
+            header=None,
+            names=['node_id', 'label'],
+            dtype=int,
+        )
+        test_df = pd.read_csv(  # AliRCD_ICDM_test_labels.csv:
+            self.raw_paths[3],
+            header=None,
+            sep='\t',
+            names=['node_id', 'label'],
+            dtype=int,
+        )
+
+        train_idx = mapping[torch.from_numpy(train_df['node_id'].values)]
+        test_idx = mapping[torch.from_numpy(test_df['node_id'].values)]
+
+        # Labels live on `item` nodes; `-1` marks nodes without a label:
+        y = torch.full((data['item'].num_nodes, ), -1, dtype=torch.long)
+        y[train_idx] = torch.from_numpy(train_df['label'].values)
+        y[test_idx] = torch.from_numpy(test_df['label'].values)
+
+        train_mask = index_to_mask(train_idx, data['item'].num_nodes)
+        test_mask = index_to_mask(test_idx, data['item'].num_nodes)
+
+        data['item'].y = y
+        data['item'].train_mask = train_mask
+        data['item'].test_mask = test_mask
+
+        if self.pre_transform is not None:
+            data = self.pre_transform(data)
+
+        self.save([data], self.processed_paths[0])