Skip to content

Commit

Permalink
Added an industrial-scale heterogeneous dataset RCDD (#8196)
Browse files Browse the repository at this point in the history
A challenging dataset with 13,806,619 nodes and 157,814,864 edges,
spanning 7 node types and 7 edge types.

---------

Co-authored-by: Akihiro Nitta <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: rusty1s <[email protected]>
  • Loading branch information
4 people authored Oct 17, 2023
1 parent 53bb233 commit ec6264b
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added

- Added the `RCDD` dataset ([#8196](https://github.com/pyg-team/pytorch_geometric/pull/8196))
- Added distributed `GAT + ogbn-products` example targeting XPU device ([#8032](https://github.com/pyg-team/pytorch_geometric/pull/8032))

### Changed
Expand Down
2 changes: 2 additions & 0 deletions torch_geometric/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@
from .amazon_book import AmazonBook
from .hm import HM
from .ose_gvcs import OSE_GVCS
from .rcdd import RCDD

from .fake import FakeDataset, FakeHeteroDataset
from .sbm_dataset import StochasticBlockModelDataset
Expand Down Expand Up @@ -202,6 +203,7 @@
'AmazonBook',
'HM',
'OSE_GVCS',
'RCDD',
]
synthetic_datasets = [
'FakeDataset',
Expand Down
139 changes: 139 additions & 0 deletions torch_geometric/datasets/rcdd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import os
from typing import Callable, List, Optional

import numpy as np
import torch

from torch_geometric.data import (
HeteroData,
InMemoryDataset,
download_url,
extract_zip,
)
from torch_geometric.utils import index_to_mask


class RCDD(InMemoryDataset):
    r"""The risk commodity detection dataset (RCDD) from the
    `"Datasets and Interfaces for Benchmarking Heterogeneous Graph
    Neural Networks" <http://shichuan.org/doc/156.pdf>`_ paper.
    RCDD is an industrial-scale heterogeneous graph dataset based on a
    real risk detection scenario from Alibaba's e-commerce platform.
    It consists of 13,806,619 nodes and 157,814,864 edges across 7 node types
    and 7 edge types, respectively.

    Args:
        root (str): Root directory where the dataset should be saved.
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.HeteroData` object and returns a
            transformed version. The data object will be transformed before
            every access. (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.HeteroData` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
    """
    url = ('https://s3.cn-north-1.amazonaws.com.cn/dgl-data/dataset/'
           'openhgnn/AliRCD_ICDM.zip')

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ) -> None:
        super().__init__(root, transform, pre_transform)
        self.load(self.processed_paths[0], data_cls=HeteroData)

    @property
    def raw_file_names(self) -> List[str]:
        # NOTE: `process` addresses these files by position via
        # `self.raw_paths[i]`, so the order here must not change.
        return [
            'AliRCD_ICDM_nodes.csv',
            'AliRCD_ICDM_edges.csv',
            'AliRCD_ICDM_train_labels.csv',
            'AliRCD_ICDM_test_labels.csv',
        ]

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    def download(self) -> None:
        """Download the zip archive into the raw directory, extract it
        there, and delete the archive afterwards.
        """
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        os.unlink(path)

    @property
    def num_classes(self) -> int:
        return 2

    def process(self) -> None:
        """Build a single :obj:`HeteroData` object from the raw CSV files
        and save it to :obj:`self.processed_paths[0]`.

        Node features, per-type node counts and per-relation edge indices
        are stored under their respective node/edge types. Labels and the
        train/test masks are stored on the :obj:`'item'` node type, with
        unlabeled items receiving the label :obj:`-1`.
        """
        import pandas as pd

        data = HeteroData()

        node_df = pd.read_csv(  # AliRCD_ICDM_nodes.csv:
            self.raw_paths[0],
            header=None,
            names=['node_id', 'node_type', 'node_feat'],
        )
        # Map global node IDs to local ones for each node type.
        # The table is looked up by *global node ID* further below (edge
        # endpoints and label node IDs), so it has to be filled at the
        # positions given by the `node_id` column rather than by the row
        # position in the CSV. The two coincide only when row `i` holds
        # node ID `i`.
        # NOTE(review): assumes node IDs lie in `[0, len(node_df))` --
        # confirm against the raw data if the source file ever changes.
        mapping = torch.empty(len(node_df), dtype=torch.long)
        for node_type in node_df['node_type'].unique():
            mask = (node_df['node_type'] == node_type).values
            node_id = torch.from_numpy(node_df['node_id'].values[mask])
            num_nodes = int(mask.sum())
            mapping[node_id] = torch.arange(num_nodes)
            data[node_type].num_nodes = num_nodes
            # Features are stored as one ':'-separated string per node:
            x = np.vstack([
                np.asarray(f.split(':'), dtype=np.float32)
                for f in node_df['node_feat'][mask]
            ])
            data[node_type].x = torch.from_numpy(x)

        edge_df = pd.read_csv(  # AliRCD_ICDM_edges.csv:
            self.raw_paths[1],
            header=None,
            names=['src_id', 'dst_id', 'src_type', 'dst_type', 'edge_type'],
        )
        for edge_type in edge_df['edge_type'].unique():
            edge_type_df = edge_df[edge_df['edge_type'] == edge_type]
            # The source/destination node types of a relation are taken
            # from its first row:
            src_type = edge_type_df['src_type'].iloc[0]
            dst_type = edge_type_df['dst_type'].iloc[0]
            src = mapping[torch.from_numpy(edge_type_df['src_id'].values)]
            dst = mapping[torch.from_numpy(edge_type_df['dst_id'].values)]
            edge_index = torch.stack([src, dst], dim=0)
            data[src_type, edge_type, dst_type].edge_index = edge_index

        train_df = pd.read_csv(  # AliRCD_ICDM_train_labels.csv:
            self.raw_paths[2],
            header=None,
            names=['node_id', 'label'],
            dtype=int,
        )
        test_df = pd.read_csv(  # AliRCD_ICDM_test_labels.csv:
            self.raw_paths[3],
            header=None,
            sep='\t',  # Unlike the other files, this one is read as TSV.
            names=['node_id', 'label'],
            dtype=int,
        )

        train_idx = mapping[torch.from_numpy(train_df['node_id'].values)]
        test_idx = mapping[torch.from_numpy(test_df['node_id'].values)]

        # Items without a train/test label keep the placeholder `-1`:
        y = torch.full((data['item'].num_nodes, ), -1, dtype=torch.long)
        y[train_idx] = torch.from_numpy(train_df['label'].values)
        y[test_idx] = torch.from_numpy(test_df['label'].values)

        train_mask = index_to_mask(train_idx, data['item'].num_nodes)
        test_mask = index_to_mask(test_idx, data['item'].num_nodes)

        data['item'].y = y
        data['item'].train_mask = train_mask
        data['item'].test_mask = test_mask

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        self.save([data], self.processed_paths[0])

0 comments on commit ec6264b

Please sign in to comment.