Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions examples/cobformer/Data/data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import torch
import torch.nn.functional as F
import numpy as np
import networkx as nx
import metis
from torch_geometric.utils.num_nodes import maybe_num_nodes
from torch_geometric.utils import add_remaining_self_loops
from torch_geometric.utils import scatter


def rand_train_test_idx(label, train_prop=.5, valid_prop=.25, ignore_negative=True):
    """Randomly partition the entries of ``label`` into train/valid/test.

    Args:
        label: label tensor; only its first dimension is split.
        train_prop: fraction of candidates assigned to the train split.
        valid_prop: fraction of candidates assigned to the valid split;
            everything left over goes to the test split.
        ignore_negative: when True, entries whose label equals -1 are
            excluded from all three splits.

    Returns:
        When ``ignore_negative`` is True, three boolean masks shaped like
        ``label``. Otherwise three index tensors that together form a
        permutation of ``range(label.shape[0])``.
    """
    # Either the positions of the labelled entries, or the label tensor
    # itself (in which case only its length is used below).
    candidates = torch.where(label != -1)[0] if ignore_negative else label

    total = candidates.shape[0]
    train_end = int(total * train_prop)
    valid_end = train_end + int(total * valid_prop)

    # NumPy's global RNG drives the shuffle, so np.random.seed controls it.
    shuffled = torch.as_tensor(np.random.permutation(total))
    train_pos = shuffled[:train_end]
    valid_pos = shuffled[train_end:valid_end]
    test_pos = shuffled[valid_end:]

    if not ignore_negative:
        return train_pos, valid_pos, test_pos

    masks = []
    for positions in (train_pos, valid_pos, test_pos):
        mask = torch.zeros_like(label, dtype=torch.bool)
        mask[candidates[positions]] = True
        masks.append(mask)

    train_mask, valid_mask, test_mask = masks
    return train_mask, valid_mask, test_mask


def load_fixed_splits(data_dir, dataset, name, protocol):
    """Load the predefined train/valid/test masks for a dataset.

    Args:
        data_dir: directory holding the ``*_split_50_25_<i>.npz`` files
            (only read for 'film' and 'deezer').
        dataset: object exposing ``train_mask``, ``valid_mask`` and
            ``test_mask`` (only read for the 'semi' protocol datasets).
        name: dataset name.
        protocol: split protocol; 'semi' selects the masks stored on
            ``dataset``.

    Returns:
        A list of dicts with keys 'train', 'valid' and 'test'. Every mask
        has one extra trailing zero-padded entry.

    Raises:
        NotImplementedError: if no fixed split is known for the
            name/protocol combination.
    """
    def _pad_all(split):
        # One extra trailing slot per mask — presumably for the padding
        # node appended elsewhere; confirm against the caller.
        return {key: F.pad(mask, [0, 1]) for key, mask in split.items()}

    all_splits = []
    if protocol == 'semi' and name in ['Cora', 'CiteSeer', 'PubMed', 'ogbn-arxiv', 'ogbn-products']:
        stored = {
            'train': torch.as_tensor(dataset.train_mask),
            'valid': torch.as_tensor(dataset.valid_mask),
            'test': torch.as_tensor(dataset.test_mask),
        }
        all_splits.append(_pad_all(stored))
    elif name in ['film', 'deezer']:
        # Ten pre-generated splits are stored on disk, one file each.
        for run in range(10):
            npz_path = '{}/{}'.format(data_dir, name) + '_split_50_25_' + str(run) + '.npz'
            with np.load(npz_path) as npz:
                loaded = {
                    'train': torch.BoolTensor(npz['train_mask']),
                    'valid': torch.BoolTensor(npz['val_mask']),
                    'test': torch.BoolTensor(npz['test_mask']),
                }
            all_splits.append(_pad_all(loaded))
    else:
        raise NotImplementedError

    return all_splits


def class_rand_splits(label, label_num_per_class, valid_num=500, test_num=1000):
    """Split nodes so that every class contributes equally to training.

    For each distinct label value, ``label_num_per_class`` randomly chosen
    nodes go to the train split. The remaining nodes of all classes are
    shuffled together; the first ``valid_num`` become the valid split and
    the next ``test_num`` the test split.

    Args:
        label: label tensor; it is squeezed before classes are compared.
        label_num_per_class: number of training nodes drawn per class
            (fewer if the class is smaller).
        valid_num: maximum number of validation nodes.
        test_num: maximum number of test nodes.

    Returns:
        ``(train_mask, valid_mask, test_mask)``, boolean tensors shaped
        like ``label``.
    """
    flat_label = label.squeeze()
    idx = torch.arange(label.shape[0])

    train_parts, rest_parts = [], []
    for class_value in flat_label.unique():
        class_idx = idx[flat_label == class_value]
        shuffled = class_idx[torch.randperm(class_idx.shape[0])]
        train_parts.append(shuffled[:label_num_per_class])
        rest_parts.append(shuffled[label_num_per_class:])

    # Concatenating index tensors keeps the dtype int64 even when a part
    # is empty, so the mask assignments below stay valid.
    no_nodes = idx[:0]
    train_idx = torch.cat(train_parts) if train_parts else no_nodes
    non_train_idx = torch.cat(rest_parts) if rest_parts else no_nodes

    non_train_idx = non_train_idx[torch.randperm(non_train_idx.shape[0])]
    valid_idx = non_train_idx[:valid_num]
    test_idx = non_train_idx[valid_num:valid_num + test_num]

    train_mask = torch.zeros_like(label, dtype=torch.bool)
    train_mask[train_idx] = True
    valid_mask = torch.zeros_like(label, dtype=torch.bool)
    valid_mask[valid_idx] = True
    test_mask = torch.zeros_like(label, dtype=torch.bool)
    test_mask[test_idx] = True

    return train_mask, valid_mask, test_mask


def metis_partition(g, n_patches=50):
    """Group the nodes of ``g`` into ``n_patches`` padded patches.

    Args:
        g: graph dict with 'num_nodes' and, when METIS is used,
            'edge_index' (transposed here into a list of edge pairs).
        n_patches: number of patches to produce.

    Returns:
        A tensor with one row per patch holding that patch's node ids.
        Rows shorter than the largest patch are filled with the value
        ``g['num_nodes']``, an id one past the last real node.
    """
    num_nodes = g['num_nodes']
    if num_nodes < n_patches:
        # Fewer nodes than patches: give every node its own random patch.
        membership = torch.randperm(n_patches)
    else:
        graph = nx.Graph()
        graph.add_nodes_from(np.arange(num_nodes))
        graph.add_edges_from(g['edge_index'].t().tolist())
        _, membership = metis.part_graph(graph, n_patches, recursive=True)

    assert len(membership) >= num_nodes
    # as_tensor accepts both the METIS list and the randperm tensor without
    # the copy-construct warning that torch.tensor(tensor) raises.
    membership = torch.as_tensor(membership)[:num_nodes]

    patch = [torch.where(membership == i)[0].tolist() for i in range(n_patches)]
    max_patch_size = max((len(members) for members in patch), default=-1)

    # Pad every patch to the same width so the result is rectangular.
    for members in patch:
        members += [num_nodes] * (max_patch_size - len(members))

    return torch.tensor(patch)


def patch2batch(g, node_mask):
    """Turn per-patch boolean node masks into a padded tensor of node ids.

    Args:
        g: graph dict; only 'num_nodes' is read.
        node_mask: 2-D mask with one row per patch, indexed over the nodes.

    Returns:
        A tensor with one row per patch listing the selected node ids.
        Short rows are filled with ``g['num_nodes']`` up to the size of the
        largest patch.
    """
    num_nodes = g['num_nodes']
    node_ids = torch.arange(num_nodes)
    widest = node_mask.sum(dim=1).max()

    rows = []
    for mask_row in node_mask:
        members = node_ids[mask_row].tolist()
        shortfall = widest - len(members)
        if shortfall > 0:
            members.extend([num_nodes] * shortfall)
        rows.append(members)

    return torch.tensor(rows)


def norm(edge_index, num_nodes=None, edge_weight=None):
    """Add missing self-loops and symmetrically normalise the edge weights.

    Each edge weight is multiplied by ``deg^-1/2`` of both its endpoints,
    where ``deg`` is the sum of edge weights grouped by the second row of
    ``edge_index``.

    Args:
        edge_index: tensor whose two rows hold the edge endpoints.
        num_nodes: node count; derived from ``edge_index`` when None.
        edge_weight: optional edge weights; when still None after the
            self-loops are added, every edge gets weight 1.

    Returns:
        ``(edge_index, edge_weight)`` including the added self-loops.
    """
    num_nodes = maybe_num_nodes(edge_index, num_nodes)

    edge_index, edge_weight = add_remaining_self_loops(
        edge_index, edge_weight, 1., num_nodes)
    if edge_weight is None:
        edge_weight = torch.ones(
            (edge_index.size(1),), dtype=torch.float, device=edge_index.device)

    src, dst = edge_index[0], edge_index[1]
    degree = scatter(edge_weight, dst, dim=0, dim_size=num_nodes, reduce='sum')

    inv_sqrt = degree.pow(-0.5)
    # A zero degree yields inf; zero it so those edges get weight 0.
    inv_sqrt = inv_sqrt.masked_fill(inv_sqrt == float('inf'), 0)

    normalised = inv_sqrt[src] * edge_weight * inv_sqrt[dst]
    return edge_index, normalised
124 changes: 124 additions & 0 deletions examples/cobformer/Data/get_batch_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import torch
import torch.nn.functional as F
import numpy as np
import torch_geometric
import networkx as nx
import metis
from torch_geometric.datasets import Planetoid
from torch_geometric.utils.num_nodes import maybe_num_nodes
from torch_geometric.utils import add_remaining_self_loops, to_undirected, remove_self_loops, add_self_loops
from torch_geometric.utils import scatter
import torch_geometric.transforms as T
from infomap import Infomap
from Data.data_utils import *
from ogb.nodeproppred import NodePropPredDataset
import scipy
import scipy.io
import scipy.sparse as sp
import os


class NCDataset(object):
    """Container for a single-graph node-classification dataset."""

    def __init__(self, name):
        """
        based off of ogb NodePropPredDataset
        https://github.com/snap-stanford/ogb/blob/master/ogb/nodeproppred/dataset.py
        Gives torch tensors instead of numpy arrays
            - name (str): name of the dataset

        The instance starts empty; the loader is expected to fill in
        ``graph`` and ``label`` after construction.

        Usage after construction:

        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
        graph, label = dataset[0]

        Where the graph is a dictionary of the following form:
        dataset.graph = {'edge_index': edge_index,
                         'edge_feat': None,
                         'node_feat': node_feat,
                         'num_nodes': num_nodes}
        For additional documentation, see OGB Library-Agnostic Loader https://ogb.stanford.edu/docs/nodeprop/

        """

        self.name = name  # original name, e.g., ogbn-proteins
        self.graph = {}
        self.label = None

    @staticmethod
    def _to_padded_mask(split, num_nodes):
        """Return ``split`` as a boolean mask with one extra trailing slot.

        Index tensors are first converted to a mask over ``num_nodes``;
        zero-padding raw indices would silently add node 0 to the split.
        """
        if split.dtype != torch.bool:
            mask = torch.zeros(num_nodes, dtype=torch.bool, device=split.device)
            mask[split] = True
            split = mask
        return F.pad(split, [0, 1])

    def get_idx_split(self, split_type='random', train_prop=.5, valid_prop=.25, label_num_per_class=20):
        """
        split_type: 'random' for random splitting, 'class' for splitting with equal node num per class
        train_prop: The proportion of dataset for train split. Between 0 and 1.
        valid_prop: The proportion of dataset for validation split. Between 0 and 1.
        label_num_per_class: num of nodes per class

        Returns a dict with keys 'train', 'valid' and 'test' holding boolean
        masks that each carry one extra trailing False entry (matching the
        node that partition_patch appends).

        Raises ValueError for an unknown split_type.
        """
        if split_type == 'random':
            ignore_negative = self.name != 'ogbn-proteins'
            splits = rand_train_test_idx(
                self.label, train_prop=train_prop, valid_prop=valid_prop, ignore_negative=ignore_negative)
        elif split_type == 'class':
            splits = class_rand_splits(self.label, label_num_per_class=label_num_per_class)
        else:
            raise ValueError(
                "split_type must be 'random' or 'class', got {!r}".format(split_type))

        num_nodes = self.label.shape[0]
        train_mask, valid_mask, test_mask = (
            self._to_padded_mask(split, num_nodes) for split in splits)
        return {'train': train_mask,
                'valid': valid_mask,
                'test': test_mask}

    def partition_patch(self, n_patches):
        """Partition the graph into patches and append one padding node.

        Returns the tensor of per-patch node ids. As a side effect,
        'num_nodes' is incremented and 'node_feat' and ``label`` gain one
        zero-padded trailing entry, so the padding id used inside the
        patches refers to a real row. Each call pads again.
        """
        patch = metis_partition(g=self.graph, n_patches=n_patches)
        # metis_partition already yields padded node ids. Only a boolean
        # membership mask needs converting; running patch2batch on ids
        # would use them as indices and over-pad or go out of range.
        if patch.dtype == torch.bool:
            patch = patch2batch(self.graph, patch)
        self.graph['num_nodes'] += 1
        self.graph['node_feat'] = F.pad(self.graph['node_feat'], [0, 0, 0, 1])
        self.label = F.pad(self.label, [0, 1])
        return patch

    def __getitem__(self, idx):
        """Return ``(graph, label)``; index 0 is the only valid index."""
        assert idx == 0, 'This dataset has only one graph'
        return self.graph, self.label

    def __len__(self):
        """The dataset always holds exactly one graph."""
        return 1

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, len(self))


def get_data_batch(path, name, batch_size=100000):
    """Load a dataset that is processed in batches.

    Args:
        path: root data directory handed to the loader.
        name: dataset name; only 'ogbn-products' is supported.
        batch_size: forwarded unchanged to the loader.

    Returns:
        The dataset object built by ``load_ogb_dataset``.

    Raises:
        NotImplementedError: if ``name`` is not a supported dataset.
    """
    # Compare for equality: `name in ('ogbn-products')` is a substring test
    # against a plain string and also accepts '', 'ogbn', 'products', ...
    if name == 'ogbn-products':
        return load_ogb_dataset(path, name, batch_size)

    raise NotImplementedError(f'Unsupported dataset for batched loading: {name!r}')


def load_ogb_dataset(data_dir, name, batch_size):
    """Load an OGB node-property dataset into an ``NCDataset``.

    Args:
        data_dir: root data directory; OGB files live under ``<data_dir>/ogb``.
        name: OGB dataset name.
        batch_size: accepted for interface compatibility; not used here.

    Returns:
        An ``NCDataset`` whose ``graph`` holds the OGB graph dict (with
        'edge_index' made undirected and 'edge_index'/'node_feat' as torch
        tensors), whose ``label`` holds the labels with the trailing
        dimension squeezed, and which carries the official split as the
        boolean ``train_mask``, ``valid_mask`` and ``test_mask`` attributes.
    """
    ogb_dataset = NodePropPredDataset(name=name, root=f'{data_dir}/ogb')

    graph = ogb_dataset.graph
    graph['edge_index'] = to_undirected(torch.as_tensor(graph['edge_index']))
    graph['node_feat'] = torch.as_tensor(graph['node_feat'])

    label = torch.as_tensor(ogb_dataset.labels).squeeze(-1)

    # Store everything on the returned object; previously the loaded data
    # was dropped and an empty dataset came back.
    dataset = NCDataset(name)
    dataset.graph = graph
    dataset.label = label

    split_idx = ogb_dataset.get_idx_split()
    for split in ('train', 'valid', 'test'):
        mask = torch.zeros_like(label, dtype=torch.bool)
        mask[torch.as_tensor(split_idx[split])] = True
        # Attribute names match what load_fixed_splits reads.
        setattr(dataset, f'{split}_mask', mask)

    return dataset
Loading