diff --git a/examples/unimp/readme.md b/examples/unimp/readme.md new file mode 100644 index 000000000..15cf3c0dd --- /dev/null +++ b/examples/unimp/readme.md @@ -0,0 +1,32 @@ +# Unified Message Passing Model (UniMP) + +- Paper link: [https://arxiv.org/abs/2009.03509](https://arxiv.org/abs/2009.03509) + +# Dataset Statistics + +| Dataset | # Nodes | # Edges | # Classes | +|----------|---------|---------|-----------| +| Cora | 2,708 | 10,556 | 7 | +| Citeseer | 3,327 | 9,228 | 6 | +| Pubmed | 19,717 | 88,651 | 3 | + +Refer to [Planetoid](https://gammagl.readthedocs.io/en/latest/api/gammagl.datasets.html#gammagl.datasets.Planetoid). + +Results +------- + +```bash +# available dataset: "cora", "citeseer", "pubmed" +TL_BACKEND="tensorflow" python unimp_trainer.py --dataset cora +TL_BACKEND="tensorflow" python unimp_trainer.py --dataset citeseer +TL_BACKEND="tensorflow" python unimp_trainer.py --dataset pubmed +TL_BACKEND="torch" python unimp_trainer.py --dataset cora +TL_BACKEND="torch" python unimp_trainer.py --dataset citeseer +TL_BACKEND="torch" python unimp_trainer.py --dataset pubmed +``` + +| Dataset | Our(tf) | Our(torch) | +|----------|------------|------------| +| cora | 83.10±1.12 | 82.30±0.67 | +| citeseer | 79.90±0.68 | 78.53±0.18 | +| pubmed | 74.10±1.08 | 73.63±0.12 | diff --git a/examples/unimp/unimp_trainer.py b/examples/unimp/unimp_trainer.py new file mode 100644 index 000000000..d8b071b40 --- /dev/null +++ b/examples/unimp/unimp_trainer.py @@ -0,0 +1,111 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES']='0' +import random +import argparse +import tensorlayerx as tlx +from gammagl.models.unimp import Unimp +from gammagl.datasets import Planetoid +from gammagl.utils import mask_to_index +from tensorlayerx.model import TrainOneStep, WithLoss + +class CrossEntropyLoss(WithLoss): + def __init__(self, model, loss_func): + super(CrossEntropyLoss, self).__init__(model,loss_func) + + def forward(self, data, label): + out = self.backbone_network(data['x'], 
data['edge_index']) + out = tlx.gather(out, data['val_idx']) + label = tlx.reshape(tlx.gather(label, data['val_idx']),shape=(-1,)) + #print(out[0]) + #print(label[0]) + loss = self._loss_fn(out, label) + return loss + + +def calculate_acc(logits, y, metrics): + metrics.update(logits, y) + rst = metrics.result() + metrics.reset() + return rst + +def get_label_mask(label,node,dtype): + mask=[1 for i in range(node['train_node1'])]+[0 for i in range(node['train_node2'])] + random.shuffle(mask) + label_mask=[] + for i in range(node['train_node']): + if mask[i]==0: + label_mask.append([-1]) + else: + label_mask.append([(int)(label[i])]) + label_mask+=[[0] for i in range(node['num_node']-node['train_node'])] + return tlx.ops.convert_to_tensor(label_mask,dtype=dtype) + +def merge_feature_label(label,feature): + return tlx.ops.concat([label,feature],axis=1) + +def main(args): + dataset = Planetoid(root='./',name=args.dataset) + graph=dataset[0] + feature=graph.x + edge_index=graph.edge_index + label=graph.y + train_node=int(graph.num_nodes * 0.3) + train_node1=int(graph.num_nodes * 0.1) + node = { + 'train_node': train_node, + 'train_node1': train_node1, + 'train_node2': train_node-train_node1, + 'num_node': graph.num_nodes + } + val_mask = tlx.ops.concat( + [tlx.ops.zeros((train_node, 1),dtype=tlx.int32), + tlx.ops.ones((train_node-train_node1, 1),dtype=tlx.int32)],axis=0) + test_mask=graph.test_mask + model=Unimp(dataset) + loss = tlx.losses.softmax_cross_entropy_with_logits + optimizer = tlx.optimizers.Adam(lr=0.01, weight_decay=5e-4) + train_weights = model.trainable_weights + loss_func = CrossEntropyLoss(model, loss) + train_one_step = TrainOneStep(loss_func, optimizer, train_weights) + val_idx = mask_to_index(val_mask) + test_idx = mask_to_index(test_mask) + metrics = tlx.metrics.Accuracy() + data = { + "x": feature, + "y": label, + "edge_index": edge_index, + "val_idx":val_idx, + "test_idx": test_idx, + "num_nodes": graph.num_nodes, + } + + epochs=args.epochs + 
best_val_acc=0 + for epoch in range(epochs): + model.set_train() + label_mask=get_label_mask(label,node,feature[0].dtype) + data['x']=merge_feature_label(label_mask,feature) + train_loss = train_one_step(data, graph.y) + + model.set_eval() + logits = model(data['x'], data['edge_index']) + test_logits = tlx.gather(logits, data['test_idx']) + test_y = tlx.gather(data['y'], data['test_idx']) + test_acc = calculate_acc(test_logits, test_y, metrics) + + print("Epoch [{:0>3d}] ".format(epoch + 1) + + " train loss: {:.4f}".format(train_loss.item()) + + " val acc: {:.4f}".format(test_acc)) + + # save best model on evaluation set + if test_acc > best_val_acc: + best_val_acc = test_acc + model.save_weights('./'+ 'unimp' + ".npz", format='npz_dict') + print("The Best ACC : {:.4f}".format(best_val_acc)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--epochs", type=int, default=200, help="number of epoch") + parser.add_argument('--dataset', type=str, default='cora', help='dataset') + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/gammagl/datasets/OgbGraphData.csv b/gammagl/datasets/OgbGraphData.csv new file mode 100644 index 000000000..467d0626b --- /dev/null +++ b/gammagl/datasets/OgbGraphData.csv @@ -0,0 +1,16 @@ +,ogbg-molbace,ogbg-molbbbp,ogbg-molclintox,ogbg-molmuv,ogbg-molpcba,ogbg-molsider,ogbg-moltox21,ogbg-moltoxcast,ogbg-molhiv,ogbg-molesol,ogbg-molfreesolv,ogbg-mollipo,ogbg-molchembl,ogbg-ppa,ogbg-code2 +num tasks,1,1,2,17,128,27,12,617,1,1,1,1,1310,1,1 +eval metric,rocauc,rocauc,rocauc,ap,ap,rocauc,rocauc,rocauc,rocauc,rmse,rmse,rmse,rocauc,acc,F1 +download_name,bace,bbbp,clintox,muv,pcba,sider,tox21,toxcast,hiv,esol,freesolv,lipophilicity,chembl,ogbg_ppi_medium,code2 +version,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 
+url,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/bace.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/bbbp.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/clintox.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/muv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/pcba.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/sider.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/tox21.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/toxcast.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/esol.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/freesolv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/lipophilicity.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/chembl.zip,http://snap.stanford.edu/ogb/data/graphproppred/ogbg_ppi_medium.zip,http://snap.stanford.edu/ogb/data/graphproppred/code2.zip +add_inverse_edge,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False +data type,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,, +has_node_attr,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True +has_edge_attr,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False +task type,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,regression,regression,regression,binary classification,multiclass classification,subtoken prediction +num classes,2,2,2,2,2,2,2,2,2,-1,-1,-1,2,37,-1 +split,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,species,project +additional node 
files,None,None,None,None,None,None,None,None,None,None,None,None,None,None,"node_is_attributed,node_dfs_order,node_depth" +additional edge files,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None +binary,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False diff --git a/gammagl/datasets/OgbLinkData.csv b/gammagl/datasets/OgbLinkData.csv new file mode 100644 index 000000000..025971839 --- /dev/null +++ b/gammagl/datasets/OgbLinkData.csv @@ -0,0 +1,14 @@ +,ogbl-ppa,ogbl-collab,ogbl-citation2,ogbl-wikikg2,ogbl-ddi,ogbl-biokg,ogbl-vessel +eval metric,hits@100,hits@50,mrr,mrr,hits@20,mrr,rocauc +task type,link prediction,link prediction,link prediction,KG completion,link prediction,KG completion,link prediction +download_name,ppassoc,collab,citation-v2,wikikg-v2,ddi,biokg,vessel +version,1,1,1,1,1,1,1 +url,http://snap.stanford.edu/ogb/data/linkproppred/ppassoc.zip,http://snap.stanford.edu/ogb/data/linkproppred/collab.zip,http://snap.stanford.edu/ogb/data/linkproppred/citation-v2.zip,http://snap.stanford.edu/ogb/data/linkproppred/wikikg-v2.zip,http://snap.stanford.edu/ogb/data/linkproppred/ddi.zip,http://snap.stanford.edu/ogb/data/linkproppred/biokg.zip,http://snap.stanford.edu/ogb/data/linkproppred/vessel.zip +add_inverse_edge,True,True,False,False,True,False,False +has_node_attr,True,True,True,False,False,False,True +has_edge_attr,False,False,False,False,False,False,True +split,throughput,time,time,time,target,random,spatial +additional node files,None,None,node_year,None,None,None,None +additional edge files,None,"edge_weight,edge_year",None,edge_reltype,None,edge_reltype,None +is hetero,False,False,False,False,False,True,False +binary,False,False,False,False,False,False,True diff --git a/gammagl/datasets/OgbNodeData.csv b/gammagl/datasets/OgbNodeData.csv new file mode 100644 index 000000000..026ded027 --- /dev/null +++ b/gammagl/datasets/OgbNodeData.csv @@ -0,0 +1,16 @@ 
+,ogbn-proteins,ogbn-products,ogbn-arxiv,ogbn-mag,ogbn-papers100M +num tasks,112,1,1,1,1 +num classes,2,47,40,349,172 +eval metric,rocauc,acc,acc,acc,acc +task type,binary classification,multiclass classification,multiclass classification,multiclass classification,multiclass classification +download_name,proteins,products,arxiv,mag,papers100M-bin +version,1,1,1,2,1 +url,http://snap.stanford.edu/ogb/data/nodeproppred/proteins.zip,http://snap.stanford.edu/ogb/data/nodeproppred/products.zip,http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip,http://snap.stanford.edu/ogb/data/nodeproppred/mag.zip,http://snap.stanford.edu/ogb/data/nodeproppred/papers100M-bin.zip +add_inverse_edge,True,True,False,False,False +has_node_attr,False,True,True,True,True +has_edge_attr,True,False,False,False,False +split,species,sales_ranking,time,time,time +additional node files,node_species,None,node_year,node_year,node_year +additional edge files,None,None,None,edge_reltype,None +is hetero,False,False,False,True,False +binary,False,False,False,False,True diff --git a/gammagl/datasets/ogb_graph.py b/gammagl/datasets/ogb_graph.py new file mode 100644 index 000000000..ea6f8f2a8 --- /dev/null +++ b/gammagl/datasets/ogb_graph.py @@ -0,0 +1,156 @@ +import pandas as pd +import shutil, os +import os.path as osp +import numpy as np +from gammagl.data import InMemoryDataset +from gammagl.data.download import download_url +from gammagl.data.extract import extract_zip +from gammagl.io.read_ogb import read_graph + + +class OgbGraphDataset(InMemoryDataset): + def __init__(self, name, root = 'dataset', transform=None, pre_transform = None, meta_dict = None): + ''' + - name (str): name of the dataset + - root (str): root directory to store the dataset folder + - transform, pre_transform (optional): transform/pre-transform graph objects + + - meta_dict: dictionary that stores all the meta-information about data. Default is None, + but when something is passed, it uses its information. 
Useful for debugging for external contributers. + ''' + + self.name = name ## original name, e.g., ogbg-molhiv + + if meta_dict is None: + self.dir_name = '_'.join(name.split('-')) + + # check if previously-downloaded folder exists. + # If so, use that one. + if osp.exists(osp.join(root, self.dir_name + '_gammagl')): + self.dir_name = self.dir_name + '_gammagl' + + self.original_root = root + self.root = osp.join(root, self.dir_name) + + master = pd.read_csv(os.path.join(os.path.dirname(__file__), 'OgbGraphData.csv'), index_col = 0) + if not self.name in master: + error_mssg = 'Invalid dataset name {}.\n'.format(self.name) + error_mssg += 'Available datasets are as follows:\n' + error_mssg += '\n'.join(master.keys()) + raise ValueError(error_mssg) + self.meta_info = master[self.name] + + else: + self.dir_name = meta_dict['dir_path'] + self.original_root = '' + self.root = meta_dict['dir_path'] + self.meta_info = meta_dict + + # check version + # First check whether the dataset has been already downloaded or not. + # If so, check whether the dataset version is the newest or not. + # If the dataset is not the newest version, notify this to the user. + if osp.isdir(self.root) and (not osp.exists(osp.join(self.root, 'RELEASE_v' + str(self.meta_info['version']) + '.txt'))): + print(self.name + ' has been updated.') + if input('Will you update the dataset now? 
(y/N)\n').lower() == 'y': + shutil.rmtree(self.root) + + self.download_name = self.meta_info['download_name'] ## name of downloaded file, e.g., tox21 + + self.num_tasks = int(self.meta_info['num tasks']) + self.eval_metric = self.meta_info['eval metric'] + self.task_type = self.meta_info['task type'] + self.__num_classes__ = int(self.meta_info['num classes']) + self.binary = self.meta_info['binary'] == 'True' + + super(OgbGraphDataset, self).__init__(self.root, transform, pre_transform) + + self.data, self.slices = self.load_data(self.processed_paths[0]) + + def get_idx_split(self, split_type = None): + if split_type is None: + split_type = self.meta_info['split'] + + path = osp.join(self.root, 'split', split_type) + + # short-cut if split_dict.pt exists + if os.path.isfile(os.path.join(path, 'split_dict.pt')): + return self.load_data(os.path.join(path, 'split_dict.pt')) + + train_idx = pd.read_csv(osp.join(path, 'train.csv.gz'), compression='gzip', header = None).values.T[0] + valid_idx = pd.read_csv(osp.join(path, 'valid.csv.gz'), compression='gzip', header = None).values.T[0] + test_idx = pd.read_csv(osp.join(path, 'test.csv.gz'), compression='gzip', header = None).values.T[0] + + return {'train': train_idx, 'valid': valid_idx, 'test': test_idx} + + @property + def num_classes(self): + return self.__num_classes__ + + @property + def raw_file_names(self): + if self.binary: + return ['data.npz'] + else: + file_names = ['edge'] + if self.meta_info['has_node_attr'] == 'True': + file_names.append('node-feat') + if self.meta_info['has_edge_attr'] == 'True': + file_names.append('edge-feat') + return [file_name + '.csv.gz' for file_name in file_names] + + @property + def processed_file_names(self): + return 'geometric_data_processed.pt' + + def download(self): + url = self.meta_info['url'] + path = download_url(url, self.original_root) + extract_zip(path, self.original_root) + os.unlink(path) + shutil.rmtree(self.root) + shutil.move(osp.join(self.original_root, 
self.download_name), self.root) + + + def process(self): + add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True' + + if self.meta_info['additional node files'] == 'None': + additional_node_files = [] + else: + additional_node_files = self.meta_info['additional node files'].split(',') + + if self.meta_info['additional edge files'] == 'None': + additional_edge_files = [] + else: + additional_edge_files = self.meta_info['additional edge files'].split(',') + + data_list = read_graph(self.raw_dir, add_inverse_edge = add_inverse_edge, additional_node_files = additional_node_files, additional_edge_files = additional_edge_files, binary=self.binary) + + if self.task_type == 'subtoken prediction': + graph_label_notparsed = pd.read_csv(osp.join(self.raw_dir, 'graph-label.csv.gz'), compression='gzip', header = None).values + graph_label = [str(graph_label_notparsed[i][0]).split(' ') for i in range(len(graph_label_notparsed))] + + for i, g in enumerate(data_list): + g.y = graph_label[i] + + else: + if self.binary: + graph_label = np.load(osp.join(self.raw_dir, 'graph-label.npz'))['graph_label'] + else: + graph_label = pd.read_csv(osp.join(self.raw_dir, 'graph-label.csv.gz'), compression='gzip', header = None).values + + has_nan = np.isnan(graph_label).any() + + for i, g in enumerate(data_list): + g.y = graph_label[i] + + if self.pre_transform is not None: + data_list = [self.pre_transform(data) for data in data_list] + + data, slices = self.collate(data_list) + + print('Saving...') + self.save_data((data, slices), self.processed_paths[0]) + + diff --git a/gammagl/datasets/ogb_link.py b/gammagl/datasets/ogb_link.py new file mode 100644 index 000000000..60226a56a --- /dev/null +++ b/gammagl/datasets/ogb_link.py @@ -0,0 +1,139 @@ +import pandas as pd +import shutil, os +import os.path as osp +import numpy as np +from gammagl.data import InMemoryDataset +from gammagl.data.download import download_url +from gammagl.data.extract import extract_zip +from gammagl.io.read_ogb 
import read_graph, read_heterograph + +class OgbLinkDataset(InMemoryDataset): + def __init__(self, name, root = 'dataset', transform=None, pre_transform=None, meta_dict = None): + ''' + - name (str): name of the dataset + - root (str): root directory to store the dataset folder + + - meta_dict: dictionary that stores all the meta-information about data. Default is None, + but when something is passed, it uses its information. Useful for debugging for external contributers. + ''' + + self.name = name ## original name, e.g., ogbl-ppa + + if meta_dict is None: + self.dir_name = '_'.join(name.split('-')) + + # check if previously-downloaded folder exists. + # If so, use that one. + if osp.exists(osp.join(root, self.dir_name + '_gammagl')): + self.dir_name = self.dir_name + '_gammagl' + + self.original_root = root + self.root = osp.join(root, self.dir_name) + + master = pd.read_csv(os.path.join(os.path.dirname(__file__), 'OgbLinkData.csv'), index_col = 0) + if not self.name in master: + error_mssg = 'Invalid dataset name {}.\n'.format(self.name) + error_mssg += 'Available datasets are as follows:\n' + error_mssg += '\n'.join(master.keys()) + raise ValueError(error_mssg) + self.meta_info = master[self.name] + + else: + self.dir_name = meta_dict['dir_path'] + self.original_root = '' + self.root = meta_dict['dir_path'] + self.meta_info = meta_dict + + # check version + # First check whether the dataset has been already downloaded or not. + # If so, check whether the dataset version is the newest or not. + # If the dataset is not the newest version, notify this to the user. + if osp.isdir(self.root) and (not osp.exists(osp.join(self.root, 'RELEASE_v' + str(self.meta_info['version']) + '.txt'))): + print(self.name + ' has been updated.') + if input('Will you update the dataset now? 
(y/N)\n').lower() == 'y': + shutil.rmtree(self.root) + + self.download_name = self.meta_info['download_name'] ## name of downloaded file, e.g., ppassoc + + self.task_type = self.meta_info['task type'] + self.eval_metric = self.meta_info['eval metric'] + self.is_hetero = self.meta_info['is hetero'] == 'True' + self.binary = self.meta_info['binary'] == 'True' + + super(OgbLinkDataset, self).__init__(self.root, transform, pre_transform) + self.data, self.slices = self.load_data(self.processed_paths[0]) + + def get_edge_split(self, split_type = None): + if split_type is None: + split_type = self.meta_info['split'] + + path = osp.join(self.root, 'split', split_type) + + # short-cut if split_dict.pt exists + if os.path.isfile(os.path.join(path, 'split_dict.pt')): + return self.load_data(os.path.join(path, 'split_dict.pt')) + + train = self.load_data(osp.join(path, 'train.pt')) + valid = self.load_data(osp.join(path, 'valid.pt')) + test = self.load_data(osp.join(path, 'test.pt')) + + return {'train': train, 'valid': valid, 'test': test} + + @property + def raw_file_names(self): + if self.binary: + if self.is_hetero: + return ['edge_index_dict.npz'] + else: + return ['data.npz'] + else: + if self.is_hetero: + return ['num-node-dict.csv.gz', 'triplet-type-list.csv.gz'] + else: + file_names = ['edge'] + if self.meta_info['has_node_attr'] == 'True': + file_names.append('node-feat') + if self.meta_info['has_edge_attr'] == 'True': + file_names.append('edge-feat') + return [file_name + '.csv.gz' for file_name in file_names] + + @property + def processed_file_names(self): + return osp.join('geometric_data_processed.pt') + + def download(self): + url = self.meta_info['url'] + path = download_url(url, self.original_root) + extract_zip(path, self.original_root) + os.unlink(path) + shutil.rmtree(self.root) + shutil.move(osp.join(self.original_root, self.download_name), self.root) + + + def process(self): + add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True' + + if 
self.meta_info['additional node files'] == 'None': + additional_node_files = [] + else: + additional_node_files = self.meta_info['additional node files'].split(',') + + if self.meta_info['additional edge files'] == 'None': + additional_edge_files = [] + else: + additional_edge_files = self.meta_info['additional edge files'].split(',') + + if self.is_hetero: + data = read_heterograph(self.raw_dir, add_inverse_edge = add_inverse_edge, additional_node_files = additional_node_files, additional_edge_files = additional_edge_files, binary=self.binary)[0] + else: + data = read_graph(self.raw_dir, add_inverse_edge = add_inverse_edge, additional_node_files = additional_node_files, additional_edge_files = additional_edge_files, binary=self.binary)[0] + + data = data if self.pre_transform is None else self.pre_transform(data) + + print('Saving...') + self.save_data(self.collate([data]), self.processed_paths[0]) + + def __repr__(self): + return '{}()'.format(self.__class__.__name__) + + diff --git a/gammagl/datasets/ogb_node.py b/gammagl/datasets/ogb_node.py new file mode 100644 index 000000000..f82f357f0 --- /dev/null +++ b/gammagl/datasets/ogb_node.py @@ -0,0 +1,187 @@ +import pandas as pd +import shutil, os +import os.path as osp +import numpy as np +from gammagl.data import InMemoryDataset +from gammagl.data.download import download_url +from gammagl.data.extract import extract_zip +from gammagl.io.read_ogb import read_node_label_hetero, read_graph, read_heterograph,read_nodesplitidx_split_hetero + + +class OgbNodeDataset(InMemoryDataset): + def __init__(self, name, root='dataset', transform=None, pre_transform=None, meta_dict=None): + ''' + - name (str): name of the dataset + - root (str): root directory to store the dataset folder + - transform, pre_transform (optional): transform/pre-transform graph objects + + - meta_dict: dictionary that stores all the meta-information about data. Default is None, + but when something is passed, it uses its information. 
Useful for debugging for external contributers. + ''' + + self.name = name ## original name, e.g., ogbn-proteins + + if meta_dict is None: + self.dir_name = '_'.join(name.split('-')) + + # check if previously-downloaded folder exists. + # If so, use that one. + if osp.exists(osp.join(root, self.dir_name + '_gammagl')): + self.dir_name = self.dir_name + '_gammagl' + + self.original_root = root + self.root = osp.join(root, self.dir_name) + + master = pd.read_csv(os.path.join(os.path.dirname(__file__), 'OgbNodeData.csv'), index_col=0) + if not self.name in master: + error_mssg = 'Invalid dataset name {}.\n'.format(self.name) + error_mssg += 'Available datasets are as follows:\n' + error_mssg += '\n'.join(master.keys()) + raise ValueError(error_mssg) + self.meta_info = master[self.name] + + else: + self.dir_name = meta_dict['dir_path'] + self.original_root = '' + self.root = meta_dict['dir_path'] + self.meta_info = meta_dict + + # check version + # First check whether the dataset has been already downloaded or not. + # If so, check whether the dataset version is the newest or not. + # If the dataset is not the newest version, notify this to the user. + if osp.isdir(self.root) and ( + not osp.exists(osp.join(self.root, 'RELEASE_v' + str(self.meta_info['version']) + '.txt'))): + print(self.name + ' has been updated.') + if input('Will you update the dataset now? 
(y/N)\n').lower() == 'y': + shutil.rmtree(self.root) + + self.download_name = self.meta_info['download_name'] ## name of downloaded file, e.g., tox21 + + self.num_tasks = int(self.meta_info['num tasks']) + self.task_type = self.meta_info['task type'] + self.eval_metric = self.meta_info['eval metric'] + self.__num_classes__ = int(self.meta_info['num classes']) + self.is_hetero = self.meta_info['is hetero'] == 'True' + self.binary = self.meta_info['binary'] == 'True' + + super(OgbNodeDataset, self).__init__(self.root, transform, pre_transform) + self.data, self.slices = self.load_data(self.processed_paths[0]) + def get_idx_split(self, split_type = None): + if split_type is None: + split_type = self.meta_info['split'] + + path = osp.join(self.root, 'split', split_type) + + if self.is_hetero: + train_idx_dict, valid_idx_dict, test_idx_dict = read_nodesplitidx_split_hetero(path) + for nodetype in train_idx_dict.keys(): + train_idx_dict[nodetype] = train_idx_dict[nodetype] + valid_idx_dict[nodetype] = valid_idx_dict[nodetype] + test_idx_dict[nodetype] = test_idx_dict[nodetype] + + return {'train': train_idx_dict, 'valid': valid_idx_dict, 'test': test_idx_dict} + + else: + train_idx = pd.read_csv(osp.join(path, 'train.csv.gz'), compression='gzip', header = None).values.T[0] + valid_idx = pd.read_csv(osp.join(path, 'valid.csv.gz'), compression='gzip', header = None).values.T[0] + test_idx = pd.read_csv(osp.join(path, 'test.csv.gz'), compression='gzip', header = None).values.T[0] + + return {'train': train_idx, 'valid': valid_idx, 'test': test_idx} + @property + def num_classes(self): + return self.__num_classes__ + + @property + def raw_file_names(self): + if self.binary: + if self.is_hetero: + return ['edge_index_dict.npz'] + else: + return ['data.npz'] + else: + if self.is_hetero: + return ['num-node-dict.csv.gz', 'triplet-type-list.csv.gz'] + else: + file_names = ['edge'] + if self.meta_info['has_node_attr'] == 'True': + file_names.append('node-feat') + if 
self.meta_info['has_edge_attr'] == 'True': + file_names.append('edge-feat') + return [file_name + '.csv.gz' for file_name in file_names] + + @property + def processed_file_names(self): + return osp.join('geometric_data_processed.pt') + + def download(self): + url = self.meta_info['url'] + path = download_url(url, self.original_root) + extract_zip(path, self.original_root) + os.unlink(path) + shutil.rmtree(self.root) + shutil.move(osp.join(self.original_root, self.download_name), self.root) + + + def process(self): + add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True' + + if self.meta_info['additional node files'] == 'None': + additional_node_files = [] + else: + additional_node_files = self.meta_info['additional node files'].split(',') + + if self.meta_info['additional edge files'] == 'None': + additional_edge_files = [] + else: + additional_edge_files = self.meta_info['additional edge files'].split(',') + + if self.is_hetero: + data = read_heterograph(self.raw_dir, add_inverse_edge=add_inverse_edge, + additional_node_files=additional_node_files, + additional_edge_files=additional_edge_files, binary=self.binary)[0] + + if self.binary: + tmp = np.load(osp.join(self.raw_dir, 'node-label.npz')) + node_label_dict = {} + for key in list(tmp.keys()): + node_label_dict[key] = tmp[key] + del tmp + else: + node_label_dict = read_node_label_hetero(self.raw_dir) + + data.y_dict = {} + if 'classification' in self.task_type: + for nodetype, node_label in node_label_dict.items(): + data.y_dict[nodetype] = node_label + else: + for nodetype, node_label in node_label_dict.items(): + data.y_dict[nodetype] = node_label + else: + data = \ + read_graph(self.raw_dir, add_inverse_edge=add_inverse_edge, additional_node_files=additional_node_files, + additional_edge_files=additional_edge_files, binary=self.binary)[0] + ### adding prediction target + if self.binary: + node_label = np.load(osp.join(self.raw_dir, 'node-label.npz'))['node_label'] + else: + node_label = 
pd.read_csv(osp.join(self.raw_dir, 'node-label.csv.gz'), compression='gzip', + header=None).values + data.y = node_label + data = data if self.pre_transform is None else self.pre_transform(data) + self.data = data + print('Saving...') + self.save_data(self.collate([data]), self.processed_paths[0]) + + def __getitem__(self, idx): + assert idx == 0, 'This dataset has only one graph' + return self.data + + def __repr__(self): + return '{}()'.format(self.__class__.__name__) + + +if __name__ == '__main__': + data = OgbNodeDataset(name='ogbn-arxiv') + print(data[0]) + diff --git a/gammagl/io/read_ogb.py b/gammagl/io/read_ogb.py new file mode 100644 index 000000000..c7b7bc374 --- /dev/null +++ b/gammagl/io/read_ogb.py @@ -0,0 +1,772 @@ +import pandas as pd +import os.path as osp +import os +import numpy as np +from gammagl.data.download import download_url +from gammagl.data.extract import extract_zip +from tqdm import tqdm +from gammagl.data import Graph,HeteroGraph + +def read_graph(raw_dir, add_inverse_edge=False, additional_node_files=[], additional_edge_files=[], binary=False): + if binary: + # npz + graph_list = read_binary_graph_raw(raw_dir, add_inverse_edge) + else: + # csv + graph_list = read_csv_graph_raw(raw_dir, add_inverse_edge, additional_node_files=additional_node_files, + additional_edge_files=additional_edge_files) + + result_list = [] + + for graph in tqdm(graph_list): + g = Graph() + g.num_nodes = graph['num_nodes'] + g.edge_index = graph['edge_index'] + + del graph['num_nodes'] + del graph['edge_index'] + + if graph['edge_feat'] is not None: + g.edge_attr = graph['edge_feat'] + del graph['edge_feat'] + + if graph['node_feat'] is not None: + g.x = graph['node_feat'] + del graph['node_feat'] + + for key in additional_node_files: + g[key] = graph[key] + del graph[key] + + for key in additional_edge_files: + g[key] = graph[key] + del graph[key] + + result_list.append(g) + + return result_list + + +def read_heterograph(raw_dir, add_inverse_edge=False, 
additional_node_files=[], additional_edge_files=[], + binary=False): + if binary: + # npz + graph_list = read_binary_heterograph_raw(raw_dir, add_inverse_edge) + else: + # csv + graph_list = read_csv_heterograph_raw(raw_dir, add_inverse_edge, additional_node_files=additional_node_files, + additional_edge_files=additional_edge_files) + + result_list = [] # converted HeteroGraph objects; kept separate so the raw graph_list loaded above is not overwritten + + for graph in tqdm(graph_list): + g = HeteroGraph() + + g.__num_nodes__ = graph['num_nodes_dict'] + g.num_nodes_dict = graph['num_nodes_dict'] + + # add edge connectivity + g.edge_index_dict = {} + for triplet, edge_index in graph['edge_index_dict'].items(): + g.edge_index_dict[triplet] = edge_index + + del graph['edge_index_dict'] + + if graph['edge_feat_dict'] is not None: + g.edge_attr_dict = {} + for triplet in graph['edge_feat_dict'].keys(): + g.edge_attr_dict[triplet] = graph['edge_feat_dict'][triplet] + + del graph['edge_feat_dict'] + + if graph['node_feat_dict'] is not None: + g.x_dict = {} + for nodetype in graph['node_feat_dict'].keys(): + g.x_dict[nodetype] = graph['node_feat_dict'][nodetype] + + del graph['node_feat_dict'] + + for key in additional_node_files: + g[key] = {} + for nodetype in graph[key].keys(): + g[key][nodetype] = graph[key][nodetype] + + del graph[key] + + for key in additional_edge_files: + g[key] = {} + for triplet in graph[key].keys(): + g[key][triplet] = graph[key][triplet] + + del graph[key] + + result_list.append(g) + + return result_list + + +### reading raw files from a directory. +### for homogeneous graph +def read_csv_graph_raw(raw_dir, add_inverse_edge=False, additional_node_files=[], additional_edge_files=[]): + ''' + raw_dir: path to the raw directory + add_inverse_edge (bool): whether to add inverse edge or not + + return: graph_list, which is a list of graphs. + Each graph is a dictionary, containing edge_index, edge_feat, node_feat, and num_nodes + edge_feat and node_feat are optional: if a graph does not contain it, we will have None. 
### reading raw files from a directory.
### for homogeneous graph
def read_csv_graph_raw(raw_dir, add_inverse_edge=False, additional_node_files=(), additional_edge_files=()):
    '''
    Read homogeneous graphs stored as gzip-compressed CSV files.

    raw_dir: path to the raw directory
    add_inverse_edge (bool): whether to add inverse edge or not

    return: graph_list, which is a list of graphs.
    Each graph is a dictionary, containing edge_index, edge_feat, node_feat, and num_nodes
    edge_feat and node_feat are optional: if a graph does not contain it, we will have None.

    additional_node_files and additional_edge_files must be in the raw directory.
    - The name should be {additional_node_file, additional_edge_file}.csv.gz
    - The length should be num_nodes or num_edges

    additional_node_files must start from 'node_'
    additional_edge_files must start from 'edge_'

    Raises RuntimeError when edge.csv.gz, num-node-list.csv.gz or
    num-edge-list.csv.gz is missing.
    '''

    def _load_values(stem):
        # Read '<raw_dir>/<stem>.csv.gz'; integer tables become int64,
        # everything else float32.
        table = pd.read_csv(osp.join(raw_dir, stem + '.csv.gz'), compression='gzip', header=None).values
        return table.astype(np.int64 if 'int' in str(table.dtype) else np.float32)

    def _load_optional(stem):
        # Same as _load_values, but a missing file yields None.
        try:
            return _load_values(stem)
        except FileNotFoundError:
            return None

    def _edge_rows(values, rows):
        # Per-edge rows of one graph; every row is repeated when each edge is
        # followed by its inverse.
        picked = values[rows]
        return np.repeat(picked, 2, axis=0) if add_inverse_edge else picked

    print('Loading necessary files...')
    print('This might take a while.')
    # loading necessary files
    try:
        edge = pd.read_csv(osp.join(raw_dir, 'edge.csv.gz'), compression='gzip', header=None).values.T.astype(
            np.int64)  # (2, num_edge) numpy array
        num_node_list = \
            pd.read_csv(osp.join(raw_dir, 'num-node-list.csv.gz'), compression='gzip', header=None).astype(np.int64)[
                0].tolist()  # (num_graph, ) python list
        num_edge_list = \
            pd.read_csv(osp.join(raw_dir, 'num-edge-list.csv.gz'), compression='gzip', header=None).astype(np.int64)[
                0].tolist()  # (num_graph, ) python list
    except FileNotFoundError as err:
        # Chain the cause so the traceback still names the missing file.
        raise RuntimeError('No necessary file') from err

    node_feat = _load_optional('node-feat')
    edge_feat = _load_optional('edge-feat')

    additional_node_info = {}
    for additional_file in additional_node_files:
        assert additional_file[:5] == 'node_'

        # hack for ogbn-proteins
        legacy_species = osp.join(raw_dir, 'species.csv.gz')
        if additional_file == 'node_species' and osp.exists(legacy_species):
            os.rename(legacy_species, osp.join(raw_dir, 'node_species.csv.gz'))

        additional_node_info[additional_file] = _load_values(additional_file)

    additional_edge_info = {}
    for additional_file in additional_edge_files:
        assert additional_file[:5] == 'edge_'
        additional_edge_info[additional_file] = _load_values(additional_file)

    graph_list = []
    num_node_accum = 0
    num_edge_accum = 0

    print('Processing graphs...')
    for num_node, num_edge in tqdm(zip(num_node_list, num_edge_list), total=len(num_node_list)):
        edge_rows = slice(num_edge_accum, num_edge_accum + num_edge)
        node_rows = slice(num_node_accum, num_node_accum + num_node)

        graph = dict()

        ### handling edge
        if add_inverse_edge:
            ### duplicate edge: odd columns become the reverse of the even ones
            duplicated_edge = np.repeat(edge[:, edge_rows], 2, axis=1)
            duplicated_edge[0, 1::2] = duplicated_edge[1, 0::2]
            duplicated_edge[1, 1::2] = duplicated_edge[0, 0::2]
            graph['edge_index'] = duplicated_edge
        else:
            graph['edge_index'] = edge[:, edge_rows]

        graph['edge_feat'] = None if edge_feat is None else _edge_rows(edge_feat, edge_rows)

        for key, value in additional_edge_info.items():
            graph[key] = _edge_rows(value, edge_rows)

        num_edge_accum += num_edge

        ### handling node
        graph['node_feat'] = None if node_feat is None else node_feat[node_rows]

        for key, value in additional_node_info.items():
            graph[key] = value[node_rows]

        graph['num_nodes'] = num_node
        num_node_accum += num_node

        graph_list.append(graph)

    return graph_list
### reading raw files from a directory.
### npz ver
### for homogeneous graph
def read_binary_graph_raw(raw_dir, add_inverse_edge=False):
    '''
    raw_dir: path to the raw directory
    add_inverse_edge (bool): whether to add inverse edge or not
        (not supported here: a true value raises RuntimeError)

    return: graph_list, which is a list of graphs.
    Each graph is a dictionary, containing edge_index, edge_feat, node_feat, and num_nodes
    edge_feat and node_feat are optional: if a graph does not contain it, we will have None.

    raw_dir must contain data.npz
    - edge_index
    - num_nodes_list
    - num_edges_list
    - node_** (optional, node_feat is the default node features)
    - edge_** (optional, edge_feat is the default edge features)

    Raises RuntimeError when data.npz holds a key that is neither one of the
    three required names nor prefixed with 'node_' / 'edge_'.
    '''

    if add_inverse_edge:
        raise RuntimeError('add_inverse_edge is depreciated in read_binary')

    print('Loading necessary files...')
    print('This might take a while.')

    required = ('edge_index', 'num_nodes_list', 'num_edges_list')

    # storing node and edge features
    node_dict = {}
    edge_dict = {}

    # The archive keeps its file open until closed, so every array is read
    # inside the `with` block and the handle is released before slicing.
    with np.load(osp.join(raw_dir, 'data.npz')) as data_dict:
        edge_index = data_dict['edge_index']
        num_nodes_list = data_dict['num_nodes_list']
        num_edges_list = data_dict['num_edges_list']

        for key in list(data_dict.keys()):
            if key in required:
                continue

            if key[:5] == 'node_':
                node_dict[key] = data_dict[key]
            elif key[:5] == 'edge_':
                edge_dict[key] = data_dict[key]
            else:
                raise RuntimeError(
                    f"Keys in graph object should start from either 'node_' or 'edge_', but found '{key}'.")

    graph_list = []
    num_nodes_accum = 0
    num_edges_accum = 0

    print('Processing graphs...')
    for num_nodes, num_edges in tqdm(zip(num_nodes_list, num_edges_list), total=len(num_nodes_list)):
        edge_rows = slice(num_edges_accum, num_edges_accum + num_edges)
        node_rows = slice(num_nodes_accum, num_nodes_accum + num_nodes)

        graph = dict()

        graph['edge_index'] = edge_index[:, edge_rows]

        for key, feat in edge_dict.items():
            graph[key] = feat[edge_rows]

        # default edge features are optional
        graph.setdefault('edge_feat', None)

        for key, feat in node_dict.items():
            graph[key] = feat[node_rows]

        # default node features are optional
        graph.setdefault('node_feat', None)

        graph['num_nodes'] = num_nodes

        num_edges_accum += num_edges
        num_nodes_accum += num_nodes

        graph_list.append(graph)

    return graph_list
### reading raw files from a directory.
### for heterogeneous graph
def read_csv_heterograph_raw(raw_dir, add_inverse_edge=False, additional_node_files=(), additional_edge_files=()):
    '''
    raw_dir: path to the raw directory
    add_inverse_edge (bool): whether to add inverse edge or not

    return: graph_list, which is a list of heterogeneous graphs.
    Each graph is a dictionary, containing the following keys:
    - edge_index_dict
        edge_index_dict[(head, rel, tail)] = edge_index for (head, rel, tail)

    - edge_feat_dict
        edge_feat_dict[(head, rel, tail)] = edge_feat for (head, rel, tail)

    - node_feat_dict
        node_feat_dict[nodetype] = node_feat for nodetype

    - num_nodes_dict
        num_nodes_dict[nodetype] = num_nodes for nodetype

    * edge_feat_dict and node_feat_dict are optional: if a graph does not contain it, we will simply have None.

    We can also have additional node/edge features. For example,
    - edge_reltype_dict
        edge_reltype_dict[(head, rel, tail)] = edge_reltype for (head, rel, tail)

    - node_year_dict
        node_year_dict[nodetype] = node_year

    additional_node_files must start from 'node_', additional_edge_files
    from 'edge_'. Raises RuntimeError when one of the required files is
    missing.
    '''

    def _load_values(path):
        # Integer tables become int64, everything else float32.
        table = pd.read_csv(path, compression='gzip', header=None).values
        return table.astype(np.int64 if 'int' in str(table.dtype) else np.float32)

    def _load_per_type(type_dirs, filename, expected_counts=None):
        # Load `filename` from every directory that has it, keyed by node
        # type / triplet. With expected_counts, the number of rows must match
        # the total over all graphs.
        loaded = {}
        for type_key, subdir in type_dirs.items():
            try:
                values = _load_values(osp.join(subdir, filename))
            except FileNotFoundError:
                continue
            if expected_counts is not None:
                assert len(values) == sum(expected_counts[type_key])
            loaded[type_key] = values
        return loaded

    def _edge_rows(values, rows):
        # Per-edge rows of one graph; every row is repeated when each edge is
        # followed by its inverse.
        picked = values[rows]
        return np.repeat(picked, 2, axis=0) if add_inverse_edge else picked

    print('Loading necessary files...')
    print('This might take a while.')

    # loading necessary files
    try:
        num_node_df = pd.read_csv(osp.join(raw_dir, 'num-node-dict.csv.gz'), compression='gzip')
        num_node_dict = {nodetype: num_node_df[nodetype].astype(np.int64).tolist() for nodetype in num_node_df.keys()}
        nodetype_list = sorted(num_node_dict)

        ## read edge_dict, num_edge_dict
        triplet_df = pd.read_csv(osp.join(raw_dir, 'triplet-type-list.csv.gz'), compression='gzip', header=None)
        triplet_list = sorted(zip(triplet_df[0].tolist(), triplet_df[1].tolist(), triplet_df[2].tolist()))

        edge_dirs = {triplet: osp.join(raw_dir, 'relations', '___'.join(triplet)) for triplet in triplet_list}

        edge_dict = {}
        num_edge_dict = {}

        for triplet, subdir in edge_dirs.items():
            edge_dict[triplet] = pd.read_csv(osp.join(subdir, 'edge.csv.gz'), compression='gzip',
                                             header=None).values.T.astype(np.int64)
            num_edge_dict[triplet] = \
                pd.read_csv(osp.join(subdir, 'num-edge-list.csv.gz'), compression='gzip', header=None).astype(
                    np.int64)[0].tolist()

        # check the number of graphs coincide
        assert len(num_node_dict[nodetype_list[0]]) == len(num_edge_dict[triplet_list[0]])

        num_graphs = len(num_node_dict[nodetype_list[0]])

    except FileNotFoundError as err:
        # Chain the cause so the traceback still names the missing file.
        raise RuntimeError('No necessary file') from err

    node_dirs = {nodetype: osp.join(raw_dir, 'node-feat', nodetype) for nodetype in nodetype_list}

    node_feat_dict = _load_per_type(node_dirs, 'node-feat.csv.gz')
    edge_feat_dict = _load_per_type(edge_dirs, 'edge-feat.csv.gz')

    additional_node_info = {}
    # e.g., additional_node_info['node_year'] = node_feature_dict for node_year
    for additional_file in additional_node_files:
        assert additional_file[:5] == 'node_'
        additional_node_info[additional_file] = _load_per_type(node_dirs, additional_file + '.csv.gz',
                                                               num_node_dict)

    additional_edge_info = {}
    # e.g., additional_edge_info['edge_reltype'] = edge_feat_dict for edge_reltype
    for additional_file in additional_edge_files:
        assert additional_file[:5] == 'edge_'
        additional_edge_info[additional_file] = _load_per_type(edge_dirs, additional_file + '.csv.gz',
                                                               num_edge_dict)

    graph_list = []
    num_node_accum_dict = {nodetype: 0 for nodetype in nodetype_list}
    num_edge_accum_dict = {triplet: 0 for triplet in triplet_list}

    print('Processing graphs...')
    for i in tqdm(range(num_graphs)):

        ### set up default attribute
        # A default feature dict is None when no node type / triplet has the
        # corresponding file at all.
        graph = {
            'edge_index_dict': {},
            'edge_feat_dict': {} if edge_feat_dict else None,
            'node_feat_dict': {} if node_feat_dict else None,
            'num_nodes_dict': {},
        }

        ### set up additional node/edge attributes
        for key in additional_node_info:
            graph[key] = {}

        for key in additional_edge_info:
            graph[key] = {}

        ### handling edge
        for triplet in triplet_list:
            num_edge = num_edge_dict[triplet][i]
            num_edge_accum = num_edge_accum_dict[triplet]
            edge_rows = slice(num_edge_accum, num_edge_accum + num_edge)

            ### add edge_index
            edge_index = edge_dict[triplet][:, edge_rows]
            if add_inverse_edge:
                # duplicate edge: odd columns become the reverse of the even ones
                edge_index = np.repeat(edge_index, 2, axis=1)
                edge_index[0, 1::2] = edge_index[1, 0::2]
                edge_index[1, 1::2] = edge_index[0, 0::2]
            graph['edge_index_dict'][triplet] = edge_index

            ### add default edge feature
            # Slice with the running offset (not the per-graph edge count) so
            # the features line up with the edges taken above.
            if triplet in edge_feat_dict:
                graph['edge_feat_dict'][triplet] = _edge_rows(edge_feat_dict[triplet], edge_rows)

            ### add additional edge feature
            for key, value in additional_edge_info.items():
                if triplet in value:
                    graph[key][triplet] = _edge_rows(value[triplet], edge_rows)

            num_edge_accum_dict[triplet] += num_edge

        ### handling node
        for nodetype in nodetype_list:
            num_node = num_node_dict[nodetype][i]
            num_node_accum = num_node_accum_dict[nodetype]
            node_rows = slice(num_node_accum, num_node_accum + num_node)

            ### add default node feature
            if nodetype in node_feat_dict:
                graph['node_feat_dict'][nodetype] = node_feat_dict[nodetype][node_rows]

            ### add additional node feature
            for key, value in additional_node_info.items():
                if nodetype in value:
                    graph[key][nodetype] = value[nodetype][node_rows]

            graph['num_nodes_dict'][nodetype] = num_node
            num_node_accum_dict[nodetype] += num_node

        graph_list.append(graph)

    return graph_list
def read_binary_heterograph_raw(raw_dir, add_inverse_edge=False):
    '''
    Read heterogeneous graphs stored as .npz archives in raw_dir.

    raw_dir: path to the raw directory
    add_inverse_edge (bool): whether to add inverse edge or not
        (not supported here: a true value raises RuntimeError)

    return: graph_list, which is a list of heterogeneous graphs.
    Each graph is a dictionary, containing the following keys:
    - edge_index_dict
        edge_index_dict[(head, rel, tail)] = edge_index for (head, rel, tail)

    - edge_feat_dict
        edge_feat_dict[(head, rel, tail)] = edge_feat for (head, rel, tail)

    - node_feat_dict
        node_feat_dict[nodetype] = node_feat for nodetype

    - num_nodes_dict
        num_nodes_dict[nodetype] = num_nodes for nodetype

    * edge_feat_dict and node_feat_dict are optional: if a graph does not contain it, we will simply have None.

    We can also have additional node/edge features. For example,
    - edge_**
    - node_**

    '''

    if add_inverse_edge:
        raise RuntimeError('add_inverse_edge is depreciated in read_binary')

    print('Loading necessary files...')
    print('This might take a while.')

    def _by_triplet(npz_path):
        # Archive keys look like 'head___rel___tail'; re-key by the tuple.
        arrays = read_npz_dict(npz_path)
        return {tuple(name.split('___')): array for name, array in arrays.items()}

    # loading necessary files
    try:
        num_nodes_dict = read_npz_dict(osp.join(raw_dir, 'num_nodes_dict.npz'))
        num_edges_dict = _by_triplet(osp.join(raw_dir, 'num_edges_dict.npz'))
        edge_index_dict = _by_triplet(osp.join(raw_dir, 'edge_index_dict.npz'))

        ent_type_list = sorted(num_nodes_dict)
        triplet_type_list = sorted(num_edges_dict)

        num_graphs = len(num_nodes_dict[ent_type_list[0]])

    except FileNotFoundError:
        raise RuntimeError('No necessary file')

    # storing node and edge features
    # mapping from the name of the features to feat_dict
    node_feat_dict_dict = {}
    edge_feat_dict_dict = {}

    structural_files = ('num_nodes_dict.npz', 'num_edges_dict.npz', 'edge_index_dict.npz')

    for filename in os.listdir(raw_dir):
        # Skip non-archives, the structural archives read above, and target
        # label files (label information is not read here).
        if '.npz' not in filename or filename in structural_files or '-label.npz' in filename:
            continue

        feat_name = filename.split('.')[0]
        archive_path = osp.join(raw_dir, filename)

        if 'node_' in feat_name:
            node_feat_dict_dict[feat_name] = read_npz_dict(archive_path)
        elif 'edge_' in feat_name:
            edge_feat_dict_dict[feat_name] = _by_triplet(archive_path)
        else:
            raise RuntimeError(
                f"Keys in graph object should start from either 'node_' or 'edge_', but found '{feat_name}'.")

    graph_list = []
    num_nodes_accum_dict = dict.fromkeys(ent_type_list, 0)
    num_edges_accum_dict = dict.fromkeys(triplet_type_list, 0)

    print('Processing graphs...')
    for i in tqdm(range(num_graphs)):

        ### set up default attribute
        graph = {'edge_index_dict': {}, 'num_nodes_dict': {}}

        for feat_name in node_feat_dict_dict:
            graph[feat_name] = {}

        for feat_name in edge_feat_dict_dict:
            graph[feat_name] = {}

        # The default feature dicts are None unless an archive provided them.
        graph.setdefault('edge_feat_dict', None)
        graph.setdefault('node_feat_dict', None)

        ### handling edge
        for triplet in triplet_type_list:
            count = num_edges_dict[triplet][i]
            start = num_edges_accum_dict[triplet]
            stop = start + count

            ### add edge_index
            graph['edge_index_dict'][triplet] = edge_index_dict[triplet][:, start:stop]

            ### add edge feature
            for feat_name, per_triplet in edge_feat_dict_dict.items():
                if triplet in per_triplet:
                    graph[feat_name][triplet] = per_triplet[triplet][start:stop]

            num_edges_accum_dict[triplet] += count

        ### handling node
        for ent_type in ent_type_list:
            count = num_nodes_dict[ent_type][i]
            start = num_nodes_accum_dict[ent_type]
            stop = start + count

            ### add node feature
            for feat_name, per_type in node_feat_dict_dict.items():
                if ent_type in per_type:
                    graph[feat_name][ent_type] = per_type[ent_type][start:stop]

            graph['num_nodes_dict'][ent_type] = count
            num_nodes_accum_dict[ent_type] += count

        graph_list.append(graph)

    return graph_list
def read_npz_dict(path):
    """Load every array stored in the ``.npz`` archive at ``path``.

    Args:
        path: Location of the archive, as accepted by ``numpy.load``.

    Returns:
        dict: Maps each name in the archive to its fully loaded array.
    """
    # The archive object keeps the file open until it is closed; read all
    # members inside the context manager so the handle is released on return.
    with np.load(path) as archive:
        return {name: archive[name] for name in archive.files}
def read_node_label_hetero(raw_dir):
    """Read the node labels of every labelled node type.

    ``nodetype-has-label.csv.gz`` has one column per node type; the first row
    says whether that type has labels. For each such type the labels are read
    from ``node-label/<nodetype>/node-label.csv.gz``.

    Returns:
        dict: Maps node type to the label array read from its file.

    Raises:
        RuntimeError: If no node type is flagged as labelled.
    """
    flags = pd.read_csv(osp.join(raw_dir, 'nodetype-has-label.csv.gz'))

    label_dict = {}
    for nodetype in flags.keys():
        if not flags[nodetype].values[0]:
            continue
        label_path = osp.join(raw_dir, 'node-label', nodetype, 'node-label.csv.gz')
        label_dict[nodetype] = pd.read_csv(label_path, compression='gzip', header=None).values

    if not label_dict:
        raise RuntimeError('No node label file found.')

    return label_dict


def read_nodesplitidx_split_hetero(split_dir):
    """Read the train/valid/test node indices of every node type that has a split.

    ``nodetype-has-split.csv.gz`` has one column per node type; the first row
    says whether that type has a split. For each such type the indices are
    read from ``<nodetype>/{train,valid,test}.csv.gz`` (first column only).

    Returns:
        tuple: ``(train_dict, valid_dict, test_dict)``, each mapping node type
        to a one-dimensional index array.

    Raises:
        RuntimeError: If no node type is flagged as having a split.
    """
    flags = pd.read_csv(osp.join(split_dir, 'nodetype-has-split.csv.gz'))

    def _first_column(nodetype, part):
        table = pd.read_csv(osp.join(split_dir, nodetype, part + '.csv.gz'), compression='gzip', header=None)
        return table.values.T[0]

    train_dict, valid_dict, test_dict = {}, {}, {}
    for nodetype in flags.keys():
        if not flags[nodetype].values[0]:
            continue
        train_dict[nodetype] = _first_column(nodetype, 'train')
        valid_dict[nodetype] = _first_column(nodetype, 'valid')
        test_dict[nodetype] = _first_column(nodetype, 'test')

    if not train_dict:
        raise RuntimeError('No split file found.')

    return train_dict, valid_dict, test_dict
class MultiHead(MessagePassing):
    r"""Multi-head graph transformer layer from the `"Masked Label Prediction:
    Unified Message Passing Model for Semi-Supervised Classification"
    <https://arxiv.org/abs/2009.03509>`_ paper.

    Every node aggregates the value projections of its in-neighbours,

    .. math::
        \mathbf{m}_i = \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}
        \mathbf{W}_2 \mathbf{x}_{j},

    where the attention coefficients :math:`\alpha_{i,j}` are computed via
    multi-head dot product attention:

    .. math::
        \alpha_{i,j} = \textrm{softmax} \left(
        \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top} (\mathbf{W}_4\mathbf{x}_j)}
        {\sqrt{d}} \right)

    With ``beta=True`` the aggregated message is combined with a skip
    projection through a learned gate:

    .. math::
        \mathbf{x}^{\prime}_i = \beta_i \mathbf{W}_1 \mathbf{x}_i +
        (1 - \beta_i) \mathbf{m}_i, \qquad
        \beta_i = \textrm{sigmoid}(\mathbf{w}_5^{\top}
        [ \mathbf{m}_i, \mathbf{W}_1 \mathbf{x}_i,
        \mathbf{m}_i - \mathbf{W}_1 \mathbf{x}_i ])

    With ``beta=False`` the layer returns :math:`\mathbf{m}_i` alone.

    Args:
        in_features (int): Size of each input sample.
        out_features (int): Size of each output sample, per head.
        n_heads (int): Number of attention heads. The output has
            ``n_heads * out_features`` columns.
        num_nodes (int): Number of nodes in the graph; used as the number of
            softmax segments and aggregation targets.
        beta (bool, optional): Whether to apply the gated skip connection.
            (default: :obj:`True`)
    """

    def __init__(self, in_features, out_features, n_heads, num_nodes, beta=True):
        super().__init__()
        self.beta = beta
        self.heads = n_heads
        self.num_nodes = num_nodes
        self.out_channels = out_features

        width = n_heads * out_features
        # Linear layers carry a bias unless `b_init=None` is passed.
        # NOTE(review): relies on tensorlayerx's Linear(out_features, ...,
        # b_init=..., in_features=...) signature - confirm against the
        # installed version.
        self.lin_key = tlx.layers.Linear(in_features=in_features, out_features=width)
        self.lin_query = tlx.layers.Linear(in_features=in_features, out_features=width)
        self.lin_value = tlx.layers.Linear(in_features=in_features, out_features=width)
        self.lin_skip = tlx.layers.Linear(in_features=in_features, out_features=width)
        if beta:
            # Gate scoring [m, skip, m - skip] -> one scalar per node, no bias.
            self.lin_beta = tlx.layers.Linear(in_features=3 * width, out_features=1, b_init=None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the projection layers that support it."""
        layers = [self.lin_key, self.lin_query, self.lin_value, self.lin_skip]
        if self.beta:
            layers.append(self.lin_beta)
        for layer in layers:
            # Only call the hook when the backend layer actually provides one.
            reset = getattr(layer, 'reset_parameters', None)
            if callable(reset):
                reset()

    def message(self, query, key, value, dst_index, num_nodes):
        """Weight per-edge values by scaled dot-product attention.

        Args:
            query: Query vectors of each edge's target node, reshaped in
                ``forward`` to ``(num_edges, heads, out_channels)``.
            key: Key vectors of each edge's source node, same shape.
            value: Value vectors of each edge's source node, same shape.
            dst_index: Target node of each edge; edges sharing a target are
                normalised together.
            num_nodes: Number of softmax segments.

        Returns:
            The attention-weighted values, one row per edge.
        """
        alpha = tlx.reduce_sum(query * key, axis=-1) / math.sqrt(self.out_channels)
        # NOTE(review): assumes segment_softmax(data, segment_ids,
        # num_segments) - confirm against gammagl.utils.
        alpha = segment_softmax(alpha, dst_index, num_nodes)
        return value * tlx.expand_dims(alpha, -1)

    def forward(self, x, edge_index):
        """Apply the layer.

        Args:
            x: Node feature tensor, or a ``(source, target)`` pair of them.
            edge_index: Tensor whose row 0 holds source and row 1 holds
                target node indices.

        Returns:
            Tensor with ``heads * out_channels`` columns.
        """
        H, C = self.heads, self.out_channels
        # A single tensor serves as both source and target features.
        x_src, x_dst = x if isinstance(x, (tuple, list)) else (x, x)
        src_index = edge_index[0, :]
        dst_index = edge_index[1, :]

        query = tlx.reshape(self.lin_query(x_dst), shape=(-1, H, C))
        key = tlx.reshape(self.lin_key(x_src), shape=(-1, H, C))
        value = tlx.reshape(self.lin_value(x_src), shape=(-1, H, C))

        msg = self.message(tlx.gather(query, dst_index),
                           tlx.gather(key, src_index),
                           tlx.gather(value, src_index),
                           dst_index, self.num_nodes)
        # NOTE(review): relies on the base class exposing
        # aggregate(msg, edge_index, num_nodes, aggr) that sums messages per
        # target node - confirm against MessagePassing.
        out = self.aggregate(msg, edge_index, num_nodes=self.num_nodes, aggr='sum')
        out = tlx.reshape(out, shape=(-1, H * C))

        if self.beta:
            x_r = self.lin_skip(x_dst)
            gate = tlx.sigmoid(self.lin_beta(tlx.ops.concat([out, x_r, out - x_r], axis=-1)))
            out = gate * x_r + (1 - gate) * out
        return out
class Unimp(tlx.nn.Module):
    r"""Two-layer UniMP model from the `"Masked Label Prediction: Unified
    Message Passing Model for Semi-Supervised Classification"
    <https://arxiv.org/abs/2009.03509>`_ paper.

    Each layer is a ``MultiHead`` attention layer followed by layer
    normalisation and ReLU.

    Parameters
    ----------
    dataset:
        Dataset object providing ``num_node_features``, ``num_classes`` and,
        through ``dataset[0].num_nodes``, the number of nodes of its graph.

    Forward inputs
    --------------
    x:
        Node feature tensor with ``num_node_features + 1`` columns
        (presumably the features plus one label column - confirm against the
        trainer).
    edge_index:
        Graph connectivity in COO format, shape ``[2, num_edges]``.
    """

    def __init__(self, dataset):
        super(Unimp, self).__init__()

        heads = 4
        out_layer1 = dataset.num_node_features // 2
        num_nodes = dataset[0].num_nodes
        self._heads = heads
        self._num_classes = dataset.num_classes

        # MultiHead returns `heads * out_features` columns, so everything
        # that consumes its output is sized accordingly. `tlx.nn` is used
        # directly: `tlx` is only an alias of tensorlayerx, so a separate
        # `import tlx.nn` cannot be resolved.
        self.layer1 = MultiHead(dataset.num_node_features + 1, out_layer1, heads, num_nodes)
        self.norm1 = tlx.nn.LayerNorm(heads * out_layer1)
        self.relu1 = tlx.nn.ReLU()

        self.layer2 = MultiHead(heads * out_layer1, dataset.num_classes, heads, num_nodes)
        self.norm2 = tlx.nn.LayerNorm(dataset.num_classes)
        self.relu2 = tlx.nn.ReLU()

    def forward(self, x, edge_index):
        hidden = self.relu1(self.norm1(self.layer1(x, edge_index)))

        scores = self.layer2(hidden, edge_index)
        # Average the heads of the output layer so each node ends up with
        # exactly `num_classes` scores.
        # NOTE(review): head averaging is a design choice made to get one
        # score per class - confirm it matches the intended model.
        scores = tlx.reshape(scores, shape=(-1, self._heads, self._num_classes))
        scores = tlx.reduce_mean(scores, axis=1)

        return self.relu2(self.norm2(scores))
def test_ogblinkdataset():
    """Build the 'ogbl-ppa' link dataset and check its first item is available."""
    from gammagl.datasets.ogb_link import OgbLinkDataset

    data = OgbLinkDataset('ogbl-ppa')
    graph = data[0]
    print(graph)
    assert graph is not None


def test_ogbnodedataset():
    """Build the 'ogbn-arxiv' node dataset and check its first item is available."""
    from gammagl.datasets.ogb_node import OgbNodeDataset

    data = OgbNodeDataset('ogbn-arxiv')
    graph = data[0]
    print(graph)
    assert graph is not None


# Run the checks only when executed as a script. Calling them at module level
# would build the datasets as soon as a test runner imports the file, and the
# runner would then execute each test a second time.
if __name__ == '__main__':
    test_ogblinkdataset()
    test_ogbnodedataset()