diff --git a/examples/cobformer/cobformer_trainer.py b/examples/cobformer/cobformer_trainer.py
new file mode 100644
index 00000000..624567cb
--- /dev/null
+++ b/examples/cobformer/cobformer_trainer.py
@@ -0,0 +1,206 @@
+import os
+# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+# os.environ['TL_BACKEND'] = 'torch'
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+# 0:Output all; 1:Filter out INFO; 2:Filter out INFO and WARNING; 3:Filter out INFO, WARNING, and ERROR
+
+import argparse
+import tensorlayerx as tlx
+import numpy as np
+from sklearn.metrics import f1_score
+from partition import partition_patch
+from gammagl.datasets import Planetoid
+from gammagl.models.cobformer import CoBFormer
+from tensorlayerx.model import TrainOneStep, WithLoss
+
+
+def str2bool(value):
+    """Parse a boolean command-line value.
+
+    ``type=bool`` would turn every non-empty string, including "False", into
+    ``True``; this converter accepts the usual textual spellings instead.
+    """
+    if isinstance(value, bool):
+        return value
+    text = value.strip().lower()
+    if text in ('1', 'true', 't', 'yes', 'y'):
+        return True
+    if text in ('0', 'false', 'f', 'no', 'n'):
+        return False
+    raise argparse.ArgumentTypeError('expected a boolean, got {!r}'.format(value))
+
+
+def eval_f1(pred, label, num_classes):
+    """Return the (micro, macro) F1 scores of ``pred`` against ``label``.
+
+    ``num_classes`` is kept in the signature but is not used.
+    """
+    pred = tlx.convert_to_numpy(pred)
+    label = tlx.convert_to_numpy(label)
+    micro = f1_score(label, pred, average='micro')
+    macro = f1_score(label, pred, average='macro')
+    return micro, macro
+
+
+class CoLoss(WithLoss):
+    """Training loss that couples the two outputs of the wrapped model.
+
+    On the training nodes both outputs get a cross-entropy term against the
+    labels. On every other node each output is additionally trained against
+    the softmax of the other output, with both logits multiplied by ``tau``.
+    ``alpha`` weights the first part and ``1 - alpha`` the second.
+    """
+
+    def __init__(self, model, loss_fn):
+        super(CoLoss, self).__init__(backbone=model, loss_fn=loss_fn)
+        self.alpha = model.alpha
+        self.tau = model.tau
+
+    def forward(self, data, label):
+        pred1, pred2 = self.backbone_network(data['x'], data['patch'], data['edge_index'], edge_weight=data['edge_weight'], num_nodes=data['num_nodes'])
+        l1 = tlx.losses.softmax_cross_entropy_with_logits(pred1[data['train_mask']], label[data['train_mask']])
+        l2 = tlx.losses.softmax_cross_entropy_with_logits(pred2[data['train_mask']], label[data['train_mask']])
+
+        pred1_scaled = pred1 * self.tau
+        pred2_scaled = pred2 * self.tau
+
+        # cross terms: each output against the softmax of the other one
+        l3 = tlx.losses.softmax_cross_entropy_with_logits(pred1_scaled[~data['train_mask']], tlx.nn.Softmax()(pred2_scaled)[~data['train_mask']])
+        l4 = tlx.losses.softmax_cross_entropy_with_logits(pred2_scaled[~data['train_mask']], tlx.nn.Softmax()(pred1_scaled)[~data['train_mask']])
+
+        return self.alpha * (l1 + l2) + (1 - self.alpha) * (l3 + l4)
+
+
+def calculate_acc(logits, y, metrics):
+    """
+    Args:
+        logits: node logits
+        y: node labels
+        metrics: tensorlayerx.metrics
+
+    Returns:
+        rst
+    """
+
+    metrics.update(logits, y)
+    rst = metrics.result()
+    metrics.reset()
+    return rst
+
+
+def main(args):
+    """Train CoBFormer on a Planetoid dataset and print validation F1 scores per epoch."""
+    # load datasets
+    # set_device(5)
+    if str.lower(args.dataset) not in ['cora','pubmed','citeseer']:
+        raise ValueError('Unknown dataset: {}'.format(args.dataset))
+    # NOTE(review): Planetoid is called as (root, name), the form used by the
+    # other GammaGL examples; with a single positional argument the dataset
+    # name would be taken as the root directory -- confirm against the
+    # installed GammaGL version.
+    dataset = Planetoid(getattr(args, 'dataset_path', ''), str.lower(args.dataset))
+    graph = dataset[0]
+
+    graph.train_mask = tlx.convert_to_numpy(graph.train_mask)
+    graph.val_mask = tlx.convert_to_numpy(graph.val_mask)
+    graph.test_mask = tlx.convert_to_numpy(graph.test_mask)
+    # Pad a dimension with value 0 at the end of each mask (1D array) using np.pad(mask, (0, 1), mode='constant')
+    graph.train_mask = np.pad(graph.train_mask, (0, 1), mode='constant')
+    graph.val_mask = np.pad(graph.val_mask, (0, 1), mode='constant')
+    graph.test_mask = np.pad(graph.test_mask, (0, 1), mode='constant')
+
+    # partition_patch also pads graph.x / graph.y and increments graph.num_nodes
+    patch = partition_patch(graph, args.n_patch)
+
+    # Convert label to one-hot encoding and cast to float type
+    label = tlx.nn.OneHot(dataset.num_classes)(graph.y)
+    label = tlx.cast(label, dtype=tlx.float32)
+
+    model = CoBFormer(graph.num_nodes, dataset.num_node_features, args.num_hidden, dataset.num_classes, layers=args.num_layers,
+                      gcn_layers=args.gcn_layers, n_head=args.n_head, alpha=args.alpha, tau=args.tau, use_patch_attn=args.use_patch_attn)
+
+    optimizer = tlx.optimizers.Adam(lr=args.lr, weight_decay=args.l2_coef)
+    train_weights = model.trainable_weights
+
+    loss_func = CoLoss(model, tlx.losses.softmax_cross_entropy_with_logits)
+    train_one_step = TrainOneStep(loss_func, optimizer, train_weights)
+
+    data = {
+        "x": graph.x,
+        "y": graph.y,
+        "edge_index": graph.edge_index,
+        "edge_weight": None,
+        "train_mask": graph.train_mask,
+        "test_mask": graph.test_mask,
+        "val_mask": graph.val_mask,
+        "num_nodes": graph.num_nodes,
+        'train': graph.train_mask,
+        'valid': graph.val_mask,
+        'test': graph.test_mask,
+        'patch': patch
+    }
+
+    # the labels are fixed, so derive these once instead of every epoch
+    y = data['y']
+    num_classes = int(tlx.reduce_max(y) + 1)
+
+    # best_val_acc = 0
+    for epoch in range(args.n_epoch):
+        model.set_train()
+        loss = train_one_step(data, label)
+        model.set_eval()
+
+        pred1, pred2 = model(data['x'], data['patch'], data['edge_index'], edge_weight=data['edge_weight'], num_nodes=data['num_nodes'])
+
+        y1_ = tlx.argmax(pred1, axis=1)
+
+        micro_val1, macro_val1 = eval_f1(y1_[data['valid']], y[data['valid']], num_classes)
+        # micro_test1, macro_test1 = eval_f1(y1_[data['test']], y[data['test']], num_classes)
+
+        y2_ = tlx.argmax(pred2, axis=1)
+        if len(y2_.shape) > 1:
+            # tlx.reshape replaces the torch-only Tensor.view
+            y2_ = tlx.reshape(y2_, (-1,))
+
+        micro_val2, macro_val2 = eval_f1(y2_[data['valid']], y[data['valid']], num_classes)
+        # micro_test2, macro_test2 = eval_f1(y2_[data['test']], y[data['test']], num_classes)
+
+        print("Epoch [{:0>3d}] ".format(epoch+1)\
+              + " train loss: {:.4f}".format(loss.item())\
+              + " GCN micro_val acc: {:.4f}".format(micro_val1)\
+              + " GCN macro_val acc: {:.4f}".format(macro_val1)\
+              + " COB micro_val acc: {:.4f}".format(micro_val2)\
+              + " COB macro_val acc: {:.4f}".format(macro_val2))
+
+
+if __name__ == '__main__':
+    # parameters setting
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dataset', type=str, default='cora', help='dataset')
+    parser.add_argument('--dataset_path', type=str, default=r'', help='path to save dataset')
+    parser.add_argument('--lr', type=float, default=0.01)
+    parser.add_argument("--l2_coef", type=float, default=5e-4, help="l2 loss coeficient")
+    parser.add_argument('--gcn_wd', type=float, default=5e-4)
+    parser.add_argument('--num_hidden', type=int, default=64, help='Number of hidden units')
+    parser.add_argument('--num_layers', type=int, default=1, help='Number of layers')
+    parser.add_argument('--n_head', type=int, default=4, help='Number of attention heads')
+    parser.add_argument('--n_epoch', type=int, default=500, help='Number of training epochs')
+    parser.add_argument('--use_patch_attn', action='store_true', help='transformer use patch attention')
+    parser.add_argument('--show_details', type=str2bool, default=True)
+    parser.add_argument('--gcn_layers', type=int, default=2)
+    parser.add_argument('--n_patch', type=int, default=112)
+    parser.add_argument('--batch_size', type=int, default=100000)
+    parser.add_argument('--train_prop', type=float, default=.6)
+    parser.add_argument('--valid_prop', type=float, default=.2)
+    parser.add_argument('--alpha', type=float, default=.8)
+    parser.add_argument('--tau', type=float, default=.3)
+    parser.add_argument('--gpu', type=int, default=0)
+
+    args = parser.parse_args()
+
+    if args.gpu >= 0:
+        tlx.set_device("GPU", args.gpu)
+    else:
+        tlx.set_device("CPU")
+
+    main(args)
diff --git a/examples/cobformer/partition.py b/examples/cobformer/partition.py
new file mode 100644
index 00000000..b0cbeff3
--- /dev/null
+++ b/examples/cobformer/partition.py
@@ -0,0 +1,95 @@
+import tensorlayerx as tlx
+import numpy as np
+import networkx as nx
+import metis
+
+
+def partition_patch(graph, n_patches, load_path=None):
+    """Build the patch index tensor for ``graph`` and pad the graph by one node.
+
+    The patches are loaded from ``load_path`` when it is given; otherwise a
+    single patch holding every index is used for ``n_patches == 1`` and
+    ``metis_partition`` is called for any other value.
+
+    Side effects: ``graph.num_nodes`` is increased by one and one zero row /
+    zero entry is appended to ``graph.x`` / ``graph.y``. ``metis_partition``
+    fills short patches with the index of that extra node.
+
+    Returns:
+        int64 tensor of node indices, one row per patch.
+    """
+    if load_path is not None:
+        # load a precomputed partition; the file is assumed to be a .npy array
+        patch = np.load(load_path)
+        patch = tlx.convert_to_tensor(patch, dtype=tlx.int64)
+    else:
+        if n_patches == 1:
+            patch = np.arange(graph.num_nodes + 1)
+            patch = tlx.convert_to_tensor(patch, dtype=tlx.int64)
+            patch = tlx.expand_dims(patch, axis=0)
+        else:
+            patch = metis_partition(g=graph, n_patches=n_patches)
+
+    print('metis done!!!')
+
+    print('patch done!!!')
+
+    # Graph update operations
+    graph.num_nodes += 1
+
+    # append one all-zero feature row for the padding node
+    padded_x = np.pad(tlx.convert_to_numpy(graph.x),
+                      pad_width=((0, 1), (0, 0)),
+                      mode='constant',
+                      constant_values=0)
+    graph.x = tlx.convert_to_tensor(padded_x)
+
+    # append one label entry (value 0) for the padding node
+    padded_y = np.pad(tlx.convert_to_numpy(graph.y),
+                      pad_width=(0, 1),
+                      mode='constant',
+                      constant_values=0)
+    graph.y = tlx.convert_to_tensor(padded_y)
+
+    return patch
+
+
+def metis_partition(g, n_patches=50):
+    """Assign the nodes of ``g`` to ``n_patches`` patches of equal length.
+
+    METIS is used when the graph has at least ``n_patches`` nodes; smaller
+    graphs get one randomly chosen patch per node. Patches shorter than the
+    longest one are filled with the index ``g.num_nodes``.
+
+    Returns:
+        int64 tensor with ``n_patches`` rows of node indices.
+
+    Raises:
+        ValueError: if METIS returns fewer assignments than there are nodes.
+    """
+    num_nodes = g.num_nodes
+    if num_nodes < n_patches:
+        # Fewer nodes than patches: keep only the first num_nodes entries of
+        # the permutation so that every stored index is an existing node.
+        membership = np.random.permutation(n_patches)[:num_nodes]
+    else:
+        # each row of the transposed edge_index is passed to networkx as one edge
+        edges = tlx.convert_to_numpy(g.edge_index).T
+        G = nx.Graph()
+        G.add_nodes_from(np.arange(num_nodes))
+        G.add_edges_from(edges.tolist())
+
+        cuts, membership = metis.part_graph(G, n_patches, recursive=True)
+
+        if len(membership) < num_nodes:
+            raise ValueError('METIS returned {} assignments for {} nodes'.format(len(membership), num_nodes))
+        membership = np.asarray(membership[:num_nodes])
+
+    # node indices of every patch, looked up on the numpy array directly
+    patch = [np.where(membership == i)[0].tolist() for i in range(n_patches)]
+    max_patch_size = max((len(nodes) for nodes in patch), default=0)
+
+    # pad every patch to the common length with the padding index
+    patch = [nodes + [num_nodes] * (max_patch_size - len(nodes)) for nodes in patch]
+
+    return tlx.convert_to_tensor(patch, dtype=tlx.int64)
diff --git a/examples/cobformer/readme.md b/examples/cobformer/readme.md
new file mode 100644
index 00000000..b23017e3
--- /dev/null
+++ b/examples/cobformer/readme.md
@@ -0,0 +1,27 @@
+# Less is More: on the Over-Globalizing Problem in Graph Transformers (CoBFormer)
+
+- Paper link: [Less is More: on the Over-Globalizing Problem in Graph Transformers](http://arxiv.org/abs/2405.01102)
+
+## Dataset Statistics
+
+Refer to [Planetoid](https://gammagl.readthedocs.io/en/latest/api/gammagl.datasets.html#gammagl.datasets.Planetoid).
+
+## Results
+```bash
+# Cora
+python cobformer_trainer.py --dataset=Cora --lr=0.01 --gcn_wd=1e-3 --l2_coef=5e-5 --gcn_layers=2 --n_patch=112 --use_patch_attn --alpha=0.7 --tau=0.3 --gpu=0
+
+# CiteSeer
+python cobformer_trainer.py --dataset=CiteSeer --lr=5e-3 --gcn_wd=1e-2 --l2_coef=5e-5 --gcn_layers=2 --n_patch=144 --use_patch_attn --alpha=0.8 --tau=0.7 --gpu=0
+
+# PubMed
+python cobformer_trainer.py --dataset=PubMed --lr=5e-3 --gcn_wd=1e-3 --l2_coef=1e-3 --gcn_layers=2 --n_patch=224 --use_patch_attn --alpha=0.7 --tau=0.3 --gpu=0
+
+```
+
+| Dataset  | Paper | Our(tf)      |
+| -------- | ----- | ------------ |
+| cora     | 85.28 | 83.16 ± 0.59 |
+| citeseer | 74.52 | 71.20 ± 0.80 |
+| pubmed   | 81.42 | 81.84 ± 0.45 |
+
diff --git a/gammagl/layers/attention/__init__.py b/gammagl/layers/attention/__init__.py
index e83c140a..c5286b18 100644
--- a/gammagl/layers/attention/__init__.py
+++ b/gammagl/layers/attention/__init__.py
@@ -5,6 +5,8 @@
 from .heco_encoder import Sc_encoder
 from .heco_encoder import Mp_encoder
 from .sgformer_layer import TransConvLayer, GraphConvLayer
+from .bga_layer import BGALayer
+
 __all__ = [
     'Sc_encoder',
     'Mp_encoder',
@@ -14,6 +16,7 @@
     'GraphormerLayer',
     'TransConvLayer',
     'GraphConvLayer',
+    'BGALayer'
 ]
 
 classes = __all__
diff --git a/gammagl/layers/attention/bga.py b/gammagl/layers/attention/bga.py
new file mode 100644
index 00000000..62803bd4
--- /dev/null
+++ b/gammagl/layers/attention/bga.py
@@ -0,0 +1,48 @@
+import tensorlayerx as tlx
+from .bga_layer import BGALayer
+
+
+class BGA(tlx.nn.Module):
+    """Stack of ``BGALayer`` blocks between a linear encoder and a linear classifier.
+
+    ``num_nodes - 1`` is treated as the padding index: patch entries equal to
+    it are masked out of the attention inside each patch.
+    """
+
+    def __init__(self, num_nodes: int, in_channels: int, hidden_channels: int, out_channels: int,
+                 layers: int, n_head: int, use_patch_attn=True, dropout1=0.5, dropout2=0.1, need_attn=False):
+        super(BGA, self).__init__()
+        self.layers = layers
+        self.n_head = n_head
+        self.num_nodes = num_nodes
+        self.dropout = tlx.nn.Dropout(p=dropout1)
+        # input projection from raw features to the hidden size
+        self.attribute_encoder = tlx.nn.Linear(in_features=in_channels, out_features=hidden_channels)
+        self.BGALayers = tlx.nn.ModuleList()
+        for _ in range(0, layers):
+            self.BGALayers.append(
+                BGALayer(n_head, hidden_channels, use_patch_attn, dropout=dropout2))
+        self.classifier = tlx.nn.Linear(in_features=hidden_channels, out_features=out_channels)
+        # attention maps of the most recent forward call made with need_attn=True
+        self.attn = []
+
+    def forward(self, x, patch, need_attn=False):
+        if need_attn:
+            # start over so maps from earlier calls do not accumulate
+            self.attn = []
+
+        # 1.0 for real entries, 0.0 for padding entries of each patch
+        patch_mask = tlx.cast(patch != self.num_nodes - 1, dtype=tlx.float32)
+        patch_mask = tlx.expand_dims(patch_mask, axis=-1)
+        # outer product per patch: 1 only where both positions are real entries
+        attn_mask = tlx.cast(tlx.matmul(patch_mask, tlx.transpose(patch_mask, perm=[0, 2, 1])), dtype=tlx.int32)
+
+        x = tlx.relu(self.attribute_encoder(x))
+
+        for i in range(0, self.layers):
+            x = self.BGALayers[i](x, patch, attn_mask, need_attn)
+            if need_attn:
+                self.attn.append(self.BGALayers[i].attn)
+        x = self.dropout(x)
+        x = self.classifier(x)
+        return x
diff --git a/gammagl/layers/attention/bga_layer.py b/gammagl/layers/attention/bga_layer.py
new file mode 100644
index 00000000..2140e7c4
--- /dev/null
+++ b/gammagl/layers/attention/bga_layer.py
@@ -0,0 +1,147 @@
+from tqdm import tqdm
+import tensorlayerx as tlx
+import math
+
+
+class MultiHeadAttention(tlx.nn.Module):
+    ''' Multi-Head Attention module '''
+
+    def __init__(self, n_head, channels, dropout=0.1):
+        super(MultiHeadAttention, self).__init__()
+
+        if channels % n_head != 0:
+            # forward() reshapes to [..., n_head, channels // n_head]
+            raise ValueError('channels ({}) must be divisible by n_head ({})'.format(channels, n_head))
+
+        self.n_head = n_head
+        self.channels = channels
+        d_k = channels // n_head
+
+        self.w_qs = tlx.nn.Linear(in_features=channels, out_features=channels, b_init=None)
+        self.w_ks = tlx.nn.Linear(in_features=channels, out_features=channels, b_init=None)
+        self.w_vs = tlx.nn.Linear(in_features=channels, out_features=channels, b_init=None)
+        self.fc = tlx.nn.Linear(in_features=channels, out_features=channels, b_init=None)
+
+        self.temperature = d_k ** 0.5
+        self.dropout = tlx.nn.Dropout(p=dropout)
+
+    def forward(self, q, k, v, mask=None):
+        """Attend from ``q`` to ``k``/``v`` and add ``q`` back as a residual.
+
+        Positions where ``mask`` is 0 are set to -1e9 before the softmax. Returns
+        ``(output, attn)`` where ``attn`` holds the attention weights after dropout.
+        """
+        n_head = self.n_head
+        d_q = d_k = d_v = self.channels // n_head
+
+        # Get batch size and sequence length
+        q_shape = tlx.get_tensor_shape(q)
+        k_shape = tlx.get_tensor_shape(k)
+        v_shape = tlx.get_tensor_shape(v)
+
+        B_q, N_q = q_shape[0], q_shape[1]
+        B_k, N_k = k_shape[0], k_shape[1]
+        B_v, N_v = v_shape[0], v_shape[1]
+
+        residual = q
+
+        # Linear projections and reshape
+        q = tlx.reshape(self.w_qs(q), shape=[B_q, N_q, n_head, d_q])
+        k = tlx.reshape(self.w_ks(k), shape=[B_k, N_k, n_head, d_k])
+        v = tlx.reshape(self.w_vs(v), shape=[B_v, N_v, n_head, d_v])
+
+        # Transpose for attention
+        q = tlx.transpose(q, perm=[0, 2, 1, 3])
+        k = tlx.transpose(k, perm=[0, 2, 1, 3])
+        v = tlx.transpose(v, perm=[0, 2, 1, 3])
+
+        # Scaled Dot-Product Attention
+        attn = tlx.matmul(q / self.temperature, tlx.transpose(k, perm=[0, 1, 3, 2]))
+
+        if mask is not None:
+            mask = tlx.expand_dims(mask, axis=1)
+            attn = tlx.where(mask == 0, tlx.ones_like(attn) * (-1e9), attn)
+
+        attn = self.dropout(tlx.softmax(attn, axis=-1))
+        output = tlx.matmul(attn, v)
+
+        # Transpose back and reshape
+        output = tlx.transpose(output, perm=[0, 2, 1, 3])
+        output = tlx.reshape(output, shape=[B_q, N_q, -1])
+
+        # Final linear projection
+        output = self.fc(output)
+        output = output + residual
+
+        return output, attn
+
+
+class FFN(tlx.nn.Module):
+    ''' A two-feed-forward-layer module '''
+
+    def __init__(self, channels, dropout=0.1):
+        super(FFN, self).__init__()
+        self.lin1 = tlx.nn.Linear(in_features=channels, out_features=channels)  # position-wise
+        self.lin2 = tlx.nn.Linear(in_features=channels, out_features=channels)  # position-wise
+        self.layer_norm = tlx.nn.LayerNorm(normalized_shape=channels, epsilon=1e-6)
+        self.dropout = tlx.nn.Dropout(p=dropout)
+
+    def forward(self, x):
+        residual = x
+        x = self.layer_norm(x)
+        x = self.dropout(x)
+        x = tlx.relu(self.lin1(x))
+        x = self.lin2(x) + residual
+
+        return x
+
+
+class BGALayer(tlx.nn.Module):
+    """Attention among the nodes of each patch, optionally followed by attention across patches.
+
+    When ``forward`` is called with ``need_attn=True`` the node-level attention
+    weights of that call are kept in ``self.attn``.
+    """
+
+    def __init__(self, n_head, channels, use_patch_attn=True, dropout=0.1):
+        super(BGALayer, self).__init__()
+        self.node_norm = tlx.nn.LayerNorm(normalized_shape=channels, epsilon=1e-6)
+        self.node_transformer = MultiHeadAttention(n_head, channels, dropout)
+        self.patch_norm = tlx.nn.LayerNorm(normalized_shape=channels, epsilon=1e-6)
+        self.patch_transformer = MultiHeadAttention(n_head, channels, dropout)
+        self.node_ffn = FFN(channels, dropout)
+        self.patch_ffn = FFN(channels, dropout)
+        self.fuse_lin = tlx.nn.Linear(in_features=2 * channels, out_features=channels)
+        self.use_patch_attn = use_patch_attn
+        # read by BGA.forward after a call with need_attn=True
+        self.attn = None
+
+    def forward(self, x, patch, attn_mask=None, need_attn=False):
+        x = self.node_norm(x)
+
+        # gather the features of the nodes listed in every patch
+        patch_x = x[patch]
+        patch_x, attn = self.node_transformer(patch_x, patch_x, patch_x, attn_mask)
+        if need_attn:
+            self.attn = attn
+        patch_x = self.node_ffn(patch_x)
+
+        if self.use_patch_attn:
+            # one summary vector per patch, attended across patches
+            p = self.patch_norm(tlx.reduce_mean(patch_x, axis=1, keepdims=False))
+            p = tlx.expand_dims(p, axis=0)
+            p, _ = self.patch_transformer(p, p, p)
+            p = self.patch_ffn(p)
+            p = tlx.transpose(p, perm=[1, 0, 2])
+
+            # repeat the patch vector for every position of the patch
+            patch_shape = tlx.get_tensor_shape(patch)
+            p = tlx.tile(p, [1, patch_shape[1], 1])
+
+            z = tlx.concat([patch_x, p], axis=2)
+            patch_x = tlx.relu(self.fuse_lin(z)) + patch_x
+
+        # write the updated features back to their node positions
+        x[patch] = patch_x
+
+        return x
diff --git a/gammagl/models/__init__.py b/gammagl/models/__init__.py
index f7a1de31..6485723b 100644
--- a/gammagl/models/__init__.py
+++ b/gammagl/models/__init__.py
@@ -65,6 +65,7 @@
 from .rohehan import RoheHAN
 from .gcil import GCILModel, LogReg
 from .sgformer import SGFormerModel
+from .cobformer import CoBFormer
 
 __all__ = [
     'HeCo',
@@ -138,6 +139,7 @@
     'GCILModel',
     'LogReg',
     'sgformer',
+    'CoBFormer',
 ]
 
 classes = __all__
diff --git a/gammagl/models/cobformer.py b/gammagl/models/cobformer.py
new file mode 100644
index 00000000..38e67c37
--- /dev/null
+++ b/gammagl/models/cobformer.py
@@ -0,0 +1,42 @@
+import tensorlayerx as tlx
+from gammagl.models import GCNModel as GCN
+from gammagl.layers.attention.bga import BGA
+
+
+class CoBFormer(tlx.nn.Module):
+    """Runs a GCN and a BGA module on the same input and returns both outputs.
+
+    ``alpha`` and ``tau`` are only stored on the module; the forward pass
+    does not use them.
+    """
+
+    def __init__(self, num_nodes: int, in_channels: int, hidden_channels: int, out_channels: int,
+                 gcn_layers: int, layers: int, n_head: int, dropout1=0.5, dropout2=0.1,
+                 alpha=0.8, tau=0.5, use_patch_attn=True):
+        super(CoBFormer, self).__init__()
+        self.alpha = alpha
+        self.tau = tau
+        self.layers = layers
+        self.n_head = n_head
+        self.num_nodes = num_nodes
+        self.activation = tlx.ReLU()
+        self.gcn = GCN(
+            feature_dim=in_channels,
+            hidden_dim=hidden_channels,
+            num_class=out_channels,
+            drop_rate=dropout1,
+            num_layers=gcn_layers
+        )
+        self.bga = BGA(num_nodes, in_channels, hidden_channels, out_channels, layers, n_head,
+                       use_patch_attn, dropout1, dropout2)
+        self.attn = None
+
+    def forward(self, x, patch, edge_index, edge_weight=None, num_nodes=None, need_attn=False):
+        """Return ``(z1, z2)``: the GCN output and the BGA output."""
+        z1 = self.gcn(x, edge_index, edge_weight=edge_weight, num_nodes=num_nodes)
+        z2 = self.bga(x, patch, need_attn)
+
+        if need_attn:
+            self.attn = self.bga.attn
+
+        return z1, z2