diff --git a/large/Indian_pines.png b/large/Indian_pines.png new file mode 100644 index 0000000..590cc7c Binary files /dev/null and b/large/Indian_pines.png differ diff --git a/large/__pycache__/build_graph.cpython-39.pyc b/large/__pycache__/build_graph.cpython-39.pyc new file mode 100644 index 0000000..2c2f541 Binary files /dev/null and b/large/__pycache__/build_graph.cpython-39.pyc differ diff --git a/large/__pycache__/data_utils.cpython-39.pyc b/large/__pycache__/data_utils.cpython-39.pyc new file mode 100644 index 0000000..63191be Binary files /dev/null and b/large/__pycache__/data_utils.cpython-39.pyc differ diff --git a/large/__pycache__/dataset.cpython-39.pyc b/large/__pycache__/dataset.cpython-39.pyc new file mode 100644 index 0000000..a710ca5 Binary files /dev/null and b/large/__pycache__/dataset.cpython-39.pyc differ diff --git a/large/__pycache__/eval.cpython-39.pyc b/large/__pycache__/eval.cpython-39.pyc new file mode 100644 index 0000000..97c9366 Binary files /dev/null and b/large/__pycache__/eval.cpython-39.pyc differ diff --git a/large/__pycache__/gnns.cpython-39.pyc b/large/__pycache__/gnns.cpython-39.pyc new file mode 100644 index 0000000..4e42654 Binary files /dev/null and b/large/__pycache__/gnns.cpython-39.pyc differ diff --git a/large/__pycache__/logger.cpython-39.pyc b/large/__pycache__/logger.cpython-39.pyc new file mode 100644 index 0000000..c35b52d Binary files /dev/null and b/large/__pycache__/logger.cpython-39.pyc differ diff --git a/large/__pycache__/ours.cpython-39.pyc b/large/__pycache__/ours.cpython-39.pyc new file mode 100644 index 0000000..40d46cb Binary files /dev/null and b/large/__pycache__/ours.cpython-39.pyc differ diff --git a/large/__pycache__/parse.cpython-39.pyc b/large/__pycache__/parse.cpython-39.pyc new file mode 100644 index 0000000..cbdc219 Binary files /dev/null and b/large/__pycache__/parse.cpython-39.pyc differ diff --git a/large/arxiv.bat b/large/arxiv.bat new file mode 100644 index 0000000..78428ab --- /dev/null +++ b/large/arxiv.bat @@ -0,0 +1,7 @@ +@echo off +echo Running ogbn-arxiv... 
+python main.py --method sgformer --dataset ogbn-arxiv --metric acc --lr 0.001 --hidden_channels 256 --use_graph --graph_weight 0.5 ^ +--gnn_num_layers 3 --gnn_dropout 0.5 --gnn_weight_decay 0.0 --gnn_use_residual --gnn_use_weight --gnn_use_bn --gnn_use_act ^ +--trans_num_layers 1 --trans_dropout 0.5 --trans_weight_decay 0.0 --trans_use_residual --trans_use_weight --trans_use_bn ^ +--seed 123 --runs 5 --epochs 1000 --eval_step 9 --device 0 --data_dir D:\ogb_dataset\ogb\arxiv +pause \ No newline at end of file diff --git a/large/best_model.pt b/large/best_model.pt new file mode 100644 index 0000000..70638a1 Binary files /dev/null and b/large/best_model.pt differ diff --git a/large/build_graph.py b/large/build_graph.py new file mode 100644 index 0000000..61b5cbb --- /dev/null +++ b/large/build_graph.py @@ -0,0 +1,126 @@ +import numpy as np +import torch +from torch import long +from sklearn.metrics import pairwise_distances +from sklearn.neighbors import NearestNeighbors +from sklearn.decomposition import PCA +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.preprocessing import StandardScaler + + +def build_graph_by_pos(height: long, width: long, node_features=None): + """ + 简单的八向建图 8*height*width + """ + edge_index = [] + for i in range(height): + for j in range(width): + node_idx = i * width + j + if i > 0: # up + edge_index.append([node_idx, node_idx - width]) + if i < height - 1: # down + edge_index.append([node_idx, node_idx + width]) + if j > 0: # left + edge_index.append([node_idx, node_idx - 1]) + if j < width - 1: # right + edge_index.append([node_idx, node_idx + 1]) + if i > 0 and j > 0: # left and up + edge_index.append([node_idx, node_idx - width - 1]) + if i > 0 and j < width - 1: # right and up + edge_index.append([node_idx, node_idx - width + 1]) + if i < height - 1 and j > 0: # left and down + edge_index.append([node_idx, node_idx + width - 1]) + if i < height - 1 and j < width - 1: # right and down + edge_index.append([node_idx, node_idx + width + 1]) + edge_index = torch.tensor(edge_index, dtype=torch.long).t() + return edge_index + +def build_graph_by_Knn(height: long, width: long, node_features=None, k=10): + num_nodes = height * width + edge_index = [] + nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(node_features) + distances, indices = nbrs.kneighbors(node_features) + + # 遍历每个节点的近邻 + for i in range(num_nodes): + for j in range(1, k): # 从1开始,跳过自己 + neighbor_index = indices[i][j] + edge_index.append((i, neighbor_index)) + + # 将边列表转换为 NumPy 数组 + edge_index = torch.tensor(edge_index, dtype=torch.long).T # 转置为 [2, num_edges] + + return edge_index + + +def get_weight(node_features, edge_index, sigma=10): + """根据节点特征和边索引计算边权重,并返回张量形式的权重""" + num_edges = edge_index.shape[1] # 边的数量 + edge_weights = torch.zeros(num_edges, dtype=torch.float) + + def gaussian_kernel(x, y, sig=1.0): + return torch.exp(-torch.norm(x - y) ** 2 / (sig ** 2)) + + # 遍历每条边 + for i in range(num_edges): + start_node = edge_index[0, i] # 从节点 + end_node = edge_index[1, i] # 到节点 + + # 提取节点特征 + x_i = node_features[start_node] # 从节点特征 + x_j = node_features[end_node] # 到节点特征 + + # 计算高斯核权重 + edge_weights[i] = gaussian_kernel(x_i, x_j, sigma) + + return edge_weights + + +def build_graph_by_fix(node_fea, ground_truth, train_idx, row, col): + """ + 先对数据进行LDA降维之后 对降维光谱特征+位置特征进行knn建图 + 主要是加入pos 但是单纯的加入pos对于欧式距离的衡量改变不大,所以先降维成channel再增加pos信息 + 这样pos信息所占比重就能上升,以此查看与单纯的220个波段的knn之间的区别 + """ + edge_index = [] + k = 15 # 邻居数量 + weight_factor = 15 # 权重系数 + + x = 
node_fea[train_idx] + y = ground_truth[train_idx] + lda = LinearDiscriminantAnalysis() + # 调用fit方法 对训练集像素点的特征和标签进行拟合 + lda.fit(x, y - 1) + x_new = lda.transform(node_fea) + + all_indices = np.arange(x_new.shape[0]) # 计算 x_new 中所有节点的索引 + x_coords = all_indices // col # 计算 x 坐标 + y_coords = all_indices % col # 计算 y 坐标 + + # 将坐标信息添加到 x_new 中 + coords = np.column_stack((x_coords, y_coords)) # 合并 x 和 y 坐标 + x_new_with_coords = np.hstack((x_new, coords)) # 将坐标与 x_new 合并 + + # 进行最小-最大标准化 + x_min = np.min(x_new_with_coords, axis=0) # 每列的最小值 + x_max = np.max(x_new_with_coords, axis=0) # 每列的最大值 + + # 标准化 + x_normalized = (x_new_with_coords - x_min) / (x_max - x_min) + + x_weighted = x_normalized.copy() + x_weighted[:, -2:] *= weight_factor # 增加最后两个维度的权重 + + knn = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(x_weighted) + distances, indices = knn.kneighbors(x_weighted) + + # 遍历每个节点的近邻 + for i in range(row * col): + for j in range(1, k): # 从1开始,跳过自己 + neighbor_index = indices[i][j] + edge_index.append((i, neighbor_index)) + + # 将边列表转换为 NumPy 数组 + edge_index = torch.tensor(edge_index, dtype=torch.long).T # 转置为 [2, num_edges] + + return edge_index \ No newline at end of file diff --git a/large/data_utils.py b/large/data_utils.py index da76ac7..c74e4b1 100644 --- a/large/data_utils.py +++ b/large/data_utils.py @@ -1,15 +1,18 @@ import os from collections import defaultdict - +import random import torch import torch.nn.functional as F import numpy as np from scipy import sparse as sp -from sklearn.metrics import roc_auc_score, f1_score - +from sklearn.metrics import roc_auc_score, f1_score, cohen_kappa_score from torch_sparse import SparseTensor +import matplotlib.pyplot as plt +import spectral as spy +from matplotlib import cm from google_drive_downloader import GoogleDriveDownloader as gdd + def rand_train_test_idx(label, train_prop=.5, valid_prop=.25, ignore_negative=True): """ randomly splits label into train/valid/test splits """ if ignore_negative: @@ -36,6 +39,7 @@ def rand_train_test_idx(label, train_prop=.5, valid_prop=.25, ignore_negative=Tr return train_idx, valid_idx, test_idx + def load_fixed_splits(data_dir, dataset, name, protocol): splits_lst = [] if name in ['cora', 'citeseer', 'pubmed'] and protocol == 'semi': @@ -46,7 +50,7 @@ def load_fixed_splits(data_dir, dataset, name, protocol): splits_lst.append(splits) elif name in ['cora', 'citeseer', 'pubmed', 'chameleon', 'squirrel', 'film', 'cornell', 'texas', 'wisconsin']: for i in range(10): - splits_file_path = '{}/geom-gcn/splits/{}'.format(data_dir, name) + '_split_0.6_0.2_'+str(i)+'.npz' + splits_file_path = '{}/geom-gcn/splits/{}'.format(data_dir, name) + '_split_0.6_0.2_' + str(i) + '.npz' splits = {} with np.load(splits_file_path) as splits_file: splits['train'] = torch.BoolTensor(splits_file['train_mask']) @@ -58,8 +62,9 @@ def load_fixed_splits(data_dir, dataset, name, protocol): return splits_lst + def class_rand_splits(label, label_num_per_class, valid_num=500, test_num=1000): - train_idx, non_train_idx = [], [] + train_idx, non_train_idx = [], [] # 训练集、非训练集 idx = torch.arange(label.shape[0]) class_list = label.squeeze().unique() for i in range(class_list.shape[0]): @@ -72,10 +77,119 @@ def class_rand_splits(label, label_num_per_class, valid_num=500, test_num=1000): train_idx = torch.as_tensor(train_idx) non_train_idx = torch.as_tensor(non_train_idx) non_train_idx = non_train_idx[torch.randperm(non_train_idx.shape[0])] - valid_idx, test_idx = non_train_idx[:valid_num], non_train_idx[valid_num:valid_num+test_num] + 
valid_idx, test_idx = non_train_idx[:valid_num], non_train_idx[valid_num:valid_num + test_num] return train_idx, valid_idx, test_idx + +def hsi_splits(label, train_prop=0.1, valid_prop=0.01): + """ + hsi数据集划分 每个类都选取0.1的样本 + return idx都是tensor形式 + """ + classCount = label.max().item() + print(f"class count = {classCount}") + train_rand_idx = [] + for i in range(classCount): + idx = torch.where(label == i + 1)[0] + samplesCount = len(idx) + if samplesCount > 0: # 只有当类别样本存在时才进行抽样 + # 0~len随机抽取idx + rand_list = [j for j in range(samplesCount)] + rand_idx = random.sample(rand_list, + int(np.ceil(samplesCount * train_prop))) + rand_real_idx_per_class = idx[rand_idx] + train_rand_idx.append(rand_real_idx_per_class) + # train_rand_idx是二维数组形式[[1, 2], [4, 7][8, 11]] 需要转换成一维 + """ + train_idx = [] + # 使用 len() 函数获取 train_rand_idx 的长度 + for i in range(len(train_rand_idx)): + # 获取第 i 个类别的随机样本索引数组 + temp = train_rand_idx[i] + train_idx.extend(temp) + """ + train_idx = torch.cat(train_rand_idx).tolist() if train_rand_idx else [] + + # train_labels = label[train_idx] + # if (train_labels == 0).any(): + # print("Warning: Train indices contain background samples (label=0).") + # else: + # print("train_correct") + + # 这里还要将这些idx变成set去重 得到测试集 + train_set = set(train_idx) + + # 所有数据的set + """ + all_set = [i for i in range(len(label))] + all_set = set(all_set) + """ + all_set = set(range(len(label))) # fixed + + """ + # 得到背景元素下标 + background_set = torch.where(label == 0)[0] + background_set = set(background_set) + # 测试集 + test_set = all_set - background_set - train_set + # 转化成列表方便选取验证集元素 + test_set = list(test_set) + """ + + # 得到背景元素下标 + background_set = set(torch.where(label == 0)[0].tolist()) + + # 计算测试集 + test_set = all_set - background_set - train_set + + # 转换为列表 + test_set = list(test_set) + + # 检查测试集是否包含背景类 + test_labels = label[torch.tensor(test_set)] + + # 这个检查是为了确保测试集不包含背景类 + if (test_labels == 0).any(): + print("Warning: test indices contain background samples (label=0).") + else: + print("test correct") + + # 生成验证集 + val_count = int(valid_prop * (len(train_set) + len(test_set))) + valid_idx = random.sample(test_set, min(val_count, len(test_set))) # 防止val_count超出范围 + + # 更新测试集,排除验证集 + valid_set = set(valid_idx) + test_set = list(set(test_set) - valid_set) # 转换为列表并去重 + """ + # 验证集大小 + valCount = int(valid_prop * (len(train_set) + len(test_set))) + valid_idx = random.sample(test_set, valCount) + + # 求出剩余的测试集 + valid_set = set(valid_idx) + test_set = set(test_set) - valid_set + test_set = list(test_set) + """ + + # train_labels = label[train_idx] + # if (train_labels == 0).any(): + # print("Warning: Train indices contain background samples (label=0).") + # test_labels = label[test_set] + # if (test_labels == 0).any(): + # print("Warning: test indices contain background samples (label=0).") + # valid_labels = label[valid_idx] + # if (valid_labels == 0).any(): + # print("Warning: valid indices contain background samples (label=0).") + + # 最后返回tensor形式 + test_idx = torch.tensor(test_set) + valid_idx = torch.tensor(valid_idx) + train_idx = torch.as_tensor(train_idx) + return train_idx, valid_idx, test_idx + + def even_quantile_labels(vals, nclasses, verbose=True): """ partitions vals into nclasses by a quantile based split, where the first class is less than the 1/nclasses quantile, @@ -191,11 +305,12 @@ def gen_normalized_adjs(dataset): D_isqrt = deg.pow(-0.5) D_isqrt[D_isqrt == float('inf')] = 0 - DAD = D_isqrt.view(-1,1) * adj * D_isqrt.view(1,-1) - DA = D_isqrt.view(-1,1) * D_isqrt.view(-1,1) * adj - AD 
= adj * D_isqrt.view(1,-1) * D_isqrt.view(1,-1) + DAD = D_isqrt.view(-1, 1) * adj * D_isqrt.view(1, -1) + DA = D_isqrt.view(-1, 1) * D_isqrt.view(-1, 1) * adj + AD = adj * D_isqrt.view(1, -1) * D_isqrt.view(1, -1) return DAD, DA, AD + def eval_f1(y_true, y_pred): acc_list = [] y_true = y_true.detach().cpu().numpy() @@ -205,7 +320,104 @@ def eval_f1(y_true, y_pred): f1 = f1_score(y_true, y_pred, average='micro') acc_list.append(f1) - return sum(acc_list)/len(acc_list) + return sum(acc_list) / len(acc_list) + + +def eval_oa(y_true, y_pred): # 直接修改了标签导致下标越界 + with torch.no_grad(): + y_true_detached = y_true.detach().cpu().numpy() + y_true_detached = np.squeeze(y_true_detached) + y_true_detached -= 1 + + y_pred_probs = torch.softmax(y_pred, dim=-1) # 计算每个类的概率 + y_pred_labels = y_pred_probs.argmax(dim=-1, keepdim=True).detach().cpu().numpy() # 获取预测的类别 + + total_samples = y_true_detached.shape[0] # 总样本数 + correct_samples = np.sum(y_pred_labels.flatten() == y_true_detached) # 计算正确样本数 + + oa = correct_samples / total_samples if total_samples > 0 else 0.0 # 避免除以零 + return oa + +def eval_aa(y_true, y_pred, class_count: int): + with torch.no_grad(): + # 先获取正确下标 从0开始 + y_true_detached = y_true.detach().cpu().numpy() + y_true_detached = np.squeeze(y_true_detached) + y_true_detached -= 1 + + y_pred_probs = torch.softmax(y_pred, dim=-1) # 计算每个类的概率 + y_pred_labels = y_pred_probs.argmax(dim=-1, keepdim=True).detach().cpu().numpy() # 获取预测的类别 + + # 初始化每个种类的统计情况 + correct_counts = np.zeros(class_count, dtype=int) + total_counts = np.zeros(class_count, dtype=int) + class_acc = np.zeros(class_count) + + for i in range(len(y_true_detached)): + true_class = int(y_true_detached[i]) + pred_class = int(y_pred_labels[i]) + # 类别是0~classCount-1 + if 0 <= true_class < class_count: + total_counts[true_class] += 1 + if true_class == pred_class: + correct_counts[true_class] += 1 + + for i in range(class_count): + if total_counts[i] > 0: + class_acc[i] = correct_counts[i] / total_counts[i] + else: + class_acc[i] = 0.0 + + aa = np.mean(class_acc) + + return aa + +def eval_kappa(y_true, y_pred): # 检查标签一致性 + with torch.no_grad(): + y_true_detached = y_true.detach().cpu().numpy() + y_true_detached = np.squeeze(y_true_detached) + y_true_detached -= 1 + + # 计算预测下标 + y_pred_probs = torch.softmax(y_pred, dim=-1) # 计算每个类的概率 + # 一维的分类结果 argmax + y_pred_labels = y_pred_probs.argmax(dim=-1, keepdim=False).detach().cpu().numpy() # 获取预测的类别 + + kappa = cohen_kappa_score(y_true_detached.astype(np.int16), y_pred_labels.astype(np.int16)) + + return kappa + +def draw_Classification_Map(label, name: str, scale: float = 4.0, dpi: int = 400): + ''' + get classification map , then save to given path + :param label: classification label, 2D 二维数组 + :param name: saving path and file's name + :param scale: scale of image. 
If equals to 1, then saving-size is just the label-size + :param dpi: default is OK <===> 400点每英寸 + :return: null + ''' + cmap = cm.get_cmap('jet', label.max().item()) + plt.set_cmap(cmap) + + fig, ax = plt.subplots() + num_label = np.array(label) + v = spy.imshow(classes=num_label.astype(np.int16), fignum=fig.number) + # 关闭坐标轴的显示。 X,Y的可见性设置为false + ax.set_axis_off() + ax.xaxis.set_visible(False) + ax.yaxis.set_visible(False) + # 设置图像大小 + fig.set_size_inches(label.shape[1] * scale / dpi, label.shape[0] * scale / dpi) + # 'get current figure' 获取当前图像 + foo_fig = plt.gcf() + # 移除X轴和Y轴的主要刻度标记。nullLocataor + plt.gca().xaxis.set_major_locator(plt.NullLocator()) + plt.gca().yaxis.set_major_locator(plt.NullLocator()) + # 整子图参数,使得图像边缘没有空白 + plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) + foo_fig.savefig(name + '.png', format='png', transparent=True, dpi=dpi, pad_inches=0) + pass + def eval_acc(y_true, y_pred): acc_list = [] @@ -215,9 +427,9 @@ def eval_acc(y_true, y_pred): for i in range(y_true.shape[1]): is_labeled = y_true[:, i] == y_true[:, i] correct = y_true[is_labeled, i] == y_pred[is_labeled, i] - acc_list.append(float(np.sum(correct))/len(correct)) + acc_list.append(float(np.sum(correct)) / len(correct)) - return sum(acc_list)/len(acc_list) + return sum(acc_list) / len(acc_list) def eval_rocauc(y_true, y_pred): @@ -227,7 +439,7 @@ def eval_rocauc(y_true, y_pred): y_true = y_true.detach().cpu().numpy() if y_true.shape[1] == 1: # use the predicted class for single-class classification - y_pred = F.softmax(y_pred, dim=-1)[:,1].unsqueeze(1).cpu().numpy() + y_pred = F.softmax(y_pred, dim=-1)[:, 1].unsqueeze(1).cpu().numpy() else: y_pred = y_pred.detach().cpu().numpy() @@ -236,22 +448,24 @@ def eval_rocauc(y_true, y_pred): if np.sum(y_true[:, i] == 1) > 0 and np.sum(y_true[:, i] == 0) > 0: is_labeled = y_true[:, i] == y_true[:, i] score = roc_auc_score(y_true[is_labeled, i], y_pred[is_labeled, i]) - + rocauc_list.append(score) if len(rocauc_list) == 0: raise RuntimeError( 'No positively labeled data available. Cannot compute ROC-AUC.') - return sum(rocauc_list)/len(rocauc_list) + return sum(rocauc_list) / len(rocauc_list) -def convert_to_adj(edge_index,n_node): + +def convert_to_adj(edge_index, n_node): '''convert from pyg format edge_index to n by n adj matrix''' - adj=torch.zeros((n_node,n_node)) - row,col=edge_index - adj[row,col]=1 + adj = torch.zeros((n_node, n_node)) + row, col = edge_index + adj[row, col] = 1 return adj + def adj_mul(adj_i, adj, N): adj_i_sp = torch.sparse_coo_tensor(adj_i, torch.ones(adj_i.shape[1], dtype=torch.float).to(adj.device), (N, N)) adj_sp = torch.sparse_coo_tensor(adj, torch.ones(adj.shape[1], dtype=torch.float).to(adj.device), (N, N)) @@ -259,7 +473,10 @@ def adj_mul(adj_i, adj, N): adj_j = adj_j.coalesce().indices() return adj_j + import subprocess + + def get_gpu_memory_map(): """Get the current gpu usage. Returns @@ -278,7 +495,10 @@ def get_gpu_memory_map(): # gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory)) return gpu_memory + import subprocess + + def get_gpu_memory_map(): """Get the current gpu usage. 
Returns @@ -297,16 +517,18 @@ def get_gpu_memory_map(): # gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory)) return gpu_memory + def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) + dataset_drive_url = { - 'snap-patents' : '1ldh23TSY1PwXia6dU0MYcpyEgX-w3Hia', - 'pokec' : '1dNs5E7BrWJbgcHeQ_zuy5Ozp2tRCWG0y', - 'yelp-chi': '1fAXtTVQS4CfEk4asqrFw9EPmlUPGbGtJ', + 'snap-patents': '1ldh23TSY1PwXia6dU0MYcpyEgX-w3Hia', + 'pokec': '1dNs5E7BrWJbgcHeQ_zuy5Ozp2tRCWG0y', + 'yelp-chi': '1fAXtTVQS4CfEk4asqrFw9EPmlUPGbGtJ', } splits_drive_url = { - 'snap-patents' : '12xbBRqd8mtG_XkNLH8dRRNZJvVM4Pw-N', - 'pokec' : '1ZhpAiyTNc0cE_hhgyiqxnkKREHK7MK-_', -} \ No newline at end of file + 'snap-patents': '12xbBRqd8mtG_XkNLH8dRRNZJvVM4Pw-N', + 'pokec': '1ZhpAiyTNc0cE_hhgyiqxnkKREHK7MK-_', +} diff --git a/large/dataset.py b/large/dataset.py index dceb4c5..18d183a 100644 --- a/large/dataset.py +++ b/large/dataset.py @@ -7,7 +7,7 @@ from sklearn.preprocessing import label_binarize import torch_geometric.transforms as T -from data_utils import rand_train_test_idx, even_quantile_labels, to_sparse_tensor, dataset_drive_url, class_rand_splits +from data_utils import rand_train_test_idx, even_quantile_labels, to_sparse_tensor, dataset_drive_url, class_rand_splits, hsi_splits from torch_geometric.datasets import Planetoid, Amazon, Coauthor from torch_geometric.transforms import NormalizeFeatures @@ -25,6 +25,10 @@ from torch_geometric.utils import subgraph, k_hop_subgraph, to_undirected import pickle as pkl +import csv +import json +from build_graph import build_graph_by_pos, build_graph_by_Knn + class NCDataset(object): def __init__(self, name): """ @@ -52,17 +56,23 @@ def __init__(self, name): """ self.name = name # original name, e.g., ogbn-proteins - self.graph = {} + self.graph = {} # dictionary self.label = None + self.row = 0 + self.col = 0 def get_idx_split(self, split_type='random', train_prop=.5, valid_prop=.25, label_num_per_class=20): """ + hyperSpectral image use class division by label_num_per_class = 5 + train_prop: The proportion of dataset for train split. Between 0 and 1. valid_prop: The proportion of dataset for validation split. Between 0 and 1. 
""" if split_type == 'random': + # 忽略负样本 ignore_negative = False if self.name == 'ogbn-proteins' else True + train_idx, valid_idx, test_idx = rand_train_test_idx( self.label, train_prop=train_prop, valid_prop=valid_prop, ignore_negative=ignore_negative) split_idx = {'train': train_idx, @@ -73,15 +83,24 @@ def get_idx_split(self, split_type='random', train_prop=.5, valid_prop=.25, labe split_idx = {'train': train_idx, 'valid': valid_idx, 'test': test_idx} + # 进行高光谱图像数据集划分 + elif split_type == 'hsi': + train_idx, valid_idx, test_idx = hsi_splits(self.label, train_prop, valid_prop) + split_idx = {'train': train_idx, + 'valid': valid_idx, + 'test': test_idx} return split_idx + # return the structure of graph 图结构graph structure 标签ground truth def __getitem__(self, idx): assert idx == 0, 'This dataset has only one graph' return self.graph, self.label + # return num of graph def __len__(self): return 1 + # represent 方便打印调试 def __repr__(self): return '{}({})'.format(self.__class__.__name__, len(self)) @@ -124,18 +143,100 @@ def load_dataset(data_dir, dataname, sub_dataname=''): dataset = papers100M_sub(data_dir) elif dataname == 'ogbn-papers100M': dataset = load_papers100M(data_dir) - elif dataname in ('cora', 'citeseer', 'pubmed'): + elif dataname in ('cora', 'citeseer', 'pubmed'): dataset = load_planetoid_dataset(data_dir, dataname) - elif dataname in ('amazon-photo', 'amazon-computer'): + elif dataname in ('amazon-photo', 'amazon-computer'): dataset = load_amazon_dataset(data_dir, dataname) - elif dataname in ('coauthor-cs', 'coauthor-physics'): + elif dataname in ('coauthor-cs', 'coauthor-physics'): dataset = load_coauthor_dataset(data_dir, dataname) elif dataname in ('chameleon', 'cornell', 'film', 'squirrel', 'texas', 'wisconsin'): dataset = load_geom_gcn_dataset(data_dir, dataname) + elif dataname in 'Indian_pines': + dataset = load_indianPines_mat(data_dir) else: - raise ValueError('Invalid dataname') + raise ValueError('Invalid data_name') + return dataset + + +def load_indianPines_mat(data_dir): + # input .mat + data_mat = scipy.io.loadmat(f'{data_dir}\\indian_pines_corrected.mat') + data = data_mat['indian_pines_corrected'] + gt_mat = scipy.io.loadmat(f'{data_dir}\\Indian_pines_gt.mat') + gt = gt_mat['indian_pines_gt'] + + # test input + height, width, bands = data.shape + print(height, width, bands) + + # build graph obj + dataset = NCDataset('Indian_pines') + # prepare data + node_feat = data.reshape(-1, bands) + node_feat = torch.tensor(node_feat).float() + labels = gt.reshape(-1) + num_nodes = int(node_feat.shape[0]) + # 八向建图 这里返回的edge_index是tensor形式 + #edge_index = build_graph_by_pos(height, width) + # knn建图 实际效果并不好 谱域接近但是空域不接近 + # edge_index = build_graph_by_Knn(height, width, node_feat) + + dataset.row = height + dataset.col = width + dataset.graph = { + 'edge_feat': None, + 'node_feat': node_feat, + 'num_nodes': num_nodes,} + dataset.label = torch.tensor(labels, dtype=torch.long) return dataset +def load_pokec_mat(data_dir): + """ requires pokec.mat """ + if not path.exists(f'{data_dir}/pokec/pokec.mat'): + gdd.download_file_from_google_drive( + file_id= dataset_drive_url['pokec'], \ + dest_path=f'{data_dir}/pokec/pokec.mat', showsize=True) + + try: + fulldata = scipy.io.loadmat(f'{data_dir}/pokec/pokec.mat') + edge_index = fulldata['edge_index'] + node_feat = fulldata['node_feat'] + label = fulldata['label'] + except: + edge_index = np.load(f'{data_dir}/pokec/edge_index.npy') + node_feat = np.load(f'{data_dir}/pokec/node_feat.npy') + label = 
np.load(f'{data_dir}/pokec/label.npy') + + dataset = NCDataset('pokec') + edge_index = torch.tensor(edge_index, dtype=torch.long) + node_feat = torch.tensor(node_feat).float() + num_nodes = int(node_feat.shape[0]) + dataset.graph = {'edge_index': edge_index, + 'edge_feat': None, + 'node_feat': node_feat, + 'num_nodes': num_nodes} + + label = torch.tensor(label).flatten() + dataset.label = torch.tensor(label, dtype=torch.long) + + def load_fixed_splits(train_prop=0.5, val_prop=0.25): + dir = f'{data_dir}pokec/split_0.5_0.25' + tensor_split_idx = {} + if os.path.exists(dir): + tensor_split_idx['train'] = torch.as_tensor(np.loadtxt(dir + '/pokec_train.txt'), dtype=torch.long) + tensor_split_idx['valid'] = torch.as_tensor(np.loadtxt(dir + '/pokec_valid.txt'), dtype=torch.long) + tensor_split_idx['test'] = torch.as_tensor(np.loadtxt(dir + '/pokec_test.txt'), dtype=torch.long) + else: + os.makedirs(dir) + tensor_split_idx['train'], tensor_split_idx['valid'], tensor_split_idx['test'] \ + = rand_train_test_idx(dataset.label, train_prop=train_prop, valid_prop=val_prop) + np.savetxt(dir + '/pokec_train.txt', tensor_split_idx['train'], fmt='%d') + np.savetxt(dir + '/pokec_valid.txt', tensor_split_idx['valid'], fmt='%d') + np.savetxt(dir + '/pokec_test.txt', tensor_split_idx['test'], fmt='%d') + return tensor_split_idx + + dataset.load_fixed_splits = load_fixed_splits + return dataset def load_twitch_dataset(data_dir, lang): assert lang in ('DE', 'ENGB', 'ES', 'FR', 'PTBR', 'RU', 'TW'), 'Invalid dataset' @@ -368,54 +469,6 @@ def ogb_idx_to_tensor(): return dataset -def load_pokec_mat(data_dir): - """ requires pokec.mat """ - if not path.exists(f'{data_dir}/pokec/pokec.mat'): - gdd.download_file_from_google_drive( - file_id= dataset_drive_url['pokec'], \ - dest_path=f'{data_dir}/pokec/pokec.mat', showsize=True) - - try: - fulldata = scipy.io.loadmat(f'{data_dir}/pokec/pokec.mat') - edge_index = fulldata['edge_index'] - node_feat = fulldata['node_feat'] - label = fulldata['label'] - except: - edge_index = np.load(f'{data_dir}/pokec/edge_index.npy') - node_feat = np.load(f'{data_dir}/pokec/node_feat.npy') - label = np.load(f'{data_dir}/pokec/label.npy') - - dataset = NCDataset('pokec') - edge_index = torch.tensor(edge_index, dtype=torch.long) - node_feat = torch.tensor(node_feat).float() - num_nodes = int(node_feat.shape[0]) - dataset.graph = {'edge_index': edge_index, - 'edge_feat': None, - 'node_feat': node_feat, - 'num_nodes': num_nodes} - - label = torch.tensor(label).flatten() - dataset.label = torch.tensor(label, dtype=torch.long) - - def load_fixed_splits(train_prop=0.5, val_prop=0.25): - dir = f'{data_dir}pokec/split_0.5_0.25' - tensor_split_idx = {} - if os.path.exists(dir): - tensor_split_idx['train'] = torch.as_tensor(np.loadtxt(dir + '/pokec_train.txt'), dtype=torch.long) - tensor_split_idx['valid'] = torch.as_tensor(np.loadtxt(dir + '/pokec_valid.txt'), dtype=torch.long) - tensor_split_idx['test'] = torch.as_tensor(np.loadtxt(dir + '/pokec_test.txt'), dtype=torch.long) - else: - os.makedirs(dir) - tensor_split_idx['train'], tensor_split_idx['valid'], tensor_split_idx['test'] \ - = rand_train_test_idx(dataset.label, train_prop=train_prop, valid_prop=val_prop) - np.savetxt(dir + '/pokec_train.txt', tensor_split_idx['train'], fmt='%d') - np.savetxt(dir + '/pokec_valid.txt', tensor_split_idx['valid'], fmt='%d') - np.savetxt(dir + '/pokec_test.txt', tensor_split_idx['test'], fmt='%d') - return tensor_split_idx - - dataset.load_fixed_splits = load_fixed_splits - return dataset - def 
load_snap_patents_mat(data_dir, nclass=5): if not path.exists(f'{data_dir}snap_patents.mat'): p = dataset_drive_url['snap-patents'] diff --git a/large/eval.py b/large/eval.py index 693e9b5..2c87590 100644 --- a/large/eval.py +++ b/large/eval.py @@ -1,6 +1,6 @@ import torch import torch.nn.functional as F - +from data_utils import eval_aa, eval_kappa from torch_geometric.utils import subgraph @torch.no_grad() @@ -9,28 +9,38 @@ def evaluate(model, dataset, split_idx, eval_func, criterion, args, result=None) out = result else: model.eval() - out = model(dataset.graph['node_feat'], dataset.graph['edge_index']) + out = model(dataset.graph['node_feat'], dataset.graph['edge_index'], dataset.graph['edge_weight']) - train_acc = eval_func( - dataset.label[split_idx['train']], out[split_idx['train']]) - valid_acc = eval_func( - dataset.label[split_idx['valid']], out[split_idx['valid']]) - test_acc = eval_func( + test_oa = eval_func( dataset.label[split_idx['test']], out[split_idx['test']]) if args.dataset in ('yelp-chi', 'deezer-europe', 'twitch-e', 'fb100', 'ogbn-proteins'): if dataset.label.shape[1] == 1: - true_label = F.one_hot(dataset.label, dataset.label.max() + 1).squeeze(1) + if torch.cuda.is_available(): + true_label = F.one_hot(dataset.label, dataset.label.max() + 1).squeeze(1).to("cuda:0") + else: + true_label = F.one_hot(dataset.label, dataset.label.max() + 1).squeeze(1).to("cpu") else: true_label = dataset.label valid_loss = criterion(out[split_idx['valid']], true_label.squeeze(1)[ split_idx['valid']].to(torch.float)) + elif args.dataset in 'Indian_pines': + valid_idx = split_idx['valid'] + valid_labels = dataset.label[valid_idx].squeeze(1).to("cuda:0") + valid_labels -= 1 + valid_loss = criterion(out[valid_idx], valid_labels) else: out = F.log_softmax(out, dim=1) + # debug valid_loss = criterion( out[split_idx['valid']], dataset.label.squeeze(1)[split_idx['valid']]) - return train_acc, valid_acc, test_acc, valid_loss, out + # return (train_oa, valid_oa, test_oa, valid_loss, + # train_aa, valid_aa, test_aa, + # train_kpp, valid_kpp, test_kpp, + # out) + return test_oa, valid_loss, out + @torch.no_grad() def evaluate_large(model, dataset, split_idx, eval_func, criterion, args, device="cpu", result=None): diff --git a/large/logger.py b/large/logger.py index 1ea8d32..ebef61b 100644 --- a/large/logger.py +++ b/large/logger.py @@ -4,37 +4,52 @@ class Logger(object): """ Adapted from https://github.com/snap-stanford/ogb/ """ def __init__(self, runs, info=None): self.info = info - self.results = [[] for _ in range(runs)] + self.results = [[] for _ in range(runs)] # run个空列表的列表 def add_result(self, run, result): - assert len(result) == 4 + assert len(result) == 2 # check length 这里增加了 aa, kpp 6个参数。 从4->10 + # version2 = 这里只记录oa/loss/保存最优模型 最后评测即可 assert run >= 0 and run < len(self.results) - self.results[run].append(result) + self.results[run].append(result) # 对应列表增加结果 - def print_statistics(self, run=None, mode='max_acc'): + def print_best_epoch(self, step=10): + result = torch.tensor(self.results[0]) + ind = result[:, 1].argmin().item() + print(f'Chosen epoch: {(ind+1)*step}') + print(f"Final Valid Loss: {result[ind, 1]:.4f}") + + def print_statistics(self, run=None, mode='max_acc', step=10): + # 不再代码改进阶段不再使用 if run is not None: result = 100 * torch.tensor(self.results[run]) argmax = result[:, 1].argmax().item() argmin = result[:, 3].argmin().item() if mode == 'max_acc': - ind = argmax + ind = argmax # 选择训练集最大精确度 else: ind = argmin - print_str=f'Run {run + 1:02d}:' + \ - f'Highest Train: 
{result[:, 0].max():.2f} ' + \ - f'Highest Valid: {result[:, 1].max():.2f} ' + \ - f'Highest Test: {result[:, 2].max():.2f}\n' + \ - f'Chosen epoch: {ind+1} ' + \ - f'Final Train: {result[ind, 0]:.2f} ' + \ - f'Final Test: {result[ind, 2]:.2f}' + print_str = f'Run {run + 1:02d}:' + \ + f'Highest Train OA: {result[:, 0].max():.2f} ' + \ + f'Highest Valid OA: {result[:, 1].max():.2f} ' + \ + f'Highest Test OA: {result[:, 2].max():.2f}\n' + \ + f'Chosen epoch: {(ind+1)*step}\n' + \ + f'Final Train OA: {result[ind, 0]:.2f} ' + \ + f'Final Valid OA: {result[ind, 1]:.2f} ' + \ + f'Final Test OA: {result[ind, 2]:.2f}\n' + \ + f'Final Train AA: {result[ind, 4]:.2f} ' + \ + f'Final Valid AA: {result[ind, 5]:.2f} ' + \ + f'Final Test AA: {result[ind, 6]:.2f}\n' + \ + f'Final Train KPP: {result[ind, 7]:.2f} ' + \ + f'Final Valid KPP: {result[ind, 8]:.2f} ' + \ + f'Final Test KPP: {result[ind, 9]:.2f}\n' print(print_str) - self.test=result[ind, 2] + self.test = result[ind, 2] # store the chosen-epoch test OA else: best_results = [] max_val_epoch=0 - - for r in self.results: - r=100*torch.tensor(r) + for r in self.results: # loop over runs + r = 100*torch.tensor(r) + # take the maximum over all epochs train1 = r[:, 0].max().item() test1 = r[:, 2].max().item() valid = r[:, 1].max().item() @@ -47,6 +62,7 @@ def print_statistics(self, run=None, mode='max_acc'): test2 = r[r[:, 3].argmin(), 2].item() best_results.append((train1, test1, valid, train2, test2)) + # best_result holds one row of results per run best_result = torch.tensor(best_results) print(f'All runs:') @@ -57,16 +73,16 @@ def print_statistics(self, run=None, mode='max_acc'): r = best_result[:, 2] print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}') r = best_result[:, 3] - print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}') + print(f'Final Train OA: {r.mean():.2f} ± {r.std():.2f}') r = best_result[:, 4] - print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}') + print(f'Final Test OA: {r.mean():.2f} ± {r.std():.2f}') - self.test=r.mean() + self.test = r.mean() import os def save_result(args, results): if not os.path.exists(f'results/{args.dataset}'): os.makedirs(f'results/{args.dataset}') filename = f'results/{args.dataset}/{args.method}.csv' print(f"Saving results to {filename}") with open(f"{filename}", 'a+') as write_obj: diff --git a/large/main.py b/large/main.py index e86023a..e385d14 100644 --- a/large/main.py +++ b/large/main.py @@ -11,9 +11,11 @@ from logger import Logger, save_result from dataset import load_dataset from data_utils import normalize, gen_normalized_adjs, eval_acc, eval_rocauc, eval_f1, to_sparse_tensor, \ - load_fixed_splits, adj_mul, get_gpu_memory_map, count_parameters + load_fixed_splits, adj_mul, get_gpu_memory_map, count_parameters, eval_oa, draw_Classification_Map, \ + eval_aa, eval_kappa from eval import evaluate from parse import parse_method, parser_add_main_args +from build_graph import get_weight, build_graph_by_fix import time import pickle @@ -29,14 +31,18 @@ def fix_seed(seed): torch.cuda.manual_seed(seed) torch.backends.cudnn.deterministic = True -### Parse args ### +# ---- Parse args ---- +# Create the argument parser and give it a description parser = argparse.ArgumentParser(description='Training Pipeline for Node Classification') +# parser_add_main_args registers all command-line arguments parser_add_main_args(parser) args = parser.parse_args() print(args) +# Fix the random seeds fix_seed(args.seed) +# Select the device if args.cpu: device = torch.device("cpu") else: @@ -45,11 +51,13 @@ def fix_seed(seed): ### Load and preprocess data ### dataset = load_dataset(args.data_dir, args.dataset, args.sub_dataset) + 
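The Indian_pines branch further down requests split_type='hsi', implemented by hsi_splits in data_utils.py. A condensed sketch of the same per-class sampling idea (illustrative only; the helper name is not part of the diff, and a 1-D label vector with 0 marking background pixels is assumed):

import random
import numpy as np
import torch

def hsi_split_sketch(label, train_prop=0.1, valid_prop=0.01):
    # per class: a fixed fraction of its labelled pixels goes to the train set
    train_idx = []
    for c in range(1, int(label.max().item()) + 1):
        idx = torch.where(label == c)[0].tolist()
        if idx:
            train_idx += random.sample(idx, int(np.ceil(len(idx) * train_prop)))
    labelled = set(torch.where(label != 0)[0].tolist())
    rest = list(labelled - set(train_idx))
    # a small random share of the remaining labelled pixels becomes the validation set
    val_count = min(int(valid_prop * len(labelled)), len(rest))
    valid_idx = random.sample(rest, val_count)
    test_idx = list(set(rest) - set(valid_idx))
    return torch.tensor(train_idx), torch.tensor(valid_idx), torch.tensor(test_idx)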
+# 转化维度 [num] ---> [num, 1] 将一维tensor转换为二维tensor if len(dataset.label.shape) == 1: dataset.label = dataset.label.unsqueeze(1) -dataset.label = dataset.label.to(device) +# print(len(dataset.label.shape)) -# get the splits for all runs +# get the splits for all runs 划分数据集 if args.rand_split: split_idx_lst = [dataset.get_idx_split(train_prop=args.train_prop, valid_prop=args.valid_prop) for _ in range(args.runs)] @@ -59,34 +67,56 @@ def fix_seed(seed): elif args.dataset in ['ogbn-proteins', 'ogbn-arxiv', 'ogbn-products']: split_idx_lst = [dataset.load_fixed_splits() for _ in range(args.runs)] +# hsi划分数据集 +elif args.dataset in 'Indian_pines': + split_idx_lst = [dataset.get_idx_split(split_type='hsi', train_prop=0.1, valid_prop=0.01) + for _ in range(args.runs)] else: split_idx_lst = load_fixed_splits(args.data_dir, dataset, name=args.dataset, protocol=args.protocol) +# 创建边集 +dataset.graph['edge_index'] = build_graph_by_fix(dataset.graph['node_feat'], dataset.label, + split_idx_lst[0]['train'], dataset.row, dataset.col) + ### Basic information of datasets ### n = dataset.graph['num_nodes'] e = dataset.graph['edge_index'].shape[1] # infer the number of classes for non one-hot and one-hot labels -c = max(dataset.label.max().item() + 1, dataset.label.shape[1]) +c = max(dataset.label.max().item(), dataset.label.shape[1]) d = dataset.graph['node_feat'].shape[1] print(f"dataset {args.dataset} | num nodes {n} | num edge {e} | num node feats {d} | num classes {c}") -# whether or not to symmetrize -if not args.directed and args.dataset != 'ogbn-proteins': +# symmetrize 对称化 +if not args.directed and args.dataset not in ['ogbn-proteins']: dataset.graph['edge_index'] = to_undirected(dataset.graph['edge_index']) +# 添加自环 dataset.graph['edge_index'], _ = remove_self_loops(dataset.graph['edge_index']) dataset.graph['edge_index'], _ = add_self_loops(dataset.graph['edge_index'], num_nodes=n) -dataset.graph['edge_index'], dataset.graph['node_feat'] = \ - dataset.graph['edge_index'].to(device), dataset.graph['node_feat'].to(device) + +# 计算边权 +dataset.graph['edge_weight'] = get_weight(dataset.graph['node_feat'], dataset.graph['edge_index']) +dataset.graph['edge_weight'] = dataset.graph['edge_weight'].to(device) + + +# 将edge_index和node_fea移动到GPU上 +# ver2 添加了label到GPU上 +dataset.graph['edge_index'], dataset.graph['node_feat'], dataset.label = \ + dataset.graph['edge_index'].to(device), dataset.graph['node_feat'].to(device), \ + dataset.label.to(device) + + ### Load method ### model = parse_method(args, c, d, device) ### Loss function (Single-class, Multi-class) ### if args.dataset in ('yelp-chi', 'deezer-europe', 'twitch-e', 'fb100', 'ogbn-proteins'): - criterion = nn.BCEWithLogitsLoss() + criterion = nn.BCEWithLogitsLoss() # 使用于二分类 二元交叉熵损失函数带 +elif args.dataset in 'Indian_pines': + criterion = nn.CrossEntropyLoss() # 使用于多分类 else: criterion = nn.NLLLoss() @@ -95,6 +125,8 @@ def fix_seed(seed): eval_func = eval_rocauc elif args.metric == 'f1': eval_func = eval_f1 +elif args.metric == 'oa': + eval_func = eval_oa else: eval_func = eval_acc @@ -110,6 +142,7 @@ def fix_seed(seed): else: split_idx = split_idx_lst[run] train_idx = split_idx['train'].to(device) + # 重置参数 model.reset_parameters() if args.method == 'sgformer': optimizer = torch.optim.Adam([ @@ -120,39 +153,99 @@ def fix_seed(seed): else: optimizer = torch.optim.Adam( model.parameters(), weight_decay=args.weight_decay, lr=args.lr) - best_val = float('-inf') + # 记录最佳损失 + best_loss = float('inf') for epoch in range(args.epochs): model.train() - optimizer.zero_grad() 
+ optimizer.zero_grad() # clear gradients train_start = time.time() - out = model(dataset.graph['node_feat'], dataset.graph['edge_index']) + # out returns the extracted node features + out = model(dataset.graph['node_feat'], dataset.graph['edge_index'], dataset.graph['edge_weight']) if args.dataset in ('yelp-chi', 'deezer-europe', 'twitch-e', 'fb100', 'ogbn-proteins'): if dataset.label.shape[1] == 1: - true_label = F.one_hot(dataset.label, dataset.label.max() + 1).squeeze(1) + true_label = F.one_hot(dataset.label, dataset.label.max() + 1).squeeze(1).to(device) else: true_label = dataset.label loss = criterion(out[train_idx], true_label.squeeze(1)[ train_idx].to(torch.float)) + elif args.dataset in 'Indian_pines': + train_labels = dataset.label[train_idx].squeeze(1).to(device) + train_labels -= 1 + loss = criterion(out[train_idx], train_labels) else: out = F.log_softmax(out, dim=1) loss = criterion( out[train_idx], dataset.label.squeeze(1)[train_idx]) loss.backward() optimizer.step() - if epoch % args.eval_step == 0: result = evaluate(model, dataset, split_idx, eval_func, criterion, args) - logger.add_result(run, result[:-1]) - + cur_loss = result[1] # current validation loss of the model + logger.add_result(run, result[:-1]) # check result length + # default args.display_step=1 if epoch % args.display_step == 0: print_str = f'Epoch: {epoch:02d}, ' + \ - f'Loss: {loss:.4f}, ' + \ - f'Train: {100 * result[0]:.2f}%, ' + \ - f'Valid: {100 * result[1]:.2f}%, ' + \ - f'Test: {100 * result[2]:.2f}%' + f'Train Loss: {loss:.4f}, ' + \ + f'Valid Loss: {result[1]:.4f}, ' + \ + f'Test OA: {100 * result[0]:.2f}% ' print(print_str) - logger.print_statistics(run) - -logger.print_statistics() \ No newline at end of file + if cur_loss < best_loss: + best_loss = cur_loss # update the best validation loss + # save the model checkpoint + torch.save(model.state_dict(), "best_model.pt") # saved to the current folder + print(f'Saved best model to current folder, epoch = {epoch}\n') + # logger.print_statistics(run, mode="minLoss", step=args.eval_step) + + torch.cuda.empty_cache() + with torch.no_grad(): + model.load_state_dict(torch.load("best_model.pt")) + model.eval() + out = model(dataset.graph['node_feat'], dataset.graph['edge_index'], dataset.graph['edge_weight']) + # extracted features + + # oa + test_oa = eval_func( + dataset.label[split_idx['test']], out[split_idx['test']]) + train_oa = eval_func( + dataset.label[split_idx['train']], out[split_idx['train']]) + valid_oa = eval_func( + dataset.label[split_idx['valid']], out[split_idx['valid']]) + + # aa + train_aa = eval_aa( + dataset.label[split_idx['train']], out[split_idx['train']], dataset.label.max()) + valid_aa = eval_aa( + dataset.label[split_idx['valid']], out[split_idx['valid']], dataset.label.max()) + test_aa = eval_aa( + dataset.label[split_idx['test']], out[split_idx['test']], dataset.label.max()) + + # kpp + train_kpp = eval_kappa( + dataset.label[split_idx['train']], out[split_idx['train']]) + valid_kpp = eval_kappa( + dataset.label[split_idx['valid']], out[split_idx['valid']]) + test_kpp = eval_kappa( + dataset.label[split_idx['test']], out[split_idx['test']]) + + logger.print_best_epoch() + # print the final results: OA, AA and Kappa as percentages + print(f"Final Train OA: {train_oa * 100:.2f}%") + print(f"Final Valid OA: {valid_oa * 100:.2f}%") + print(f"Final Test OA: {test_oa * 100:.2f}%") + print(f"Final Train AA: {train_aa * 100:.2f}%") + print(f"Final Valid AA: {valid_aa * 100:.2f}%") + print(f"Final Test AA: {test_aa * 100:.2f}%") + print(f"Final Train Kappa: {train_kpp * 100:.2f}%") + print(f"Final Valid Kappa: {valid_kpp * 100:.2f}%") + print(f"Final Test Kappa: {test_kpp * 100:.2f}%") 
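The OA, AA and Kappa printed above come from eval_oa, eval_aa and eval_kappa in data_utils.py. For reference, all three can also be read off a single confusion matrix; a small illustrative sketch (the function name is hypothetical, labels assumed to be 0-based integers):

import numpy as np
from sklearn.metrics import confusion_matrix, cohen_kappa_score

def oa_aa_kappa(y_true, y_pred, class_count):
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(class_count))
    oa = np.trace(cm) / cm.sum()                              # overall accuracy
    per_class = np.diag(cm) / np.maximum(cm.sum(axis=1), 1)   # per-class recall
    aa = per_class.mean()                                     # average accuracy
    kappa = cohen_kappa_score(y_true, y_pred)                 # chance-corrected agreement
    return oa, aa, kappa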
+ + # draw graph + out = F.softmax(out, dim=-1) + predicted_labels = torch.argmax(out, 1).reshape([dataset.row, dataset.col]).cpu() + 1 + draw_Classification_Map(predicted_labels, dataset.name) + torch.cuda.empty_cache() + + +#logger.print_statistics() diff --git a/large/ours.py b/large/ours.py index 3d68409..7b495cf 100644 --- a/large/ours.py +++ b/large/ours.py @@ -22,22 +22,23 @@ def __init__(self, in_channels, out_channels, use_weight=True, use_init=False): def reset_parameters(self): self.W.reset_parameters() - def forward(self, x, edge_index, x0): + def forward(self, x, edge_index, x0, edge_weight): N = x.shape[0] - row, col = edge_index - d = degree(col, N).float() - d_norm_in = (1. / d[col]).sqrt() + row, col = edge_index # 记录起点、终点 + d = degree(col, N).float() # 记录col中每个节点出现的次数 + d_norm_in = (1. / d[col]).sqrt() # 每个起点的标准化Din d_norm_out = (1. / d[row]).sqrt() value = torch.ones_like(row) * d_norm_in * d_norm_out value = torch.nan_to_num(value, nan=0.0, posinf=0.0, neginf=0.0) + value = torch.maximum(value, torch.tensor(0.1)) # 确保没有小于0.1的值 trick adj = SparseTensor(row=col, col=row, value=value, sparse_sizes=(N, N)) - x = matmul(adj, x) # [N, D] + x = matmul(adj, x) # 特征聚合 if self.use_init: x = torch.cat([x, x0], 1) x = self.W(x) elif self.use_weight: - x = self.W(x) + x = self.W(x) # 对聚合后的特征进行线性变换 学习特征 return x @@ -52,6 +53,7 @@ def __init__(self, in_channels, hidden_channels, num_layers=2, dropout=0.5, use_ self.bns = nn.ModuleList() self.bns.append(nn.BatchNorm1d(hidden_channels)) + for _ in range(num_layers): self.convs.append( GraphConvLayer(hidden_channels, hidden_channels, use_weight, use_init)) @@ -71,26 +73,26 @@ def reset_parameters(self): for fc in self.fcs: fc.reset_parameters() - def forward(self, x, edge_index): - layer_ = [] + def forward(self, x, edge_index, edge_weight): # 从SGFormer处得到的边权 + layer_ = [] # 保存每层的输出 用于残差连接 - x = self.fcs[0](x) + x = self.fcs[0](x) # 通过第一个fc进行通道对齐 if self.use_bn: x = self.bns[0](x) x = self.activation(x) - x = F.dropout(x, p=self.dropout, training=self.training) + x = F.dropout(x, p=self.dropout, training=self.training) # 随机丢弃神经元 - layer_.append(x) + layer_.append(x) # 输出保存,残差连接使用 - for i, conv in enumerate(self.convs): - x = conv(x, edge_index, layer_[0]) + for i, conv in enumerate(self.convs): # 遍历卷积层实例 + x = conv(x, edge_index, layer_[0], edge_weight) if self.use_bn: x = self.bns[i+1](x) if self.use_act: x = self.activation(x) x = F.dropout(x, p=self.dropout, training=self.training) if self.use_residual: - x = x + layer_[-1] + x = x + layer_[-1] # 与上一层的输出相加 return x class TransConvLayer(nn.Module): @@ -247,7 +249,7 @@ def __init__(self, in_channels, hidden_channels, out_channels, self.trans_conv = TransConv(in_channels, hidden_channels, trans_num_layers, trans_num_heads, trans_dropout, trans_use_bn, trans_use_residual, trans_use_weight, trans_use_act) self.graph_conv = GraphConv(in_channels, hidden_channels, gnn_num_layers, gnn_dropout, gnn_use_bn, gnn_use_residual, gnn_use_weight, gnn_use_init, gnn_use_act) self.use_graph = use_graph - self.graph_weight = graph_weight + self.graph_weight = graph_weight # 图卷级的权重 self.aggregate = aggregate @@ -258,20 +260,23 @@ def __init__(self, in_channels, hidden_channels, out_channels, else: raise ValueError(f'Invalid aggregate type:{aggregate}') + # 获取各层之间的可训练参数 放入list中 self.params1 = list(self.trans_conv.parameters()) self.params2 = list(self.graph_conv.parameters()) if self.graph_conv is not None else [] self.params2.extend(list(self.fc.parameters())) - def forward(self, x, edge_index): + def 
forward(self, x, edge_index, edge_weight): # edge weights are passed in here x1 = self.trans_conv(x) if self.use_graph: - x2 = self.graph_conv(x, edge_index) + x2 = self.graph_conv(x, edge_index, edge_weight) # edge-weighted version if self.aggregate == 'add': x = self.graph_weight * x2 + (1 - self.graph_weight) * x1 else: + # concatenate the transformer and GNN branches x = torch.cat((x1, x2), dim=1) else: x = x1 + # fully connected output layer x = self.fc(x) return x diff --git a/large/parse.py b/large/parse.py index 7a33da3..08d0dc0 100644 --- a/large/parse.py +++ b/large/parse.py @@ -1,7 +1,11 @@ from gnns import * from ours import * +# This imports ours.py (the authors' method) together with the baseline methods, so only ours.py needs attention here def parse_method(args, c, d, device): + """ + function: build the requested graph neural network (GNN) model from the parsed arguments + """ if args.method == 'gcn': model = GCN(in_channels=d, hidden_channels=args.hidden_channels, @@ -41,9 +45,10 @@ def parse_method(args, c, d, device): raise ValueError('Invalid method') return model - def parser_add_main_args(parser): - # dataset and evaluation + """ + function: dataset and evaluation; set the command-line arguments, including their defaults + """ parser.add_argument('--dataset', type=str, default='cora') parser.add_argument('--sub_dataset', type=str, default='') parser.add_argument('--data_dir', type=str, default='../../../NodeFormer/data/') @@ -66,7 +71,7 @@ def parser_add_main_args(parser): parser.add_argument('--rand_split_class', action='store_true', help='use random splits with a fixed number of labeled nodes for each class') parser.add_argument('--label_num_per_class', type=int, default=20, help='labeled nodes randomly selected') - parser.add_argument('--metric', type=str, default='acc', choices=['acc', 'rocauc', 'f1'], + parser.add_argument('--metric', type=str, default='acc', choices=['acc', 'rocauc', 'f1', 'oa'], help='evaluation metric') # gnn branch diff --git a/large/partRun.bat b/large/partRun.bat new file mode 100644 index 0000000..20d4135 --- /dev/null +++ b/large/partRun.bat @@ -0,0 +1,6 @@ +@echo off +echo Running Indian_pines... +python main.py --method sgformer --dataset Indian_pines --metric oa --lr 0.001 --hidden_channels 256 --use_graph --graph_weight 0.5 ^ + --gnn_num_layers 3 --gnn_dropout 0.5 --gnn_weight_decay 0.0 --gnn_use_residual --gnn_use_weight --gnn_use_bn --gnn_use_act ^ + --trans_num_layers 1 --trans_dropout 0.5 --trans_weight_decay 0.0 --trans_use_residual --trans_use_weight --trans_use_bn ^ + --seed 123 --runs 1 --epochs 5000 --eval_step 10 --device 0 --data_dir D:\hsi_dataset\Indian_pines diff --git a/large/requirements.txt b/large/requirements.txt index ef3e5d5..a034c3c 100644 --- a/large/requirements.txt +++ b/large/requirements.txt @@ -3,7 +3,7 @@ networkx==2.6.1 numpy==1.19.2 ogb==1.3.1 scikit_learn==1.1.3 scipy==1.6.2 torch==1.9.0 torch_geometric==1.7.2 torch_scatter==2.0.7 diff --git a/large/run.bat b/large/run.bat new file mode 100644 index 0000000..14cc330 --- /dev/null +++ b/large/run.bat @@ -0,0 +1,27 @@ +@echo off +echo Running ogbn-arxiv... +python main.py --method sgformer --dataset ogbn-arxiv --metric acc --lr 0.001 --hidden_channels 256 --use_graph --graph_weight 0.5 ^ + --gnn_num_layers 3 --gnn_dropout 0.5 --gnn_weight_decay 0.0 --gnn_use_residual --gnn_use_weight --gnn_use_bn --gnn_use_act ^ + --trans_num_layers 1 --trans_dropout 0.5 --trans_weight_decay 0.0 --trans_use_residual --trans_use_weight --trans_use_bn ^ + --seed 123 --runs 5 --epochs 1000 --eval_step 9 --device 0 + +echo Running ogbn-proteins... 
+python main-batch.py --method sgformer --dataset ogbn-proteins --metric rocauc --lr 0.01 --hidden_channels 64 ^ + --gnn_num_layers 2 --gnn_dropout 0.0 --gnn_weight_decay 0.0 --gnn_use_residual --gnn_use_weight --gnn_use_bn --gnn_use_act ^ + --trans_num_layers 1 --trans_dropout 0.0 --trans_weight_decay 0.0 --trans_use_residual --trans_use_weight --trans_use_bn ^ + --use_graph --graph_weight 0.5 ^ + --batch_size 10000 --seed 123 --runs 5 --epochs 1000 --eval_step 9 --device 1 + +echo Running amazon2m... +python main-batch.py --method sgformer --dataset amazon2m --metric acc --lr 0.01 --hidden_channels 256 ^ + --gnn_num_layers 3 --gnn_dropout 0.0 --gnn_weight_decay 0.0 --gnn_use_residual --gnn_use_weight --gnn_use_bn --gnn_use_init --gnn_use_act ^ + --trans_num_layers 1 --trans_dropout 0.0 --trans_weight_decay 0.0 --trans_use_residual --trans_use_weight --trans_use_bn ^ + --use_graph --graph_weight 0.5 ^ + --batch_size 100000 --seed 123 --runs 5 --epochs 1000 --eval_step 9 --device 1 + +echo Running pokec... +python main-batch.py --method sgformer --dataset pokec --rand_split --metric acc --lr 0.01 --hidden_channels 64 ^ + --gnn_num_layers 2 --gnn_dropout 0.0 --gnn_weight_decay 0.0 --gnn_use_residual --gnn_use_weight --gnn_use_bn --gnn_use_init --gnn_use_act ^ + --trans_num_layers 1 --trans_dropout 0.0 --trans_weight_decay 0.0 --trans_use_residual --trans_use_weight --trans_use_bn ^ + --use_graph --graph_weight 0.5 ^ + --batch_size 100000 --seed 123 --runs 5 --epochs 1000 --eval_step 9 --device 1 \ No newline at end of file
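A closing note on get_weight in build_graph.py above: it evaluates the Gaussian kernel edge by edge in a Python loop, which becomes slow on the KNN graphs built here. A vectorized sketch of the same kernel (assuming a float node-feature tensor and a [2, E] edge_index; the function name is illustrative):

import torch

def get_weight_vectorized(node_features, edge_index, sigma=10.0):
    # same kernel as build_graph.get_weight: exp(-||x_i - x_j||^2 / sigma^2), for all edges at once
    src, dst = edge_index[0], edge_index[1]
    diff = node_features[src] - node_features[dst]   # [E, D]
    sq_dist = (diff * diff).sum(dim=1)               # squared Euclidean distance per edge
    return torch.exp(-sq_dist / (sigma ** 2))        # [E] edge weights

Up to floating-point rounding this matches the looped version while avoiding the per-edge Python overhead.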