diff --git a/data_loader.py b/data_loader.py new file mode 100644 index 0000000..1130f33 --- /dev/null +++ b/data_loader.py @@ -0,0 +1,126 @@ +import warnings +import torch +import scipy.sparse as sp +import numpy as np +import os +import torch_geometric.transforms as T +from torch_geometric.datasets import Planetoid, WikipediaNetwork, Actor, WebKB, Amazon, Coauthor, WikiCS +from torch_geometric.utils import remove_self_loops + +warnings.simplefilter("ignore") + + +def get_split(num_samples: int, train_ratio: float = 0.1, test_ratio: float = 0.8, num_splits: int = 10): + + assert train_ratio + test_ratio < 1 + train_size = int(num_samples * train_ratio) + test_size = int(num_samples * test_ratio) + + trains, vals, tests = [], [], [] + + for _ in range(num_splits): + indices = torch.randperm(num_samples) + + train_mask = torch.zeros(num_samples, dtype=torch.bool) + train_mask.fill_(False) + train_mask[indices[:train_size]] = True + + test_mask = torch.zeros(num_samples, dtype=torch.bool) + test_mask.fill_(False) + test_mask[indices[train_size: test_size + train_size]] = True + + val_mask = torch.zeros(num_samples, dtype=torch.bool) + val_mask.fill_(False) + val_mask[indices[test_size + train_size:]] = True + + trains.append(train_mask.unsqueeze(1)) + vals.append(val_mask.unsqueeze(1)) + tests.append(test_mask.unsqueeze(1)) + + train_mask_all = torch.cat(trains, 1) + val_mask_all = torch.cat(vals, 1) + test_mask_all = torch.cat(tests, 1) + + return train_mask_all, val_mask_all, test_mask_all + + +def get_structural_encoding(edges, nnodes, str_enc_dim=16): + + row = edges[0, :].numpy() + col = edges[1, :].numpy() + data = np.ones_like(row) + + A = sp.csr_matrix((data, (row, col)), shape=(nnodes, nnodes)) + D = (np.array(A.sum(1)).squeeze()) ** -1.0 + + Dinv = sp.diags(D) + RW = A * Dinv + M = RW + + SE = [torch.from_numpy(M.diagonal()).float()] + M_power = M + for _ in range(str_enc_dim - 1): + M_power = M_power * M + SE.append(torch.from_numpy(M_power.diagonal()).float()) + SE = torch.stack(SE, dim=-1) + return SE + + +def load_data(dataset_name): + + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.', 'data', dataset_name) + + if dataset_name in ['cora', 'citeseer', 'pubmed']: + dataset = Planetoid(path, dataset_name) + elif dataset_name in ['chameleon']: + dataset = WikipediaNetwork(path, dataset_name) + elif dataset_name in ['squirrel']: + dataset = WikipediaNetwork(path, dataset_name, transform=T.NormalizeFeatures()) + elif dataset_name in ['actor']: + dataset = Actor(path) + elif dataset_name in ['cornell', 'texas', 'wisconsin']: + dataset = WebKB(path, dataset_name) + elif dataset_name in ['computers', 'photo']: + dataset = Amazon(path, dataset_name, transform=T.NormalizeFeatures()) + elif dataset_name in ['cs', 'physics']: + dataset = Coauthor(path, dataset_name, transform=T.NormalizeFeatures()) + elif dataset_name in ['wikics']: + dataset = WikiCS(path) + + data = dataset[0] + + edges = remove_self_loops(data.edge_index)[0] + + features = data.x + [nnodes, nfeats] = features.shape + nclasses = torch.max(data.y).item() + 1 + + if dataset_name in ['computers', 'photo', 'cs', 'physics', 'wikics']: + train_mask, val_mask, test_mask = get_split(nnodes) + else: + train_mask, val_mask, test_mask = data.train_mask, data.val_mask, data.test_mask + + if len(train_mask.shape) < 2: + train_mask = train_mask.unsqueeze(1) + val_mask = val_mask.unsqueeze(1) + test_mask = test_mask.unsqueeze(1) + + labels = data.y + + path = '../data/se/{}'.format(dataset_name) + if not 
os.path.exists(path): + os.makedirs(path) + file_name = path + '/{}_{}.pt'.format(dataset_name, 16) + if os.path.exists(file_name): + se = torch.load(file_name) + # print('Load exist structural encoding.') + else: + print('Computing structural encoding...') + se = get_structural_encoding(edges, nnodes) + torch.save(se, file_name) + print('Done. The structural encoding is saved as: {}.'.format(file_name)) + + return features, edges, se, train_mask, val_mask, test_mask, labels, nnodes, nfeats + + + diff --git a/main.py b/main.py new file mode 100644 index 0000000..e5bc7f3 --- /dev/null +++ b/main.py @@ -0,0 +1,161 @@ +import argparse +import numpy as np +import torch +import torch.nn.functional as F +import dgl +import random + +from data_loader import load_data +from model import * +from utils import * + +EOS = 1e-10 + + +def setup_seed(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + np.random.seed(seed) + random.seed(seed) + dgl.seed(seed) + dgl.random.seed(seed) + + +def train_cl(cl_model, discriminator, optimizer_cl, features, str_encodings, edges): + + cl_model.train() + discriminator.eval() + + adj_1, adj_2, weights_lp, _ = discriminator(torch.cat((features, str_encodings), 1), edges) + features_1, adj_1, features_2, adj_2 = augmentation(features, adj_1, features, adj_2, args, cl_model.training) + cl_loss = cl_model(features_1, adj_1, features_2, adj_2) + + optimizer_cl.zero_grad() + cl_loss.backward() + optimizer_cl.step() + + return cl_loss.item() + + +def train_discriminator(cl_model, discriminator, optimizer_disc, features, str_encodings, edges, args): + + cl_model.eval() + discriminator.train() + + adj_1, adj_2, weights_lp, weights_hp = discriminator(torch.cat((features, str_encodings), 1), edges) + rand_np = generate_random_node_pairs(features.shape[0], edges.shape[1]) + psu_label = torch.ones(edges.shape[1]).cuda() + + embedding = cl_model.get_embedding(features, adj_1, adj_2) + edge_emb_sim = F.cosine_similarity(embedding[edges[0]], embedding[edges[1]]) + + rnp_emb_sim_lp = F.cosine_similarity(embedding[rand_np[0]], embedding[rand_np[1]]) + loss_lp = F.margin_ranking_loss(edge_emb_sim, rnp_emb_sim_lp, psu_label, margin=args.margin_hom, reduction='none') + loss_lp *= torch.relu(weights_lp - 0.5) + + rnp_emb_sim_hp = F.cosine_similarity(embedding[rand_np[0]], embedding[rand_np[1]]) + loss_hp = F.margin_ranking_loss(rnp_emb_sim_hp, edge_emb_sim, psu_label, margin=args.margin_het, reduction='none') + loss_hp *= torch.relu(weights_hp - 0.5) + + rank_loss = (loss_lp.mean() + loss_hp.mean()) / 2 + + optimizer_disc.zero_grad() + rank_loss.backward() + optimizer_disc.step() + + return rank_loss.item() + + +def main(args): + + setup_seed(0) + features, edges, str_encodings, train_mask, val_mask, test_mask, labels, nnodes, nfeats = load_data(args.dataset) + results = [] + + for trial in range(args.ntrials): + + setup_seed(trial) + + cl_model = GCL(nlayers=args.nlayers_enc, nlayers_proj=args.nlayers_proj, in_dim=nfeats, emb_dim=args.emb_dim, + proj_dim=args.proj_dim, dropout=args.dropout, sparse=args.sparse, batch_size=args.cl_batch_size).cuda() + cl_model.set_mask_knn(features.cpu(), k=args.k, dataset=args.dataset) + discriminator = Edge_Discriminator(nnodes, nfeats + str_encodings.shape[1], args.alpha, args.sparse).cuda() + + optimizer_cl = torch.optim.Adam(cl_model.parameters(), lr=args.lr_gcl, weight_decay=args.w_decay) + optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=args.lr_disc, 
weight_decay=args.w_decay) + + features = features.cuda() + str_encodings = str_encodings.cuda() + edges = edges.cuda() + + best_acc_val = 0 + best_acc_test = 0 + + for epoch in range(1, args.epochs + 1): + + for _ in range(args.cl_rounds): + cl_loss = train_cl(cl_model, discriminator, optimizer_cl, features, str_encodings, edges) + rank_loss = train_discriminator(cl_model, discriminator, optimizer_discriminator, features, str_encodings, edges, args) + + print("[TRAIN] Epoch:{:04d} | CL Loss {:.4f} | RANK loss:{:.4f} ".format(epoch, cl_loss, rank_loss)) + + if epoch % args.eval_freq == 0: + cl_model.eval() + discriminator.eval() + adj_1, adj_2, _, _ = discriminator(torch.cat((features, str_encodings), 1), edges) + embedding = cl_model.get_embedding(features, adj_1, adj_2) + cur_split = 0 if (train_mask.shape[1]==1) else (trial % train_mask.shape[1]) + acc_test, acc_val = eval_test_mode(embedding, labels, train_mask[:, cur_split], + val_mask[:, cur_split], test_mask[:, cur_split]) + print( + '[TEST] Epoch:{:04d} | CL loss:{:.4f} | RANK loss:{:.4f} | VAL ACC:{:.2f} | TEST ACC:{:.2f}'.format( + epoch, cl_loss, rank_loss, acc_val, acc_test)) + + if acc_val > best_acc_val: + best_acc_val = acc_val + best_acc_test = acc_test + + results.append(best_acc_test) + + print('\n[FINAL RESULT] Dataset:{} | Run:{} | ACC:{:.2f}+-{:.2f}'.format(args.dataset, args.ntrials, np.mean(results), + np.std(results))) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # ESSENTIAL + parser.add_argument('-dataset', type=str, default='cornell', + choices=['cora', 'citeseer', 'pubmed', 'chameleon', 'squirrel', 'actor', 'cornell', + 'texas', 'wisconsin', 'computers', 'photo', 'cs', 'physics', 'wikics']) + parser.add_argument('-ntrials', type=int, default=10) + parser.add_argument('-sparse', type=int, default=0) + parser.add_argument('-eval_freq', type=int, default=20) + parser.add_argument('-epochs', type=int, default=400) + parser.add_argument('-lr_gcl', type=float, default=0.001) + parser.add_argument('-lr_disc', type=float, default=0.001) + parser.add_argument('-cl_rounds', type=int, default=2) + parser.add_argument('-w_decay', type=float, default=0.0) + parser.add_argument('-dropout', type=float, default=0.5) + + # DISC Module - Hyper-param + parser.add_argument('-alpha', type=float, default=0.1) + parser.add_argument('-margin_hom', type=float, default=0.5) + parser.add_argument('-margin_het', type=float, default=0.5) + + # GRL Module - Hyper-param + parser.add_argument('-nlayers_enc', type=int, default=2) + parser.add_argument('-nlayers_proj', type=int, default=1, choices=[1, 2]) + parser.add_argument('-emb_dim', type=int, default=128) + parser.add_argument('-proj_dim', type=int, default=128) + parser.add_argument('-cl_batch_size', type=int, default=0) + parser.add_argument('-k', type=int, default=20) + parser.add_argument('-maskfeat_rate_1', type=float, default=0.1) + parser.add_argument('-maskfeat_rate_2', type=float, default=0.5) + parser.add_argument('-dropedge_rate_1', type=float, default=0.5) + parser.add_argument('-dropedge_rate_2', type=float, default=0.1) + + args = parser.parse_args() + + print(args) + main(args) \ No newline at end of file diff --git a/model.py b/model.py new file mode 100644 index 0000000..6e81a78 --- /dev/null +++ b/model.py @@ -0,0 +1,221 @@ +from torch.nn import Sequential, Linear, ReLU +from sklearn.neighbors import kneighbors_graph +from scipy import sparse +from dgl.nn import EdgeWeightNorm +import random +import os +import torch +import torch.nn as nn +import 
torch.nn.functional as F +import dgl.function as fn + +from utils import * + +EOS = 1e-10 +norm = EdgeWeightNorm(norm='both') + + +class GCL(nn.Module): + def __init__(self, nlayers, nlayers_proj, in_dim, emb_dim, proj_dim, dropout, sparse, batch_size): + super(GCL, self).__init__() + + self.encoder1 = SGC(nlayers, in_dim, emb_dim, dropout, sparse) + self.encoder2 = SGC(nlayers, in_dim, emb_dim, dropout, sparse) + + if nlayers_proj == 1: + self.proj_head1 = Sequential(Linear(emb_dim, proj_dim)) + self.proj_head2 = Sequential(Linear(emb_dim, proj_dim)) + elif nlayers_proj == 2: + self.proj_head1 = Sequential(Linear(emb_dim, proj_dim), ReLU(inplace=True), Linear(proj_dim, proj_dim)) + self.proj_head2 = Sequential(Linear(emb_dim, proj_dim), ReLU(inplace=True), Linear(proj_dim, proj_dim)) + + self.batch_size = batch_size + + + def get_embedding(self, x, a1, a2, source='all'): + emb1 = self.encoder1(x, a1) + emb2 = self.encoder2(x, a2) + return torch.cat((emb1, emb2), dim=1) + + + def get_projection(self, x, a1, a2): + emb1 = self.encoder1(x, a1) + emb2 = self.encoder2(x, a2) + proj1 = self.proj_head1(emb1) + proj2 = self.proj_head2(emb2) + return torch.cat((proj1, proj2), dim=1) + + + def forward(self, x1, a1, x2, a2): + emb1 = self.encoder1(x1, a1) + emb2 = self.encoder2(x2, a2) + proj1 = self.proj_head1(emb1) + proj2 = self.proj_head2(emb2) + loss = self.batch_nce_loss(proj1, proj2) + return loss + + + def set_mask_knn(self, X, k, dataset, metric='cosine'): + if k != 0: + path = '../data/knn/{}'.format(dataset) + if not os.path.exists(path): + os.makedirs(path) + file_name = path + '/{}_{}.npz'.format(dataset, k) + if os.path.exists(file_name): + knn = sparse.load_npz(file_name) + # print('Load exist knn graph.') + else: + print('Computing knn graph...') + knn = kneighbors_graph(X, k, metric=metric) + sparse.save_npz(file_name, knn) + print('Done. 
The knn graph is saved as: {}.'.format(file_name)) + knn = torch.tensor(knn.toarray()) + torch.eye(X.shape[0]) + else: + knn = torch.eye(X.shape[0]) + self.pos_mask = knn + self.neg_mask = 1 - self.pos_mask + + + def batch_nce_loss(self, z1, z2, temperature=0.2, pos_mask=None, neg_mask=None): + if pos_mask is None and neg_mask is None: + pos_mask = self.pos_mask + neg_mask = self.neg_mask + + nnodes = z1.shape[0] + if (self.batch_size == 0) or (self.batch_size > nnodes): + loss_0 = self.infonce(z1, z2, pos_mask, neg_mask, temperature) + loss_1 = self.infonce(z2, z1, pos_mask, neg_mask, temperature) + loss = (loss_0 + loss_1) / 2.0 + else: + node_idxs = list(range(nnodes)) + random.shuffle(node_idxs) + batches = split_batch(node_idxs, self.batch_size) + loss = 0 + for b in batches: + weight = len(b) / nnodes + loss_0 = self.infonce(z1[b], z2[b], pos_mask[:,b][b,:], neg_mask[:,b][b,:], temperature) + loss_1 = self.infonce(z2[b], z1[b], pos_mask[:,b][b,:], neg_mask[:,b][b,:], temperature) + loss += (loss_0 + loss_1) / 2.0 * weight + return loss + + + def infonce(self, anchor, sample, pos_mask, neg_mask, tau): + pos_mask = pos_mask.cuda() + neg_mask = neg_mask.cuda() + sim = self.similarity(anchor, sample) / tau + exp_sim = torch.exp(sim) * neg_mask + log_prob = sim - torch.log(exp_sim.sum(dim=1, keepdim=True)) + loss = log_prob * pos_mask + loss = loss.sum(dim=1) / pos_mask.sum(dim=1) + return -loss.mean() + + + def similarity(self, h1: torch.Tensor, h2: torch.Tensor): + h1 = F.normalize(h1) + h2 = F.normalize(h2) + return h1 @ h2.t() + + +class Edge_Discriminator(nn.Module): + def __init__(self, nnodes, input_dim, alpha, sparse, hidden_dim=128, temperature=1.0, bias=0.0 + 0.0001): + super(Edge_Discriminator, self).__init__() + + self.embedding_layers = nn.ModuleList() + self.embedding_layers.append(nn.Linear(input_dim, hidden_dim)) + self.edge_mlp = nn.Linear(hidden_dim * 2, 1) + + self.temperature = temperature + self.bias = bias + self.nnodes = nnodes + self.sparse = sparse + self.alpha = alpha + + + def get_node_embedding(self, h): + for layer in self.embedding_layers: + h = layer(h) + h = F.relu(h) + return h + + + def get_edge_weight(self, embeddings, edges): + s1 = self.edge_mlp(torch.cat((embeddings[edges[0]], embeddings[edges[1]]), dim=1)).flatten() + s2 = self.edge_mlp(torch.cat((embeddings[edges[1]], embeddings[edges[0]]), dim=1)).flatten() + return (s1 + s2) / 2 + + + def gumbel_sampling(self, edges_weights_raw): + eps = (self.bias - (1 - self.bias)) * torch.rand(edges_weights_raw.size()) + (1 - self.bias) + gate_inputs = torch.log(eps) - torch.log(1 - eps) + gate_inputs = gate_inputs.cuda() + gate_inputs = (gate_inputs + edges_weights_raw) / self.temperature + return torch.sigmoid(gate_inputs).squeeze() + + + def weight_forward(self, features, edges): + embeddings = self.get_node_embedding(features) + edges_weights_raw = self.get_edge_weight(embeddings, edges) + weights_lp = self.gumbel_sampling(edges_weights_raw) + weights_hp = 1 - weights_lp + return weights_lp, weights_hp + + + def weight_to_adj(self, edges, weights_lp, weights_hp): + if not self.sparse: + adj_lp = get_adj_from_edges(edges, weights_lp, self.nnodes) + adj_lp += torch.eye(self.nnodes).cuda() + adj_lp = normalize_adj(adj_lp, 'sym', self.sparse) + + adj_hp = get_adj_from_edges(edges, weights_hp, self.nnodes) + adj_hp += torch.eye(self.nnodes).cuda() + adj_hp = normalize_adj(adj_hp, 'sym', self.sparse) + + mask = torch.zeros(adj_lp.shape).cuda() + mask[edges[0], edges[1]] = 1. 
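+            # mask keeps only the observed edges, so the subtraction below turns adj_hp
+            # into I - alpha * (sym-normalized high-pass adjacency) restricted to those
+            # entries, i.e. a high-pass graph filter paired with the low-pass adj_lp above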
+ mask.requires_grad = False + adj_hp = torch.eye(self.nnodes).cuda() - adj_hp * mask * self.alpha + else: + adj_lp = dgl.graph((edges[0], edges[1]), num_nodes=self.nnodes, device='cuda') + adj_lp = dgl.add_self_loop(adj_lp) + weights_lp = torch.cat((weights_lp, torch.ones(self.nnodes).cuda())) + EOS + weights_lp = norm(adj_lp, weights_lp) + adj_lp.edata['w'] = weights_lp + + adj_hp = dgl.graph((edges[0], edges[1]), num_nodes=self.nnodes, device='cuda') + adj_hp = dgl.add_self_loop(adj_hp) + weights_hp = torch.cat((weights_hp, torch.ones(self.nnodes).cuda())) + EOS + weights_hp = norm(adj_hp, weights_hp) + weights_hp *= - self.alpha + weights_hp[edges.shape[1]:] = 1 + adj_hp.edata['w'] = weights_hp + return adj_lp, adj_hp + + + def forward(self, features, edges): + weights_lp, weights_hp = self.weight_forward(features, edges) + adj_lp, adj_hp = self.weight_to_adj(edges, weights_lp, weights_hp) + return adj_lp, adj_hp, weights_lp, weights_hp + + +class SGC(nn.Module): + def __init__(self, nlayers, in_dim, emb_dim, dropout, sparse): + super(SGC, self).__init__() + self.dropout = dropout + self.sparse = sparse + + self.linear = nn.Linear(in_dim, emb_dim) + self.k = nlayers + + def forward(self, x, g): + x = torch.relu(self.linear(x)) + + if self.sparse: + with g.local_scope(): + g.ndata['h'] = x + for _ in range(self.k): + g.update_all(fn.u_mul_e('h', 'w', 'm'), fn.sum(msg='m', out='h')) + return g.ndata['h'] + else: + for _ in range(self.k): + x = torch.matmul(g, x) + return x \ No newline at end of file diff --git a/scripts/run_actor.sh b/scripts/run_actor.sh new file mode 100644 index 0000000..ecf64f1 --- /dev/null +++ b/scripts/run_actor.sh @@ -0,0 +1,2 @@ +# Actor +python main.py -dataset actor -ntrials 10 -sparse 1 -epochs 1000 -cl_batch_size 0 -nlayers_proj 2 -alpha 0.1 -k 20 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.5 -dropedge_rate_1 0.5 -dropedge_rate_2 0.8 -lr_disc 0.001 -margin_hom 0.1 -margin_het 0.5 -cl_rounds 2 -eval_freq 50 diff --git a/scripts/run_ama_computers.sh b/scripts/run_ama_computers.sh new file mode 100644 index 0000000..eb41182 --- /dev/null +++ b/scripts/run_ama_computers.sh @@ -0,0 +1,2 @@ +# Amazon Computers +python main.py -dataset computers -ntrials 10 -sparse 1 -epochs 1500 -cl_batch_size 5000 -nlayers_proj 1 -alpha 0.3 -k 10 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.1 -dropedge_rate_1 0.5 -dropedge_rate_2 0.1 -lr_disc 0.0001 -margin_hom 0.1 -margin_het 0.5 -cl_rounds 3 -eval_freq 20 \ No newline at end of file diff --git a/scripts/run_ama_photo.sh b/scripts/run_ama_photo.sh new file mode 100644 index 0000000..df6cc07 --- /dev/null +++ b/scripts/run_ama_photo.sh @@ -0,0 +1,2 @@ +# Amazon Photo +python main.py -dataset photo -ntrials 10 -sparse 1 -epochs 1500 -cl_batch_size 5000 -nlayers_proj 1 -alpha 0.3 -k 30 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.1 -dropedge_rate_1 0.8 -dropedge_rate_2 0.5 -lr_disc 0.0001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 3 -eval_freq 20 \ No newline at end of file diff --git a/scripts/run_chameleon.sh b/scripts/run_chameleon.sh new file mode 100644 index 0000000..b6515f4 --- /dev/null +++ b/scripts/run_chameleon.sh @@ -0,0 +1,2 @@ +# Chameleon +python main.py -dataset chameleon -ntrials 10 -sparse 0 -epochs 500 -cl_batch_size 0 -nlayers_proj 1 -alpha 0.1 -k 0 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.5 -dropedge_rate_1 0.5 -dropedge_rate_2 0.1 -lr_disc 0.001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 2 -eval_freq 20 \ No newline at end of file diff --git a/scripts/run_citeseer.sh b/scripts/run_citeseer.sh new file mode 100644 index 
0000000..511169d --- /dev/null +++ b/scripts/run_citeseer.sh @@ -0,0 +1,2 @@ +# CiteSeer +python main.py -dataset citeseer -ntrials 10 -sparse 0 -epochs 400 -cl_batch_size 0 -nlayers_proj 2 -alpha 0.1 -k 30 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.1 -dropedge_rate_1 0.8 -dropedge_rate_2 0.1 -lr_disc 0.001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 2 -eval_freq 5 \ No newline at end of file diff --git a/scripts/run_coauthor_cs.sh b/scripts/run_coauthor_cs.sh new file mode 100644 index 0000000..40bed79 --- /dev/null +++ b/scripts/run_coauthor_cs.sh @@ -0,0 +1,2 @@ +# CoAuthor CS +python main.py -dataset cs -ntrials 10 -sparse 1 -epochs 1500 -cl_batch_size 5000 -nlayers_proj 1 -alpha 0.5 -k 30 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.5 -dropedge_rate_1 0.5 -dropedge_rate_2 0.1 -lr_disc 0.001 -margin_hom 0.1 -margin_het 0.5 -cl_rounds 2 -eval_freq 50 \ No newline at end of file diff --git a/scripts/run_coauthor_phy.sh b/scripts/run_coauthor_phy.sh new file mode 100644 index 0000000..1c505a2 --- /dev/null +++ b/scripts/run_coauthor_phy.sh @@ -0,0 +1,2 @@ +# CoAuthor Physics +python main.py -dataset physics -ntrials 10 -sparse 1 -epochs 800 -cl_batch_size 2000 -nlayers_proj 1 -alpha 0.1 -k 25 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.5 -dropedge_rate_1 0.5 -dropedge_rate_2 0.1 -lr_disc 0.001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 2 -eval_freq 50 \ No newline at end of file diff --git a/scripts/run_cora.sh b/scripts/run_cora.sh new file mode 100644 index 0000000..6b10590 --- /dev/null +++ b/scripts/run_cora.sh @@ -0,0 +1,2 @@ +# Cora +python main.py -dataset cora -ntrials 10 -sparse 0 -epochs 400 -cl_batch_size 0 -nlayers_proj 1 -alpha 0.5 -k 20 -maskfeat_rate_1 0.8 -maskfeat_rate_2 0.1 -dropedge_rate_1 0.8 -dropedge_rate_2 0.8 -lr_disc 0.001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 2 -eval_freq 5 \ No newline at end of file diff --git a/scripts/run_cornell.sh b/scripts/run_cornell.sh new file mode 100644 index 0000000..ce97cc0 --- /dev/null +++ b/scripts/run_cornell.sh @@ -0,0 +1,2 @@ +# Cornell +python main.py -dataset cornell -ntrials 10 -sparse 0 -epochs 400 -cl_batch_size 0 -nlayers_proj 2 -alpha 0.3 -k 25 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.5 -dropedge_rate_1 0.5 -dropedge_rate_2 0.1 -lr_disc 0.001 -margin_hom 0.3 -margin_het 0.3 -cl_rounds 2 -eval_freq 10 \ No newline at end of file diff --git a/scripts/run_pubmed.sh b/scripts/run_pubmed.sh new file mode 100644 index 0000000..4939e54 --- /dev/null +++ b/scripts/run_pubmed.sh @@ -0,0 +1,2 @@ +# PubMed +python main.py -dataset pubmed -ntrials 10 -sparse 1 -epochs 800 -cl_batch_size 5000 -nlayers_proj 2 -alpha 0.1 -k 0 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.5 -dropedge_rate_1 0.5 -dropedge_rate_2 0.1 -lr_disc 0.001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 2 -eval_freq 20 \ No newline at end of file diff --git a/scripts/run_squirrel.sh b/scripts/run_squirrel.sh new file mode 100644 index 0000000..5e6833d --- /dev/null +++ b/scripts/run_squirrel.sh @@ -0,0 +1,2 @@ +# Squirrel +python main.py -dataset squirrel -ntrials 10 -sparse 0 -epochs 1000 -cl_batch_size 0 -nlayers_proj 2 -alpha 0.1 -k 0 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.1 -dropedge_rate_1 0.1 -dropedge_rate_2 0.8 -lr_disc 0.001 -margin_hom 0.1 -margin_het 0.3 -cl_rounds 2 -eval_freq 50 \ No newline at end of file diff --git a/scripts/run_texas.sh b/scripts/run_texas.sh new file mode 100644 index 0000000..3769ea0 --- /dev/null +++ b/scripts/run_texas.sh @@ -0,0 +1,2 @@ +# Texas +python main.py -dataset texas -ntrials 10 -sparse 0 -epochs 400 -cl_batch_size 0 -nlayers_proj 2 
-alpha 0.5 -k 20 -maskfeat_rate_1 0.5 -maskfeat_rate_2 0.1 -dropedge_rate_1 0.1 -dropedge_rate_2 0.1 -lr_disc 0.001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 2 -eval_freq 20 \ No newline at end of file diff --git a/scripts/run_wiki_cs.sh b/scripts/run_wiki_cs.sh new file mode 100644 index 0000000..b1c8c22 --- /dev/null +++ b/scripts/run_wiki_cs.sh @@ -0,0 +1,2 @@ +# Wiki-CS +python main.py -dataset wikics -ntrials 10 -sparse 1 -epochs 1500 -cl_batch_size 3000 -nlayers_proj 1 -alpha 0.1 -k 30 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.1 -dropedge_rate_1 0.5 -dropedge_rate_2 0.5 -lr_disc 0.001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 2 -eval_freq 50 \ No newline at end of file diff --git a/scripts/run_winsconsin.sh b/scripts/run_winsconsin.sh new file mode 100644 index 0000000..110d8fa --- /dev/null +++ b/scripts/run_winsconsin.sh @@ -0,0 +1,2 @@ +# Wisconsin +python main.py -dataset wisconsin -ntrials 10 -sparse 0 -epochs 400 -cl_batch_size 0 -nlayers_proj 2 -alpha 0.5 -k 25 -maskfeat_rate_1 0.1 -maskfeat_rate_2 0.1 -dropedge_rate_1 0.1 -dropedge_rate_2 0.3 -lr_disc 0.001 -margin_hom 0.5 -margin_het 0.5 -cl_rounds 2 -eval_freq 20 \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..150a702 --- /dev/null +++ b/utils.py @@ -0,0 +1,156 @@ +import numpy as np +import scipy.sparse as sp +import torch +import torch.nn.functional as F +import dgl +import time +from sklearn.neighbors import kneighbors_graph +from sklearn.metrics import accuracy_score +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.multiclass import OneVsRestClassifier +from sklearn.preprocessing import normalize + +EOS = 1e-10 + + +def split_batch(init_list, batch_size): + groups = zip(*(iter(init_list),) * batch_size) + end_list = [list(i) for i in groups] + count = len(init_list) % batch_size + end_list.append(init_list[-count:]) if count != 0 else end_list + return end_list + + +def get_feat_mask(features, mask_rate): + feat_node = features.shape[1] + mask = torch.zeros(features.shape) + samples = np.random.choice(feat_node, size=int(feat_node * mask_rate), replace=False) + mask[:, samples] = 1 + if torch.cuda.is_available(): + mask = mask.cuda() + return mask, samples + + +def normalize_adj(adj, mode, sparse=False): + if not sparse: + if mode == "sym": + inv_sqrt_degree = 1. / (torch.sqrt(adj.sum(dim=1, keepdim=False)) + EOS) + return inv_sqrt_degree[:, None] * adj * inv_sqrt_degree[None, :] + elif mode == "row": + inv_degree = 1. / (adj.sum(dim=1, keepdim=False) + EOS) + return inv_degree[:, None] * adj + else: + exit("wrong norm mode") + else: + adj = adj.coalesce() + if mode == "sym": + inv_sqrt_degree = 1. / (torch.sqrt(torch.sparse.sum(adj, dim=1).values())) + D_value = inv_sqrt_degree[adj.indices()[0]] * inv_sqrt_degree[adj.indices()[1]] + + elif mode == "row": + inv_degree = 1. 
/ (torch.sparse.sum(adj, dim=1).values() + EOS)
+            D_value = inv_degree[adj.indices()[0]]
+        else:
+            exit("wrong norm mode")
+        new_values = adj.values() * D_value
+
+        return torch.sparse.FloatTensor(adj.indices(), new_values, adj.size())
+
+
+def get_adj_from_edges(edges, weights, nnodes):
+    # dense weighted adjacency built from an edge list (non-sparse branch only)
+    adj = torch.zeros(nnodes, nnodes).cuda()
+    adj[edges[0], edges[1]] = weights
+    return adj
+
+
+def augmentation(features_1, adj_1, features_2, adj_2, args, training):
+    # each view gets its own feature-column masking and edge dropping
+    # view 1
+    mask_1, _ = get_feat_mask(features_1, args.maskfeat_rate_1)
+    features_1 = features_1 * (1 - mask_1)
+    if not args.sparse:
+        adj_1 = F.dropout(adj_1, p=args.dropedge_rate_1, training=training)
+    else:
+        adj_1.edata['w'] = F.dropout(adj_1.edata['w'], p=args.dropedge_rate_1, training=training)
+
+    # view 2
+    mask_2, _ = get_feat_mask(features_2, args.maskfeat_rate_2)
+    features_2 = features_2 * (1 - mask_2)
+    if not args.sparse:
+        adj_2 = F.dropout(adj_2, p=args.dropedge_rate_2, training=training)
+    else:
+        adj_2.edata['w'] = F.dropout(adj_2.edata['w'], p=args.dropedge_rate_2, training=training)
+
+    return features_1, adj_1, features_2, adj_2
+
+
+def generate_random_node_pairs(nnodes, nedges, backup=300):
+    # random node pairs used as negatives for the ranking loss: oversample by
+    # `backup`, drop self-pairs, then keep the first `nedges` pairs
+    rand_edges = np.random.choice(nnodes, size=(nedges + backup) * 2, replace=True)
+    rand_edges = rand_edges.reshape((2, nedges + backup))
+    rand_edges = torch.from_numpy(rand_edges)
+    rand_edges = rand_edges[:, rand_edges[0, :] != rand_edges[1, :]]
+    rand_edges = rand_edges[:, 0: nedges]
+    return rand_edges.cuda()
+
+
+def eval_debug_mode(embedding, labels, train_mask, val_mask, test_mask):
+    # linear probe over all splits: L2-normalize the frozen embeddings and fit a
+    # one-vs-rest logistic regression with grid-searched C
+    t1 = time.time()
+
+    X = embedding.detach().cpu().numpy()
+    Y = labels.detach().cpu().numpy()
+
+    X = normalize(X, norm='l2')
+
+    nb_split = train_mask.shape[1]
+
+    accs = []
+    for split in range(nb_split):
+        X_train = X[train_mask.cpu()[:, split]]
+        X_test = X[test_mask.cpu()[:, split]]
+        y_train = Y[train_mask.cpu()[:, split]]
+        y_test = Y[test_mask.cpu()[:, split]]
+
+        logreg = LogisticRegression(solver='liblinear')
+        c = 2.0 ** np.arange(-10, 10)
+        clf = GridSearchCV(estimator=OneVsRestClassifier(logreg),
+                           param_grid=dict(estimator__C=c), n_jobs=8, cv=5,
+                           verbose=0)
+        clf.fit(X_train, y_train)
+
+        y_pred = clf.predict(X_test)
+        acc = accuracy_score(y_test, y_pred)
+        accs.append(acc)
+
+    print('eval time:{:.4f}s'.format(time.time() - t1))
+
+    return accs
+
+
+def eval_test_mode(embedding, labels, train_mask, val_mask, test_mask):
+    # same linear probe on a single split, reporting validation and test accuracy
+    X = embedding.detach().cpu().numpy()
+    Y = labels.detach().cpu().numpy()
+    X = normalize(X, norm='l2')
+
+    X_train = X[train_mask.cpu()]
+    X_val = X[val_mask.cpu()]
+    X_test = X[test_mask.cpu()]
+    y_train = Y[train_mask.cpu()]
+    y_val = Y[val_mask.cpu()]
+    y_test = Y[test_mask.cpu()]
+
+    logreg = LogisticRegression(solver='liblinear')
+    c = 2.0 ** np.arange(-10, 10)
+    clf = GridSearchCV(estimator=OneVsRestClassifier(logreg),
+                       param_grid=dict(estimator__C=c), n_jobs=8, cv=5,
+                       verbose=0)
+    clf.fit(X_train, y_train)
+
+    y_pred_test = clf.predict(X_test)
+    acc_test = accuracy_score(y_test, y_pred_test)
+    y_pred_val = clf.predict(X_val)
+    acc_val = accuracy_score(y_val, y_pred_val)
+
+    return acc_test * 100, acc_val * 100
+
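A minimal usage sketch (not part of the diff) of what get_split in data_loader.py produces, assuming 1000 nodes and the default ratios; each mask is boolean with one column per random split:

# sketch: shapes and disjointness of the random splits from get_split
import torch
from data_loader import get_split

train_mask, val_mask, test_mask = get_split(num_samples=1000, train_ratio=0.1, test_ratio=0.8, num_splits=10)
print(train_mask.shape)                                     # torch.Size([1000, 10])
# every node falls into exactly one of train/val/test within each split
assert torch.all((train_mask | val_mask | test_mask).all(dim=0))
assert not torch.any(train_mask & test_mask)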
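The structural encoding stacks the diagonals of successive random-walk matrix powers (return probabilities). A self-contained sketch on a toy 3-node path graph, mirroring get_structural_encoding; for an undirected graph the diagonal of the (A D^-1)^k used in the loader equals that of the usual walk matrix (D^-1 A)^k:

# sketch: random-walk return probabilities as structural encoding, toy graph assumed
import numpy as np
import scipy.sparse as sp
import torch

edges = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])          # undirected path 0-1-2
nnodes, dim = 3, 4
row, col = edges[0].numpy(), edges[1].numpy()
A = sp.csr_matrix((np.ones_like(row), (row, col)), shape=(nnodes, nnodes))
Dinv = sp.diags(np.asarray(A.sum(1)).squeeze() ** -1.0)
M = A * Dinv                                                # column-normalized walk matrix
se, M_power = [M.diagonal()], M
for _ in range(dim - 1):
    M_power = M_power * M                                   # next power of the walk matrix
    se.append(M_power.diagonal())
se = torch.tensor(np.stack(se, axis=-1), dtype=torch.float)
print(se.shape)                                             # torch.Size([3, 4])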
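A usage sketch for load_data, assuming the repo's dependencies (torch, torch_geometric, scipy) are installed; Planetoid datasets such as cora are downloaded by torch_geometric on first use, and the structural encoding is cached under ../data/se/:

# sketch: return values of load_data for a Planetoid dataset
from data_loader import load_data

features, edges, se, train_mask, val_mask, test_mask, labels, nnodes, nfeats = load_data('cora')
print(features.shape, edges.shape, se.shape)   # [nnodes, nfeats], [2, nedges], [nnodes, 16]
print(train_mask.shape)                        # [nnodes, 1] for the fixed Planetoid split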
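gumbel_sampling in model.py draws a binary-concrete (relaxed Bernoulli) gate per edge from the raw edge scores. A CPU-only re-implementation sketch with the same default bias and temperature:

# sketch: relaxed Bernoulli gate over edge scores (CPU re-implementation)
import torch

def gumbel_sampling_cpu(edge_scores, temperature=1.0, bias=1e-4):
    # uniform noise in (bias, 1 - bias), turned into logistic noise
    eps = (bias - (1 - bias)) * torch.rand_like(edge_scores) + (1 - bias)
    logistic_noise = torch.log(eps) - torch.log(1 - eps)
    return torch.sigmoid((logistic_noise + edge_scores) / temperature)

scores = torch.randn(5)
weights_lp = gumbel_sampling_cpu(scores)
weights_hp = 1 - weights_lp        # high-pass weights are the complement
print(weights_lp)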
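The dense branch of SGC.forward in model.py is one linear map followed by k rounds of propagation over the normalized adjacency. A toy-sized sketch, with a row-stochastic random matrix standing in for the normalized adjacency:

# sketch: linear map + k propagation steps, toy sizes assumed
import torch
import torch.nn as nn

nnodes, in_dim, emb_dim, k = 4, 8, 16, 2
adj = torch.softmax(torch.randn(nnodes, nnodes), dim=1)   # stand-in normalized adjacency
x = torch.randn(nnodes, in_dim)
x = torch.relu(nn.Linear(in_dim, emb_dim)(x))
for _ in range(k):
    x = adj @ x                                            # feature propagation
print(x.shape)                                             # torch.Size([4, 16])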
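GCL.infonce in model.py is a masked InfoNCE over cosine similarities. A CPU sketch with an identity positive mask, i.e. the k=0 case of set_mask_knn where each node is only its own positive:

# sketch: masked InfoNCE over cosine similarities (CPU, identity positive mask)
import torch
import torch.nn.functional as F

def infonce(anchor, sample, pos_mask, neg_mask, tau=0.2):
    sim = F.normalize(anchor) @ F.normalize(sample).t() / tau
    exp_sim = torch.exp(sim) * neg_mask                    # only negatives in the denominator
    log_prob = sim - torch.log(exp_sim.sum(dim=1, keepdim=True))
    loss = (log_prob * pos_mask).sum(dim=1) / pos_mask.sum(dim=1)
    return -loss.mean()

z1, z2 = torch.randn(8, 16), torch.randn(8, 16)
pos = torch.eye(8)
print(infonce(z1, z2, pos, 1 - pos).item())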
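The discriminator losses in main.py rank edge similarities against random-pair similarities with margin_ranking_loss (low-pass: observed edges above random pairs by margin_hom; high-pass: the reverse by margin_het), and in the repo those per-edge losses are further gated by relu(weights - 0.5). A toy sketch of the ranking term alone:

# sketch: the two margin-ranking terms on toy cosine similarities
import torch
import torch.nn.functional as F

edge_sim = torch.tensor([0.9, 0.8, 0.2])   # similarities on observed edges
rand_sim = torch.tensor([0.1, 0.3, 0.6])   # similarities on random node pairs
target = torch.ones(3)                     # "first argument should rank higher"
loss_lp = F.margin_ranking_loss(edge_sim, rand_sim, target, margin=0.5, reduction='none')
loss_hp = F.margin_ranking_loss(rand_sim, edge_sim, target, margin=0.5, reduction='none')
print(loss_lp, loss_hp)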