
Test set has no labels #128

Open
yysirs opened this issue Sep 10, 2021 · 11 comments

Comments

@yysirs

yysirs commented Sep 10, 2021

Hi, the test set has no labels, so it can't be used for graph building. Is it OK to build the graph with only the training data? Does that affect the results much?

@yysirs
Author

yysirs commented Sep 10, 2021

Or should I give the test set pseudo-labels, then build the graph and run prediction?

@yao8839836
Owner

@yysirs

Hi, yes. Build the graph with a default label for the test set, then run prediction to get the probability values plus the predicted labels.

@yysirs
Author

yysirs commented Sep 10, 2021

For example: if the full label set is the 4 classes [1, 2, 3, 4], set the pseudo-label of every test example to 1, then build the graph, train, and predict.
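
Concretely, something like this when writing the label file during graph building (a minimal sketch of the idea, not the repo's actual code; the file name and placeholder class are assumptions):

# Sketch: give every unlabeled test document the same placeholder class before building the graph.
# The training loss is masked to the training nodes, so the placeholder should only fill the
# one-hot slot of the test rows, not drive learning.
PLACEHOLDER_LABEL = '1'                      # assumed: any one of the real classes

train_labels = ['2', '4', '1', '3']          # toy training labels for illustration
num_test_docs = 3                            # number of unlabeled test documents

all_labels = train_labels + [PLACEHOLDER_LABEL] * num_test_docs
with open('shuffle_label.txt', 'w', encoding='utf-8') as f:   # file name is an assumption
    f.write('\n'.join(all_labels))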

@yao8839836
Owner

yao8839836 commented Sep 10, 2021

@yysirs Right. In that case you can only watch the validation accuracy; verifying the test accuracy requires real test labels.
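
To still get something out of the test rows, a sketch like the following extracts the predicted label and probability for each test document (variable names model, t_features, test_mask and class_list are assumed from the training script in this thread):

import torch
import torch.nn.functional as F

model.eval()
with torch.no_grad():
    logits = model(t_features)          # logits for every node in the graph
    probs = F.softmax(logits, dim=1)    # per-class P values
    preds = torch.max(logits, 1)[1]     # predicted class index per node

for i in range(len(test_mask)):
    if test_mask[i]:
        print(class_list[preds[i].item()], probs[i].max().item())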

@yysirs
Author

yysirs commented Sep 10, 2021

OK, I'll give it a try. Thanks for the reply.

@yysirs
Author

yysirs commented Sep 11, 2021

Hi, I tried training on another dataset, but the train and val accuracy both reach 100% very quickly. I compared my build_graph.py against the original and couldn't find any difference. Where might the problem be?
[screenshot: training log]
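
One sanity check I could run (just my own idea, not something from the repo) is whether the index files written by build_graph.py really contain one distinct index per document; duplicated or overlapping indices would make the reported accuracies meaningless:

# Sanity check: every document should get its own index, and train/test must not overlap.
def load_index(path):
    with open(path, encoding='utf-8') as f:
        return [int(line) for line in f if line.strip()]

train_ids = load_index('data/risk.train.index')   # paths are assumptions
test_ids = load_index('data/test.index')

assert len(train_ids) == len(set(train_ids)), "duplicate train indices"
assert len(test_ids) == len(set(test_ids)), "duplicate test indices"
assert not (set(train_ids) & set(test_ids)), "train/test indices overlap"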

@yysirs
Author

yysirs commented Sep 11, 2021

model.py

#!/usr/bin/env python
import torch
import torch.nn as nn

class GraphConvolution(nn.Module):
    def __init__(self, input_dim, output_dim, support,
                 act_func=None, featureless=False, dropout_rate=0.,
                 bias=False):
        super(GraphConvolution, self).__init__()
        self.support = support
        self.featureless = featureless

        # one weight matrix per support (normalized adjacency) matrix
        for i in range(len(self.support)):
            setattr(self, 'W{}'.format(i), nn.Parameter(torch.randn(input_dim, output_dim)))

        if bias:
            self.b = nn.Parameter(torch.zeros(1, output_dim))

        self.act_func = act_func
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.dropout(x)

        for i in range(len(self.support)):
            if self.featureless:
                # featureless: the input is an identity matrix, so X @ W is just W
                pre_sup = getattr(self, 'W{}'.format(i))
            else:
                pre_sup = x.mm(getattr(self, 'W{}'.format(i)))

            if i == 0:
                out = self.support[i].mm(pre_sup)
            else:
                out += self.support[i].mm(pre_sup)

        if self.act_func is not None:
            out = self.act_func(out)

        self.embedding = out
        return out

class GCN(nn.Module):
    def __init__(self, input_dim,
                 support,
                 dropout_rate=0.,
                 num_classes=35):
        super(GCN, self).__init__()

        # two-layer GCN: input -> 200 hidden units -> num_classes
        self.layer1 = GraphConvolution(input_dim, 200, support, act_func=nn.ReLU(), featureless=True,
                                       dropout_rate=dropout_rate)
        self.layer2 = GraphConvolution(200, num_classes, support, dropout_rate=dropout_rate)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        return out

@yysirs
Author

yysirs commented Sep 11, 2021

train.py

from __future__ import division
from __future__ import print_function
from sklearn import metrics
import time
import sys
import os
import torch
import torch.nn as nn

import numpy as np

from utils import *
from gcn import GCN

from config import CONFIG
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
cfg = CONFIG()

if len(sys.argv) != 2:
    sys.exit("Use: python train.py <dataset>")

datasets = ['20ng', 'R8', 'R52', 'ohsumed', 'mr', 'r8', 'risk']
dataset = sys.argv[1]
label_list = ['5-24', '6-34', '1-1', '6-8', '10-26', '2-3', '5-22', '6-28', '8-18', '1-4', '2-6', '6-21', '7-16', '6-29', '6-20', 
              '6-15', '6-13', '9-23', '5-35', '2-33', '5-30', '1-9', '8-27', '1-10', '6-19', '3-5', '2-2', '4-7', '2-17', '5-12', 
              '6-32', '6-31', '2-25', '2-11', '2-14']

class_list = [x.strip()
              for x in open('/data1/liushu/risk_data_grand/Text_GCN/data/labels.txt', encoding='utf8').readlines()]

if dataset not in datasets:
    sys.exit("wrong dataset name")
cfg.dataset = dataset

# Set random seed
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)


# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = load_corpus(cfg.dataset)

features = sp.identity(features.shape[0])  # featureless


# Some preprocessing
features = preprocess_features(features)
if cfg.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif cfg.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, cfg.max_degree)
    num_supports = 1 + cfg.max_degree
    model_func = GCN
else:
    raise ValueError('Invalid argument for model: ' + str(cfg.model))

# Define placeholders
t_features = torch.from_numpy(features)
t_y_train = torch.from_numpy(y_train)
t_y_val = torch.from_numpy(y_val)
t_y_test = torch.from_numpy(y_test)
t_train_mask = torch.from_numpy(train_mask.astype(np.float32))
tm_train_mask = torch.transpose(torch.unsqueeze(
    t_train_mask, 0), 1, 0).repeat(1, y_train.shape[1])

t_support = []
for i in range(len(support)):
    t_support.append(torch.Tensor(support[i]))

model = model_func(input_dim=features.shape[0], support=t_support, num_classes=y_train.shape[1])

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': cfg.weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0}
]
# optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=cfg.learning_rate, eps=1e-8)
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

# Define model evaluation function
def evaluate(features, labels, mask):
    t_test = time.time()
    model.eval()
    with torch.no_grad():
        logits = model(features)
        t_mask = torch.from_numpy(np.array(mask * 1., dtype=np.float32))
        tm_mask = torch.transpose(torch.unsqueeze(
            t_mask, 0), 1, 0).repeat(1, labels.shape[1])
        loss = criterion(logits * tm_mask, torch.max(labels, 1)[1])
        pred = torch.max(logits, 1)[1]
        acc = ((pred == torch.max(labels, 1)[1]).float(
        ) * t_mask).sum().item() / t_mask.sum().item()

    return loss.numpy(), acc, pred.numpy(), labels.numpy(), (time.time() - t_test)


val_losses = []

# Train model
for epoch in range(cfg.epochs):

    t = time.time()
    
    # Forward pass
    logits = model(t_features)
    loss = criterion(logits * tm_train_mask, torch.max(t_y_train, 1)[1])
    acc = ((torch.max(logits, 1)[1] == torch.max(t_y_train, 1)[
        1]).float() * t_train_mask).sum().item() / t_train_mask.sum().item()

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Validation
    val_loss, val_acc, pred, labels, duration = evaluate(
        t_features, t_y_val, val_mask)
    val_losses.append(val_loss)

    print(f"Epoch: {epoch+1:.0f}, train_loss: {loss:.4f}, train_acc: {acc:.2%}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.2%}")
    
    if epoch > cfg.early_stopping and val_losses[-1] > np.mean(val_losses[-(cfg.early_stopping + 1):-1]):
        print("Early stopping...")
        break

test_loss, test_acc, pred, labels, test_duration = evaluate(t_features, t_y_test, test_mask)
# Testing
test_pred = []
test_labels = []
for i in range(len(test_mask)):
    if test_mask[i]:
        test_pred.append(pred[i])
        test_labels.append(np.argmax(labels[i]))

print(metrics.classification_report(
    test_labels, test_pred, digits=4, zero_division=0, target_names=class_list))

@yysirs
Author

yysirs commented Sep 11, 2021

build_graph.py

"""
@file: 建立图
"""
import random
import numpy as np
import pickle as pkl
import scipy.sparse as sp
from math import log
from nltk.corpus import wordnet as wn
import sys
random.seed(531)

word_embedding_dim = 300
word_vector_map = {}
doc_name_list = [] # all labels
doc_train_list = [] # train labels
doc_test_list = [] # test labels
doc_content_list = [] # all documents


with open('/data1/liushu/risk_data_grand/data/train.txt', 'r', encoding='utf-8') as f:
    for line_id, line in enumerate(f):
        id, sent_a, tgt = line.strip().split('\t')
        doc_train_list.append(tgt)
        doc_name_list.append(tgt)
        doc_content_list.append(sent_a)

with open('/data1/liushu/risk_data_grand/data/test.txt', 'r', encoding='utf-8') as f:
    for line_id, line in enumerate(f):
        id, sent_a = line.strip().split('\t')
        doc_test_list.append('0')
        doc_name_list.append('0')
        doc_content_list.append(sent_a)
    
train_ids = []
for train_name in doc_train_list:
    train_id = doc_name_list.index(train_name)
    train_ids.append(train_id)
print(len(train_ids))
random.shuffle(train_ids)

train_ids_str = '\n'.join(str(index) for index in train_ids)
f = open('/data1/liushu/risk_data_grand/Text_GCN/data/risk.train.index','w',encoding='utf-8')
f.write(train_ids_str)
f.close()

test_ids = []
for test_name in doc_test_list:
    test_id = doc_test_list.index(test_name)
    test_ids.append(test_id)
print(len(test_ids))
random.shuffle(test_ids)

test_ids_str = '\n'.join(str(index) for index in test_ids)
f = open('/data1/liushu/risk_data_grand/Text_GCN/data/test.index','w',encoding='utf-8')
f.write(test_ids_str)
f.close()

ids = train_ids + test_ids
shuffle_doc_name_list = [] # all labels, shuffled
shuffle_doc_words_list = [] # all documents, shuffled
for id in ids:
    shuffle_doc_name_list.append(doc_name_list[int(id)])
    shuffle_doc_words_list.append(doc_content_list[int(id)])
shuffle_doc_name_str = '\n'.join(shuffle_doc_name_list)
shuffle_doc_words_str = '\n'.join(shuffle_doc_words_list)

f = open('/data1/liushu/risk_data_grand/Text_GCN/data/shuffle_label.txt','w',encoding='utf-8')
f.write(shuffle_doc_name_str)
f.close()

f = open('/data1/liushu/risk_data_grand/Text_GCN/data/shuffle_item.txt','w',encoding='utf-8')
f.write(shuffle_doc_words_str)
f.close()

# build vocab
word_freq = {}
word_set = set()
for doc_words in shuffle_doc_words_list:
    words = doc_words.split()
    for word in words:
        word_set.add(word)
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

vocab = list(word_set)
vocab_size = len(vocab)

word_doc_list = {}

for i in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    appeared = set()
    for word in words:
        if word in appeared:
            continue
        if word in word_doc_list:
            doc_list = word_doc_list[word]
            doc_list.append(i)
            word_doc_list[word] = doc_list
        else:
            word_doc_list[word] = [i]
        appeared.add(word)

word_doc_freq = {}
for word, doc_list in word_doc_list.items():
    word_doc_freq[word] = len(doc_list)

word_id_map = {}
for i in range(vocab_size):
    word_id_map[vocab[i]] = i

vocab_str = '\n'.join(vocab)

f = open('/data1/liushu/risk_data_grand/Text_GCN/data/vocab.txt', 'w')
f.write(vocab_str)
f.close()


# label list
label_set = set()
for doc_meta in shuffle_doc_name_list:
    temp = doc_meta.split('\t')
    # label_set.add(temp[2])
    label_set.add(temp[0])
label_list = list(label_set)
label_list_str = '\n'.join(label_list)
f = open('/data1/liushu/risk_data_grand/Text_GCN/data/labels.txt', 'w')
f.write(label_list_str)
f.close()

# x: feature vectors of training docs, no initial features
# select 90% of the training docs as the real training set (the rest is validation)
train_size = len(train_ids)
val_size = int(0.1 * train_size)
real_train_size = train_size - val_size  # - int(0.5 * train_size)
# different training rates

real_train_doc_names = shuffle_doc_name_list[:real_train_size]
real_train_doc_names_str = '\n'.join(real_train_doc_names)

f = open('/data1/liushu/risk_data_grand/Text_GCN/data/real_train.name', 'w')
f.write(real_train_doc_names_str)
f.close()


row_x = []
col_x = []
data_x = []

for i in range(real_train_size):
    doc_vec = np.array([0.0 for k in range(word_embedding_dim)])
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embedding_dim):
        row_x.append(i)
        col_x.append(j)
        data_x.append(doc_vec[j]/doc_len)

x = sp.csr_matrix((data_x,(row_x,col_x)),shape=(real_train_size,word_embedding_dim))

y = [] # y.shape = [real_train_size, len(label_list)]
for i in range(real_train_size):
    doc_meta = shuffle_doc_name_list[i]
    temp = doc_meta.split('\t')
    one_hot = [0 for l in range(len(label_list))] # label_list 35
    # for label in temp[2:]:
    for label in temp:
        label_index = label_list.index(label)
        one_hot[label_index] =1
    y.append(one_hot)
y = np.array(y)

test_size = len(test_ids)
row_tx = []
col_tx = []
data_tx = []

for i in range(test_size):
    doc_vec = np.array([0.0 for k in range(word_embedding_dim)])
    doc_words = shuffle_doc_words_list[i+train_size]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embedding_dim):
        row_tx.append(i)
        col_tx.append(j)
        data_tx.append(doc_vec[j]/doc_len)

tx = sp.csr_matrix((data_tx,(row_tx,col_tx)),shape=(test_size,word_embedding_dim))

ty = [] # ty.shape = [test_size, len(label_list)]
for i in range(test_size):
    doc_meta = shuffle_doc_name_list[i+train_size]
    temp = doc_meta.split('\t')
    one_hot = [0 for l in range(len(label_list))]
    # for label in temp[2:]:
    for label in temp:
        label_index = label_list.index(label)
        one_hot[label_index] =1
    ty.append(one_hot)
ty = np.array(ty)

word_vectors = np.random.uniform(-0.01, 0.01,(vocab_size, word_embedding_dim))

for i in range(len(vocab)):
    word = vocab[i]
    if word in word_vector_map:
        vector = word_vector_map[word]
        word_vectors[i] = vector


# allx: training-document vectors followed by word-node vectors
row_allx = []
col_allx = []
data_allx = []

for i in range(train_size):
    doc_vec = np.array([0.0 for k in range(word_embedding_dim)])
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embedding_dim):
        row_allx.append(i)
        col_allx.append(j)
        data_allx.append(doc_vec[j]/doc_len)

for i in range(vocab_size):
    for j in range(word_embedding_dim):
        row_allx.append(int(i + train_size))  # word-node rows are appended after the train-document rows
        col_allx.append(j)
        data_allx.append(word_vectors.item((i, j)))


row_allx = np.array(row_allx)
col_allx = np.array(col_allx)
data_allx = np.array(data_allx)

allx = sp.csr_matrix(
    (data_allx, (row_allx, col_allx)), shape=(train_size + vocab_size, word_embedding_dim))

ally = []
for i in range(train_size):
    doc_meta = shuffle_doc_name_list[i]
    temp = doc_meta.split('\t')
    one_hot = [0 for l in range(len(label_list))]
    # for label in temp[2:]:
    for label in temp:
        label_index = label_list.index(label)
        one_hot[label_index] = 1
    ally.append(one_hot)

for i in range(vocab_size):
    one_hot = [0 for l in range(len(label_list))]
    # to change the labels of word nodes, do it here; experiments showed the change does not affect the results
    ally.append(one_hot)

ally = np.array(ally)
# (18288, 256) (18288, 95) (2257, 256) (2257, 95) (69440, 256) (69440, 95)
print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)
# all feature extraction is done

'''
Doc word heterogeneous graph
'''
window_size = 20
windows = [] # windows = [[window_1],[window_2],...[window_n]]

for doc_words in shuffle_doc_words_list:
    words = doc_words.split()
    length = len(words)
    if length <= window_size:
        windows.append(words)
    else:
        for j in range(length - window_size + 1):
            window = words[j: j + window_size]
            windows.append(window)

word_window_freq = {} # word_window_freq = {word: number of windows the word appears in}
for window in windows:
    appeared = set()
    for i in range(len(window)):
        if window[i] in appeared:
            continue
        if window[i] in word_window_freq:
            word_window_freq[window[i]] += 1
        else:
            word_window_freq[window[i]] = 1
        appeared.add(window[i])

word_pair_count = {} # word_pair_count = {'i,j': number of windows in which words i and j co-occur}; much larger than word_window_freq
for window in windows:
    for i in range(1, len(window)):
        for j in range(0, i):
            word_i = window[i]
            word_i_id = word_id_map[word_i]
            word_j = window[j]
            word_j_id = word_id_map[word_j]
            if word_i_id == word_j_id:
                continue
            word_pair_str = str(word_i_id) + ',' + str(word_j_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1
            # two orders
            word_pair_str = str(word_j_id) + ',' + str(word_i_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1

row = []
col = []
weight = []

# pmi as weights
num_window = len(windows)

for key in word_pair_count:
    temp = key.split(',')
    i = int(temp[0])
    j = int(temp[1])
    count = word_pair_count[key]
    word_freq_i = word_window_freq[vocab[i]]
    word_freq_j = word_window_freq[vocab[j]]
    pmi = log((1.0 * count / num_window) /
              (1.0 * word_freq_i * word_freq_j/(num_window * num_window)))
    if pmi <= 0:
        continue
    row.append(train_size + i)  # the first train_size rows/columns are reserved for document nodes
    col.append(train_size + j)  # consistent with the features above: document nodes first, then word-word edges
    weight.append(pmi)

# word vector cosine similarity as weights

'''
for i in range(vocab_size):
    for j in range(vocab_size):
        if vocab[i] in word_vector_map and vocab[j] in word_vector_map:
            vector_i = np.array(word_vector_map[vocab[i]])
            vector_j = np.array(word_vector_map[vocab[j]])
            similarity = 1.0 - cosine(vector_i, vector_j)
            if similarity > 0.9:
                print(vocab[i], vocab[j], similarity)
                row.append(train_size + i)
                col.append(train_size + j)
                weight.append(similarity)
'''

# doc word frequency
doc_word_freq = {} # doc_word_freq = {'doc_id,word_id': term frequency}; repeated occurrences of a word in a document are all counted

for doc_id in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[doc_id]
    words = doc_words.split()
    for word in words:
        word_id = word_id_map[word]
        doc_word_str = str(doc_id) + ',' + str(word_id)
        if doc_word_str in doc_word_freq:
            doc_word_freq[doc_word_str] += 1
        else:
            doc_word_freq[doc_word_str] = 1

for i in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_word_set = set()
    for word in words:
        if word in doc_word_set:
            continue
        j = word_id_map[word]
        key = str(i) + ',' + str(j)
        freq = doc_word_freq[key]
        if i < train_size:
            row.append(i)
        else:
            row.append(i + vocab_size)
        col.append(train_size + j)
        idf = log(1.0 * len(shuffle_doc_words_list) /
                  word_doc_freq[vocab[j]])
        weight.append(freq * idf)
        doc_word_set.add(word)

node_size = train_size + vocab_size + test_size
adj = sp.csr_matrix(
    (weight, (row, col)), shape=(node_size, node_size))

dataset = 'risk'
# dump objects
f = open("data/ind.{}.x".format(dataset), 'wb')
pkl.dump(x, f)
f.close()

f = open("data/ind.{}.y".format(dataset), 'wb')
pkl.dump(y, f)
f.close()

f = open("data/ind.{}.tx".format(dataset), 'wb')
pkl.dump(tx, f)
f.close()

f = open("data/ind.{}.ty".format(dataset), 'wb')
pkl.dump(ty, f)
f.close()

f = open("data/ind.{}.allx".format(dataset), 'wb')
pkl.dump(allx, f)
f.close()

f = open("data/ind.{}.ally".format(dataset), 'wb')
pkl.dump(ally, f)
f.close()

f = open("data/ind.{}.adj".format(dataset), 'wb')
pkl.dump(adj, f)
f.close()


@yysirs
Author

yysirs commented Sep 11, 2021

Hoping you can take a look when you have time. Many thanks!

@yysirs
Author

yysirs commented Sep 11, 2021

The train data format is:

id,content,label

The test data format is:

id,content
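
For reference, this is roughly how those files get read (a minimal sketch consistent with the split('\t') in build_graph.py above; the paths are placeholders):

# Train lines: id <TAB> content <TAB> label; test lines: id <TAB> content.
def read_split(path, has_label):
    rows = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if has_label:
                doc_id, content, label = parts
            else:
                doc_id, content = parts
                label = None                      # test set: no label available
            rows.append((doc_id, content, label))
    return rows

train_rows = read_split('data/train.txt', has_label=True)    # placeholder paths
test_rows = read_split('data/test.txt', has_label=False)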
