# DataLoaderTopic.py

import copy
import json
import os
import numpy as np
from enum import Enum
from tqdm import tqdm
from DataProcessor import clip_pad_sentence, clip_pad_context
from Vocab import Vocab
import torch
from torch.utils.data import Dataset
import pickle
import requests
import json as js

def collate_fn(batch_convs):
    """Collate a batch of samples into one ``torch.long`` tensor per field.

    The samples are transposed with ``zip(*batch_convs)`` so that the k-th
    entry of every sample ends up in the same group, and each group is turned
    into a single tensor.

    Args:
        batch_convs: sequence of samples; every sample is a sequence of fields.

    Returns:
        list of ``torch.long`` tensors, one per field that could be converted.
        A field that cannot be converted (e.g. ragged lists) is skipped with a
        ``RuntimeWarning`` naming its position, so the returned list is then
        shorter than the number of fields.
    """
    import warnings

    nn_inputs = []
    for field_idx, field_values in enumerate(zip(*batch_convs)):
        try:
            field_tensor = torch.tensor(data=field_values, dtype=torch.long)
        except (TypeError, ValueError, RuntimeError) as err:
            # Keep the original best-effort behaviour (skip the field), but say
            # which field was dropped and why instead of swallowing everything.
            warnings.warn(
                "collate_fn: dropping field {} because it cannot be converted "
                "to a long tensor ({})".format(field_idx, err),
                RuntimeWarning,
            )
            continue
        nn_inputs.append(field_tensor)
    return nn_inputs

class DataLoaderTopic():
    def __init__(self, args, dataset, vocab):
        """Load the auxiliary resources and pre-process every conversation.

        Args:
            args: run configuration; ``args.dataset`` selects the
                ``./dataset/<name>/`` directory and ``args.use_ckg`` toggles
                the concept knowledge graph.
            dataset: iterable of raw conversations.
            vocab: vocabulary object providing the topic/word/user lookups
                used by the other methods of this class.
        """
        self.args = args
        self.dataset = dataset
        self.vocab = vocab
        final_topic_path = './dataset/{}/final_topic.json'.format(self.args.dataset)
        # Use a context manager so the file handle is closed deterministically.
        with open(final_topic_path, 'r') as final_topic_file:
            self.final_topic = json.load(final_topic_file)
        self.topic_graph = self.get_topic_graph(dataset)
        self.topic_co_graph = self.get_topic_co_graph(dataset)

        if self.args.use_ckg != 0:
            # NOTE(review): get_ckg publishes its results through attributes
            # (edge_set, edge_type, ...); confirm what self.ckg is expected to hold.
            self.ckg = self.get_ckg(args.dataset)

        self.user2character_metric = self.get_user2character_metric()

        # Filled by processed_session() with one entry per usable turn.
        self.processed_data = []
        self.processed_session()

        
    def __iter__(self):
        """Return an iterator over the processed samples.

        The class defines no ``__next__``, so returning ``self`` made
        ``iter(loader)`` fail with ``TypeError``; iterate the sample list
        instead.
        """
        return iter(self.processed_data)

    def __len__(self):
        """Return how many processed samples are stored."""
        samples = self.processed_data
        return len(samples)

    def __getitem__(self, idx):
        """Return the processed sample stored at position ``idx``."""
        sample = self.processed_data[idx]
        return sample

    def get_ckg(self, dataset_name):
        """Load (or build and cache) the concept knowledge graph of a dataset.

        The graph is cached in ``./dataset/<dataset_name>/ckg.pkl`` as the list
        ``[ckg, relation2idx, other_entity2idx]``.  When the cache is missing
        the graph is built by querying the HTTP service hard-coded below once
        per topic of ``self.vocab.topic2idx`` and keeping the edges whose
        weight is at least 1 and whose two endpoints are known topics or
        known words.

        The results are published as attributes rather than returned:
        ``self.relation2idx``, ``self.other_entity2idx``, ``self.edge_set``
        (``[heads, tails]``) and ``self.edge_type`` (relation id per edge).
        Every head gets a ``self_loop`` edge and every edge also gets its
        ``<relation>_inv`` counterpart.

        Args:
            dataset_name: ``"TG-ReDial"`` or ``"PersonaChat"`` (only needed to
                pick the query language when the cache has to be built).

        Returns:
            None.

        Raises:
            ValueError: the cache is missing and ``dataset_name`` has no known
                query language.
        """
        ckg_path = "./dataset/{}/ckg.pkl".format(dataset_name)
        if os.path.exists(ckg_path):
            # NOTE(review): pickle can execute arbitrary code while loading;
            # only point this at cache files produced by this project.
            with open(ckg_path, 'rb') as ckg_file:
                dump_data = pickle.load(ckg_file)
            ckg = dump_data[0]
            relation2idx = dump_data[1]
            other_entity2idx = dump_data[2]
        else:
            languages = {"TG-ReDial": 'zh', "PersonaChat": 'en'}
            if dataset_name not in languages:
                # Previously this surfaced later as an UnboundLocalError.
                raise ValueError(
                    "no knowledge-graph language configured for dataset {!r}".format(dataset_name))
            language = languages[dataset_name]

            topic2idx = self.vocab.topic2idx
            # Build the word lookup once; membership is tested for every edge.
            known_words = set(self.vocab.word_list)

            def _is_known(label):
                return label in topic2idx or label in known_words

            ckg = dict()
            other_entity2idx = copy.deepcopy(topic2idx)
            relation2idx = {'self_loop': 0}
            for topic in tqdm(topic2idx.keys()):
                # NOTE(review): no timeout is set, so an unresponsive server blocks here.
                obj = requests.get('http://222.20.75.16:8884/c/' + language + '/' + topic + '?limit=2000').json()
                ckg.setdefault(topic, set())
                if 'error' in obj.keys():
                    # Report the topic the service could not resolve and move on.
                    print(topic)
                    continue
                for edge in obj['edges']:
                    start = edge['start']['label']
                    end = edge['end']['label']
                    relation = edge['rel']['label']
                    weight = edge['weight']

                    if weight < 1 or not _is_known(start) or not _is_known(end):
                        continue
                    ckg.setdefault(start, set()).add((relation, end))
                    if relation not in relation2idx:
                        # A relation and its inverse always get consecutive ids.
                        relation2idx[relation] = len(relation2idx)
                        relation2idx[relation + '_inv'] = len(relation2idx)
                    if start not in other_entity2idx:
                        other_entity2idx[start] = len(other_entity2idx)
                    if end not in other_entity2idx:
                        other_entity2idx[end] = len(other_entity2idx)

            dump_data = [ckg, relation2idx, other_entity2idx]
            with open(ckg_path, 'wb') as ckg_file:
                pickle.dump(dump_data, ckg_file)

        self.relation2idx = relation2idx
        self.other_entity2idx = other_entity2idx

        ckg_trans = set()
        for head, edges in ckg.items():
            head_idx = other_entity2idx[head]
            ckg_trans.add((head_idx, relation2idx['self_loop'], head_idx))
            for relation, tail in edges:
                relation_id = relation2idx[relation]
                relation_inv_id = relation2idx[relation + '_inv']
                tail_id = other_entity2idx[tail]
                ckg_trans.add((head_idx, relation_id, tail_id))
                ckg_trans.add((tail_id, relation_inv_id, head_idx))

        # Materialise the set once so heads, tails and relation ids stay aligned.
        triples = list(ckg_trans)
        self.edge_set = [[head for head, _, _ in triples],
                         [tail for _, _, tail in triples]]
        self.edge_type = [relation for _, relation, _ in triples]


    def statistic(self):
        """Print overlap statistics of topics across sessions and users.

        Three ratio dictionaries (keys ``'Y'`` / ``'N'``) are printed:

        * ``user_cross_session``: among users with more than one session, the
          share that has at least one topic occurring in several sessions.
        * ``topic_cross_session``: among the topics of those users, the share
          that occurs in more than one of the user's sessions.
        * ``chars_cross_user``: among pairs of distinct users sharing a topic,
          the share whose ``vocab.user_to_Sentidx`` entries overlap.

        A ratio whose denominator is zero is reported as ``0.0`` instead of
        raising ``ZeroDivisionError``.
        """
        def _to_ratios(counts):
            total = counts['Y'] + counts['N']
            if total == 0:
                return {'Y': 0.0, 'N': 0.0}
            return {'Y': counts['Y'] / total, 'N': counts['N'] / total}

        user2all_topic = {}
        user2session_topic = {}
        for conv in self.dataset:
            user_idx = int(conv[0])
            # The topic list is the second-to-last field; the last one is the
            # conversation id (see process / get_topic_co_graph / get_cut_graph).
            topic_list = conv[-2]
            user2all_topic.setdefault(user_idx, set()).update(topic_list)
            user2session_topic.setdefault(user_idx, []).append(topic_list)

        user_cross_session = {'Y': 0, 'N': 0}
        topic_cross_session = {'Y': 0, 'N': 0}
        for user_idx, all_topic in user2all_topic.items():
            sessions = user2session_topic[user_idx]
            if len(sessions) == 1:
                # A single session cannot exhibit cross-session reuse.
                continue
            user_cross_flag = False
            for topic_idx in all_topic:
                session_hits = sum(topic_idx in cur_topics for cur_topics in sessions)
                if session_hits > 1:
                    topic_cross_session['Y'] += 1
                    user_cross_flag = True
                else:
                    topic_cross_session['N'] += 1
            if user_cross_flag:
                user_cross_session['Y'] += 1
            else:
                user_cross_session['N'] += 1
        user_cross_session = _to_ratios(user_cross_session)
        topic_cross_session = _to_ratios(topic_cross_session)
        print('user_cross_session', user_cross_session)
        print('topic_cross_session', topic_cross_session)

        topic2user = {}
        for user_idx, topic_set in user2all_topic.items():
            # Iterate this user's own topics (not a leftover loop variable).
            for topic in topic_set:
                topic2user.setdefault(topic, set()).add(user_idx)

        chars_cross_user = {'Y': 0, 'N': 0}
        for user_set in topic2user.values():
            if len(user_set) == 1:
                continue
            profiles = [set(self.vocab.user_to_Sentidx[str(user)]) for user in user_set]
            for i in range(len(profiles)):
                # Start at i + 1 so a user is never compared with itself.
                for j in range(i + 1, len(profiles)):
                    if not profiles[i].isdisjoint(profiles[j]):
                        chars_cross_user['Y'] += 1
                    else:
                        chars_cross_user['N'] += 1
        chars_cross_user = _to_ratios(chars_cross_user)
        print('chars_cross_user', chars_cross_user)

    def processed_session(self):
        """Process every conversation longer than five fields into samples.

        The samples returned by ``process`` are appended to
        ``self.processed_data``; shorter conversations are ignored.
        """
        for conversation in tqdm(self.dataset):
            if len(conversation) <= 5:
                continue
            samples = self.process(conversation)
            self.processed_data.extend(samples)

    def process(self, conversation):
        """Turn one conversation into a list of per-turn samples.

        The conversation is read positionally: ``conversation[0]`` is the user
        id, ``conversation[1:-3]`` are the utterances, ``conversation[-3]`` the
        contexts, ``conversation[-2]`` the topics of the conversation and
        ``conversation[-1]`` the conversation id.  Every utterance is indexed
        as ``[0]`` (text sent to the word vocabulary), ``[1]`` (topic state)
        and ``[2]`` (action list).

        Args:
            conversation: one raw conversation with the layout above.

        Returns:
            list of samples, each being
            ``[user_id, context_all_idx, context_all_len, context_idx,
            context_len, state_U, state_U_len, related_topics,
            related_topics_len, a_R, a_R_len, all_topic, all_topic_len,
            topic2context, flag, response_idx]``.  ``flag`` is 1 for every
            sample except the first one of the conversation, where it is 0.
        """
        session_segs = []
        user_id = int(conversation[0])
        contexts = conversation[-3]
        all_topics = conversation[-2]
        conv_id = conversation[-1]
        all_topic, all_topic_len = clip_pad_sentence(all_topics, self.args.all_topic_num, self.args.PAD_WORD)  
        all_topic = self.vocab.topic2index(all_topic)
        utterances = conversation[1:-3]
        uttr_len = len(utterances)
        # Last non-empty action seen at utterances[i - 1]; carried over between turns.
        pv_action = []
        # NOTE(review): skip_len stays unbound for any other dataset name, which
        # would raise at the range() call below.
        if self.args.dataset == 'TG-ReDial':
            skip_len = 2
        elif self.args.dataset == 'PersonaChat':
            skip_len = 1
        for i in range(2, uttr_len, skip_len):
            # PersonaChat: skip the turn when the last action entry of this or the
            # previous utterance is '[UNK]'.
            if self.args.dataset == 'PersonaChat' and (utterances[i - 1][2][-1] == '[UNK]' or utterances[i][2][-1] == '[UNK]'):
                continue  
            response = utterances[i]
            action_R = response[2]
            # A turn without any action yields no sample.
            if action_R == []:
                continue
            resp = response[0]
            resp, resp_len = clip_pad_sentence(resp, self.args.r_max_len, self.args.PAD_WORD, sos=self.args.BOS_RESPONSE, eos=self.args.EOS_RESPONSE)
            # Keep at most history_turn context entries before turn i.
            begin_turn = max(0, i-self.args.history_turn)
            context = contexts[begin_turn:i]
            context_all, context_all_len = clip_pad_context(context, self.args.context_all_max_len, self.args.PAD_WORD, self.args.SENTENCE_SPLITER)
            context, context_len = clip_pad_context(context, self.args.context_max_len, self.args.PAD_WORD, self.args.SENTENCE_SPLITER)
            # final_topic is keyed by "<conv_id>/<i + 1>".
            final_topic_len = len(self.final_topic[str(conv_id) + '/' + str(i+1)])
            if self.args.not_topic_guide:
                # NOTE(review): when final_topic_len == 0 the slice [:-0] is empty,
                # not the full list - confirm this case cannot occur.
                state_U = response[1][:-final_topic_len]
                # topic2context[n] is the position k at which the n-th state topic
                # was found in conversation[k + 1][-2]; k only moves forward.
                topic2context = []
                k = 0
                # NOTE(review): state_U was already trimmed above, so this slice
                # trims final_topic_len entries a second time.
                for topic in state_U[:-final_topic_len]:
                    if topic in conversation[k+1][-2]:
                        topic2context.append(k)
                    else:
                        # NOTE(review): the bound allows k == len(conversation) - 1, for
                        # which conversation[k + 1] is out of range.
                        while k <= len(conversation) - 1:
                            if topic in conversation[k + 1][-2]:
                                topic2context.append(k)
                                break
                            k += 1
                for _ in range(final_topic_len):
                    topic2context.append(i - 1)
                # A position at or after the current turn triggers a recomputation
                # over the untrimmed state.
                # NOTE(review): max() raises ValueError if topic2context is empty.
                if max(topic2context) >= i:  
                    state_U = response[1]
                    
                    topic2context = []
                    k = 0
                    for topic in state_U:
                        if topic in conversation[k + 1][-2]:
                            topic2context.append(k)
                        else:
                            while k <= len(conversation) - 1:
                                if topic in conversation[k + 1][-2]:
                                    topic2context.append(k)
                                    break
                                k += 1
                assert len(state_U) == len(topic2context)
                # Drop entries before the context window and re-base the rest on begin_turn.
                topic2context = [topic2context[i]-begin_turn for i in range(len(topic2context)) if topic2context[i] >= begin_turn]
                # NOTE(review): topic2context has already been filtered and shifted by
                # the previous line, so this filter no longer tests the original positions.
                state_U = [state_U[i] for i in range(len(topic2context)) if topic2context[i] >= begin_turn]
                # Keep the last state_num positions, or right-pad with 0.
                if len(topic2context) >= self.args.state_num:
                    topic2context = topic2context[-self.args.state_num:]
                else:
                    topic2context = topic2context + [0] * (self.args.state_num - len(topic2context))
            else:
                state_U = response[1]
                # Same position search as in the other branch, but over the
                # untrimmed state; the last final_topic_len entries map to i - 1.
                topic2context = []
                k = 0
                for topic in state_U[:-final_topic_len]:
                    if topic in conversation[k+1][-2]:
                        topic2context.append(k)
                    else:
                        # NOTE(review): same possible out-of-range access as noted above.
                        while k <= len(conversation) - 1:
                            if topic in conversation[k + 1][-2]:
                                topic2context.append(k)
                                break
                            k += 1
                for _ in range(final_topic_len):
                    topic2context.append(i - 1)
                if max(topic2context) >= i:  
                    state_U = response[1]
                    # Recompute, treating only the very last state entry as
                    # belonging to position i - 1.
                    topic2context = []
                    k = 0
                    for topic in state_U[:-1]:
                        if topic in conversation[k + 1][-2]:
                            topic2context.append(k)
                        else:
                            while k <= len(conversation) - 1:
                                if topic in conversation[k + 1][-2]:
                                    topic2context.append(k)
                                    break
                                k += 1
                    topic2context.append(i - 1)
                assert len(state_U) == len(topic2context)
                topic2context = [topic2context[i]-begin_turn for i in range(len(topic2context)) if topic2context[i] >= begin_turn]
                # NOTE(review): same concern as in the other branch - the filter runs
                # on the already rewritten topic2context.
                state_U = [state_U[i] for i in range(len(topic2context)) if topic2context[i] >= begin_turn]
                if len(topic2context) >= self.args.state_num:
                    topic2context = topic2context[-self.args.state_num:]
                else:
                    topic2context = topic2context + [0] * (self.args.state_num - len(topic2context))
            state_U, state_U_len = clip_pad_sentence(state_U, self.args.state_num, self.args.PAD_WORD)  
            Seeker = utterances[i - 1]
            action_U = Seeker[2]  
            # Reuse the previous non-empty action when this utterance has none.
            if action_U != []:
                pv_action = action_U
            related_topics = self.get_related_topics(pv_action, self.args.relation_num, action_R)  
            related_topics, related_topics_len = clip_pad_sentence(related_topics, self.args.relation_num, self.args.PAD_WORD)
            # Map tokens / topics to vocabulary indices.
            context_all_idx = self.vocab.word2index(context_all)
            context_idx = self.vocab.word2index(context)
            response_idx = self.vocab.word2index(resp)
            state_U = self.vocab.topic2index(state_U)  
            related_topics = self.vocab.topic2index(related_topics)
            a_R, a_R_len = clip_pad_sentence(action_R, self.args.action_num, self.args.PAD_WORD)
            a_R = self.vocab.topic2index(a_R)
            
            
            session_segs.append([user_id, context_all_idx, context_all_len, context_idx, context_len, state_U, state_U_len, related_topics, related_topics_len, a_R, a_R_len, all_topic, all_topic_len, topic2context, 1, response_idx])  
        # Mark the first sample of the conversation by zeroing its flag field.
        if len(session_segs) != 0:
            session_segs[0][-2] = 0
        return session_segs


    def get_topic_graph(self, dataset):
        """Return the topic transition graph, building and caching it if needed.

        The graph maps a topic to the list of topics that directly followed
        it, using the last action entry of consecutive utterances.  It is
        cached in ``./dataset/<args.dataset>/topic_graph.json``.

        Args:
            dataset: iterable of raw conversations (only read when the cache
                file does not exist).

        Returns:
            dict mapping topic -> list of successor topics.
        """
        graph_path = './dataset/{}/topic_graph.json'.format(self.args.dataset)
        if os.path.exists(graph_path):
            print('load topic graph')
            with open(graph_path, 'r') as graph_file:
                return json.load(graph_file)

        print('get topic graph')
        topic_names = list(self.vocab.topic2idx.keys())
        # Only entries 0, 3 and 11.. of the topic vocabulary become graph nodes.
        node_names = topic_names[0:1] + topic_names[3:4] + topic_names[11:]
        successors = {name: set() for name in node_names}
        for data in dataset:
            for idx in range(2, len(data) - 2):
                prev_action = data[idx - 1][2]
                if len(prev_action) == 0:
                    continue
                cur_action = data[idx][2]
                if len(cur_action) == 0:
                    continue
                # A missing topic is recorded under '[UNK]'.
                last_topic = '[UNK]' if prev_action[-1] is None else prev_action[-1]
                cur_topic = '[UNK]' if cur_action[-1] is None else cur_action[-1]
                successors[last_topic].add(cur_topic)
        # Sets are not JSON serialisable, so store lists.
        topic_graph = {name: list(nexts) for name, nexts in successors.items()}
        with open(graph_path, 'w') as graph_file:
            json.dump(topic_graph, graph_file)
        return topic_graph

    def get_topic_co_graph(self, dataset):
        """Return the topic co-occurrence graph, building and caching it if needed.

        Every topic of a conversation's topic list (``data[-2]``) is linked to
        every topic of that same list, itself included.  The graph is cached
        in ``./dataset/<args.dataset>/topic_co_graph.json``.

        Args:
            dataset: iterable of raw conversations (only read when the cache
                file does not exist).

        Returns:
            dict mapping topic -> list of co-occurring topics.
        """
        graph_path = './dataset/{}/topic_co_graph.json'.format(self.args.dataset)
        if os.path.exists(graph_path):
            print('load topic graph')
            with open(graph_path, 'r') as graph_file:
                return json.load(graph_file)

        print('get topic graph')
        topic_names = list(self.vocab.topic2idx.keys())
        # Only entries 0, 3 and 11.. of the topic vocabulary become graph nodes.
        node_names = topic_names[0:1] + topic_names[3:4] + topic_names[11:]
        neighbours = {name: set() for name in node_names}
        for data in dataset:
            conversation_topics = data[-2]
            for topic in conversation_topics:
                neighbours[topic].update(conversation_topics)
        # Sets are not JSON serialisable, so store lists.
        topic_graph = {name: list(linked) for name, linked in neighbours.items()}
        with open(graph_path, 'w') as graph_file:
            json.dump(topic_graph, graph_file)
        return topic_graph

    def get_related_topics(self, action_U, relation_num, action_R):
        """Collect candidate topics reachable from the topics of ``action_U``.

        ``action_U`` is read as a flat list of ``(action_type, topic)`` pairs.
        For every pair whose action type does not contain '拒绝' (Chinese for
        "refuse"), the first ``int(2 * relation_num / len(action_U))``
        successors of the topic in ``self.topic_graph`` are appended.

        Args:
            action_U: flat list ``[type_0, topic_0, type_1, topic_1, ...]``.
            relation_num: total number of related topics aimed for.
            action_R: unused; kept so existing callers keep working.

        Returns:
            list of related topics (empty when ``action_U`` is empty).

        Raises:
            IndexError: ``action_U`` has an odd number of entries.
            KeyError: a topic is not a key of ``self.topic_graph``.
        """
        a_len = len(action_U)
        if a_len == 0:
            return []
        # Each (type, topic) pair occupies two slots, hence the factor 2.
        per_topic_quota = int(2 * relation_num / a_len)
        related_topics = []
        for k in range(0, a_len, 2):
            action_type = action_U[k]
            topic = action_U[k + 1]
            if '拒绝' in action_type:
                continue
            related_topics.extend(self.topic_graph[topic][0:per_topic_quota])
        return related_topics

    def get_cut_graph(self):
        """Return the user-topic graph, building and caching it if needed.

        For every conversation, the user (``int(conv[0])`` shifted by
        ``vocab.topic_len``) is linked to each topic of ``conv[-2]`` with the
        ``user2topic`` relation, and back with ``user2topic_inv``.  The result
        is cached in
        ``./dataset/<args.dataset>/processed_data/cut_trans.pkl``.

        Returns:
            tuple ``(edge_set, edge_type)`` where ``edge_set`` is
            ``[heads, tails]`` and ``edge_type`` holds the relation id of each
            edge, all three lists being aligned.
        """
        processed_dir = './dataset/{}/processed_data/'.format(self.args.dataset)
        # makedirs also creates missing parents and tolerates an existing directory.
        os.makedirs(processed_dir, exist_ok=True)
        cut_trans_path = './dataset/{}/processed_data/cut_trans.pkl'.format(self.args.dataset)
        if os.path.exists(cut_trans_path):
            print('load c-u-t graph')
            # NOTE(review): pickle can execute arbitrary code while loading;
            # only point this at cache files produced by this project.
            with open(cut_trans_path, 'rb') as cache_file:
                cut_graph = pickle.load(cache_file)
            return cut_graph['edge_set'], cut_graph['edge_type']

        print('create c-u-t graph')
        user2topic = self.vocab.relation2idx['user2topic']
        topic2user = self.vocab.relation2idx['user2topic_inv']
        cut_trans = set()
        for conv in self.dataset:
            # User nodes are numbered after the topic nodes.
            user_idx = int(conv[0]) + self.vocab.topic_len
            for topic in conv[-2]:
                topic_idx = self.vocab.topic2idx[topic]
                cut_trans.add((user_idx, user2topic, topic_idx))
                cut_trans.add((topic_idx, topic2user, user_idx))

        # Materialise the set once so heads, tails and relation ids stay aligned.
        triples = list(cut_trans)
        edge_set = [[head for head, _, _ in triples], [tail for _, _, tail in triples]]
        edge_type = [relation for _, relation, _ in triples]
        cut_graph = {'edge_set': edge_set, 'edge_type': edge_type}
        # Close the file explicitly so the cache is fully flushed to disk.
        with open(cut_trans_path, 'wb') as cache_file:
            pickle.dump(cut_graph, cache_file)
        return edge_set, edge_type


    def get_user2character_metric(self):
        """Build the zero-padded user -> sentence-index matrix.

        Row ``int(user)`` holds the entries of
        ``vocab.user_to_Sentidx[user]`` from column 0 onwards; unused cells
        stay 0.

        Returns:
            integer ``numpy`` array with ``vocab.n_user + 1`` rows and as many
            columns as the longest entry of ``vocab.user_to_Sentidx``.
        """
        print('create user2character metric')
        user_sentences = self.vocab.user_to_Sentidx
        widest_profile = max(len(sent_ids) for sent_ids in user_sentences.values())
        metric = np.zeros((self.vocab.n_user + 1, widest_profile), dtype=int)
        for user, sent_list in tqdm(user_sentences.items()):
            row = int(user)
            for col, sent_idx in enumerate(sent_list):
                metric[row, col] = sent_idx
        return metric

    def show_case(self):
        """Print the profile and utterances of every conversation longer than five fields.

        For each such conversation the sentences looked up in
        ``vocab.idx_to_userSent`` for the user's row of
        ``self.user2character_metric`` are printed, followed by one line per
        utterance: the turn parity (``i % 2``), the joined utterance text and
        its space-separated actions.
        """
        for conversation in self.dataset:
            if len(conversation) <= 5:
                continue
            print('\n')
            user_id = int(conversation[0])
            utterances = conversation[1:-3]
            persona_ids = list(self.user2character_metric[user_id])
            print('user profilie:')
            for persona_id in persona_ids:
                print('\t' + self.vocab.idx_to_userSent[persona_id])
            print()
            print('contexts:')
            for turn, utterance in enumerate(utterances):
                print('\t' + str(turn % 2) + ' ' + ''.join(utterance[0]) + '\t' + ' '.join(utterance[2]))


def one_hot_scatter(indice, num_classes, dtype=torch.float):
    """One-hot encode ``indice`` along a new trailing dimension.

    Args:
        indice: index tensor; the output lives on the same device.
        num_classes: size of the added trailing dimension.
        dtype: dtype of the output tensor.

    Returns:
        tensor of shape ``indice.shape + (num_classes,)`` that is zero
        everywhere except at the positions selected by ``indice``.
    """
    # scatter_ is given an int for long outputs and a float otherwise.
    one_value = 1 if dtype == torch.long else 1.0
    out_shape = list(indice.shape) + [num_classes]
    one_hot = torch.zeros(*out_shape, device=indice.device, dtype=dtype)
    one_hot.scatter_(-1, indice.unsqueeze(-1), one_value)
    return one_hot