# Vocab.py

import numpy as np
import os
import json
import pickle
import jieba
import requests
from tqdm import tqdm
from transformers import BertTokenizer


class Vocab(object):
    """Word and topic vocabularies plus user-profile sentence indices.

    On construction the word and topic lists are read from the files named in
    ``args`` (see ``get_vocab``), lookup tables are built in both directions,
    and the user-profile sentences of ``args.dataset`` are indexed (see
    ``get_userSent``).  When ``args.gpt2`` is truthy a ``BertTokenizer`` is
    additionally loaded from ``vocabulary/vocab_small.txt``.
    """

    # Markers that introduce a persona sentence in the PersonaChat/ConvAI2
    # text files; checked in this order, first match wins.
    _PERSONA_MARKERS = ('your persona: ', "partner's persona: ")

    def __init__(self, args, task='', word_vocab=False, topic_vocab=False):
        """Load the vocabularies and the user-profile indices.

        Args:
            args: Configuration object.  This class reads ``dataset``,
                ``gpt2``, the reserved-token names (``PAD_WORD``, ``BOS_PRE``,
                ``BOS_PRO``, ``UNK_WORD``) and the path templates used by
                ``get_vocab``.
            task: ``'rec'`` selects the movie vocabulary files, any other
                value selects the default ones.
            word_vocab: When true, ``item_in`` and ``len()`` refer to the
                word vocabulary.
            topic_vocab: When true (and ``word_vocab`` is false), ``item_in``
                and ``len()`` refer to the topic vocabulary.
        """
        super(Vocab, self).__init__()
        self.args = args
        (self.word_list, self.word_len,
         self.topic_list, self.topic_len) = self.get_vocab(task)
        # For duplicated entries the last position wins in the forward map.
        self.word2idx = {word: pos for pos, word in enumerate(self.word_list)}
        self.idx2word = {pos: word for word, pos in self.word2idx.items()}
        self.topic2idx = {topic: pos for pos, topic in enumerate(self.topic_list)}
        self.idx2topic = {pos: topic for topic, pos in self.topic2idx.items()}
        self.word_vocab = word_vocab
        self.topic_vocab = topic_vocab
        self.get_userSent(args.dataset)

        if self.args.gpt2:
            self.tokenizer = BertTokenizer(vocab_file='vocabulary/vocab_small.txt')
            self.vocab_size = len(self.tokenizer)
            self.pad_id = self.tokenizer.convert_tokens_to_ids('[PAD]')

    @staticmethod
    def _load_json(path):
        """Return the JSON document stored at *path* (file is closed afterwards)."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _dump_json(obj, path):
        """Write *obj* as JSON to *path* (file is closed afterwards)."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(obj, f)

    @staticmethod
    def _read_lines(path):
        """Return the lines of the UTF-8 text file *path* without trailing newlines."""
        with open(path, encoding='utf-8') as f:
            return [line.strip('\n') for line in f]

    def _build_tgredial_profiles(self, dataset_name):
        """Build ``(userSent_to_idx, user_to_Sentidx)`` from ``user2TopicSent.pkl``."""
        # NOTE: unpickling can execute arbitrary code -- only load a
        # user2TopicSent.pkl that comes from a trusted source.
        with open("./dataset/{}/user2TopicSent.pkl".format(dataset_name), 'rb') as f:
            user_to_topic_sents = pickle.load(f)

        userSent_to_idx = {'[PAD]': 0}
        for sent_set in user_to_topic_sents.values():
            for sent in sent_set:
                # New sentences receive the next free index; known ones keep theirs.
                userSent_to_idx.setdefault(sent, len(userSent_to_idx))

        user_to_Sentidx = {}
        for idx, sent_set in user_to_topic_sents.items():
            # Look the user up under the same (int) key it is stored under.
            user_to_Sentidx.setdefault(int(idx), []).extend(
                userSent_to_idx[sent] for sent in sent_set)
        return userSent_to_idx, user_to_Sentidx

    def _build_personachat_profiles(self):
        """Build ``(userSent_to_idx, user_to_Sentidx)`` from the ConvAI2 text files."""
        all_data = []
        for path in ('dataset/PersonaChat/ConvAI2/train_both_original_no_cands.txt',
                     'dataset/PersonaChat/ConvAI2/valid_both_original_no_cands.txt'):
            with open(path, 'r', encoding='utf-8') as f:
                all_data.extend(f.readlines())

        userSent_to_idx = {'[PAD]': 0}
        user_idx = -1
        per_dialog = {}
        for line in tqdm(all_data):
            line = line.strip()
            # A line numbered "1 " starts a new entry.
            if line[:2] == '1 ':
                user_idx += 1
                per_dialog[user_idx] = []
            for marker in self._PERSONA_MARKERS:
                if marker in line:
                    sent = line[line.find(marker) + len(marker):]
                    userSent_to_idx.setdefault(sent, len(userSent_to_idx))
                    per_dialog[user_idx].append(userSent_to_idx[sent])
                    break

        # Collapse entries with the same set of sentence ids and renumber the
        # distinct sets with string keys.
        character_set = list(set(frozenset(ids) for ids in per_dialog.values()))
        user_to_Sentidx = {str(i): list(ids) for i, ids in enumerate(character_set)}
        return userSent_to_idx, user_to_Sentidx

    def get_userSent(self, dataset_name):
        """Load (or build and cache) the user-profile sentence indices.

        The two JSON caches under ``./dataset/<dataset_name>/processed_data``
        are used when both exist; otherwise the indices are built from the raw
        data of ``'TG-ReDial'`` or ``'PersonaChat'`` and written there.

        Sets ``Sentset_to_user``, ``userSent_to_idx``, ``idx_to_userSent``,
        ``user_to_Sentidx``, ``n_user`` and ``n_character`` on the instance.

        Args:
            dataset_name: Name of the dataset directory under ``./dataset``.

        Raises:
            ValueError: If no cache exists and ``dataset_name`` is not one of
                the two supported datasets.
        """
        processed_dir = './dataset/{}/processed_data'.format(dataset_name)
        userSent_to_idx_path = './dataset/{}/processed_data/userSent_to_idx.json'.format(dataset_name)
        user_to_Sentidx_path = './dataset/{}/processed_data/user_to_Sentidx.json'.format(dataset_name)

        if os.path.exists(userSent_to_idx_path) and os.path.exists(user_to_Sentidx_path):
            userSent_to_idx = self._load_json(userSent_to_idx_path)
            user_to_Sentidx = self._load_json(user_to_Sentidx_path)
        else:
            print('get user profile data')
            if dataset_name == 'TG-ReDial':
                userSent_to_idx, user_to_Sentidx = self._build_tgredial_profiles(dataset_name)
            elif dataset_name == 'PersonaChat':
                userSent_to_idx, user_to_Sentidx = self._build_personachat_profiles()
            else:
                raise ValueError(
                    "no cached user profile data and unsupported dataset {!r}".format(dataset_name))
            os.makedirs(processed_dir, exist_ok=True)
            self._dump_json(userSent_to_idx, userSent_to_idx_path)
            self._dump_json(user_to_Sentidx, user_to_Sentidx_path)

        # NOTE(review): a fresh TG-ReDial build keeps int user keys in memory,
        # while a reload from the JSON cache yields str keys (JSON object keys
        # are strings) -- confirm which form callers expect.
        self.Sentset_to_user = {frozenset(ids): user for user, ids in user_to_Sentidx.items()}
        self.userSent_to_idx = userSent_to_idx
        self.idx_to_userSent = {v: k for k, v in userSent_to_idx.items()}
        self.user_to_Sentidx = user_to_Sentidx
        # Largest user id (not the number of users).
        self.n_user = max([int(i) for i in user_to_Sentidx.keys()])
        self.n_character = len(userSent_to_idx)

    def get_Character2topic(self, dataset_name):
        """Load (or build and cache) the profile-sentence -> topic mapping.

        For every profile sentence the first topic (in ``topic2idx`` order)
        that equals one of its jieba full-mode tokens is chosen; failing that,
        the first topic that occurs as a substring.  Sentences without any
        match are reported on stdout and get no entry.  The result is stored
        in ``self.character2topic``.

        Args:
            dataset_name: Name of the dataset directory under ``./dataset``.
        """
        character2topic_path = './dataset/{}/processed_data/character2topic.json'.format(dataset_name)
        if os.path.exists(character2topic_path):
            character2topic = self._load_json(character2topic_path)
        else:
            character2topic = {}
            topics = list(self.topic2idx)
            for character in self.userSent_to_idx:
                if character == '[PAD]':
                    character2topic[character] = '[PAD]'
                    continue
                # Set membership instead of scanning the token list per topic.
                tokens = set(jieba.cut(character, cut_all=True))
                matched = next((t for t in topics if t in tokens), None)
                if matched is None:
                    # Fall back to plain substring matching.
                    matched = next((t for t in topics if t in character), None)
                if matched is None:
                    print('not match', character)
                else:
                    character2topic[character] = matched
            self._dump_json(character2topic, character2topic_path)
        self.character2topic = character2topic

    def get_vocab(self, task):
        """Read the word and topic vocabularies for *task*.

        The topic vocabulary is prefixed with the four reserved tokens from
        ``args`` followed by the fixed action types.

        Args:
            task: ``'rec'`` reads ``args.topic_movie_file`` and
                ``args.vocab_movie_file``; any other value reads
                ``args.topic_file`` and ``args.vocab_file``.  Each is a
                template formatted with ``args.dataset``.

        Returns:
            Tuple ``(word_vocab, word_len, topic_vocab, topic_len)``.
        """
        action_type = ['谈论', '拒绝', '请求推荐', '允许推荐', '推荐电影', '反馈', '反馈，结束']
        reserved_words = [self.args.PAD_WORD, self.args.BOS_PRE, self.args.BOS_PRO, self.args.UNK_WORD]

        # Only the templates of the selected task are read from args.
        if task == 'rec':
            topic_template = self.args.topic_movie_file
        else:
            topic_template = self.args.topic_file
        topic_vocab = reserved_words + action_type + self._read_lines(
            topic_template.format(self.args.dataset))

        if task == 'rec':
            vocab_template = self.args.vocab_movie_file
        else:
            vocab_template = self.args.vocab_file
        word_vocab = self._read_lines(vocab_template.format(self.args.dataset))

        return word_vocab, len(word_vocab), topic_vocab, len(topic_vocab)

    def word2index(self, word):
        """Map a word, or a (nested) list of words, to word indices.

        Unknown words map to the index of ``'[UNK]'`` (``None`` if that token
        is absent from the vocabulary).

        Raises:
            ValueError: If *word* is neither ``str`` nor ``list``.
        """
        if isinstance(word, list):
            return [self.word2index(w) for w in word]
        if isinstance(word, str):
            return self.word2idx.get(word, self.word2idx.get('[UNK]'))
        raise ValueError("wrong type {}".format(type(word)))

    def index2word(self, index):
        """Map an index, list or ``numpy`` array of indices to words.

        Raises:
            ValueError: If an index is not below the vocabulary size or the
                argument has an unsupported type.
        """
        if isinstance(index, np.ndarray):
            # tolist() yields plain Python ints (and nested lists).
            index = index.tolist()
        if isinstance(index, list):
            return [self.index2word(i) for i in index]
        if isinstance(index, int):
            if index < len(self.word_list):
                return self.word_list[index]
            raise ValueError("{} is out of {}".format(index, len(self.word_list)))
        raise ValueError("wrong type {}".format(type(index)))

    def topic2index(self, topic):
        """Map a topic, or a (nested) list of topics, to topic indices.

        Unknown topic strings map to the index of ``'[UNK]'``; ``None`` maps
        to the index of ``args.PAD_WORD``; an ``int`` is taken to be an index
        already and is returned unchanged.

        Raises:
            ValueError: If *topic* has an unsupported type.
        """
        if isinstance(topic, str):
            return self.topic2idx.get(topic, self.topic2idx.get('[UNK]'))
        if isinstance(topic, list):
            return [self.topic2index(t) for t in topic]
        if isinstance(topic, int):
            # Return the value itself, not the ``int`` type object.
            return topic
        if topic is None:
            return self.topic2idx.get(self.args.PAD_WORD)
        raise ValueError("wrong type {}".format(type(topic)))

    def index2topic(self, index):
        """Map an index, list or ``numpy`` array of indices to topics.

        The index equal to the vocabulary size maps to ``None``.

        Raises:
            ValueError: If an index is beyond that or the argument has an
                unsupported type.
        """
        if isinstance(index, np.ndarray):
            index = index.tolist()
        if isinstance(index, list):
            return [self.index2topic(i) for i in index]
        if isinstance(index, int):
            size = len(self.topic_list)
            if index < size:
                return self.topic_list[index]
            if index == size:
                return None
            raise ValueError("{} is out of {}".format(index, size))
        raise ValueError("wrong type {}".format(type(index)))

    def item_in(self, word):
        """Look *word* up in the vocabulary selected at construction time.

        Raises:
            ValueError: If neither ``word_vocab`` nor ``topic_vocab`` is set.
        """
        if self.word_vocab:
            return self.word2index(word)
        if self.topic_vocab:
            return self.topic2index(word)
        raise ValueError("word_vocab or topic_vocab must be true")

    def __len__(self, word=False, topic=False):
        """Return the size of the word or topic vocabulary.

        Explicit ``word``/``topic`` arguments take precedence.  Without them
        (as with the builtin ``len()``) the ``word_vocab``/``topic_vocab``
        flags of the instance decide, mirroring ``item_in``.

        Raises:
            ValueError: If no vocabulary is selected either way.
        """
        if word:
            return self.word_len
        if topic:
            return self.topic_len
        if getattr(self, 'word_vocab', False):
            return self.word_len
        if getattr(self, 'topic_vocab', False):
            return self.topic_len
        raise ValueError("word_vocab or topic_vocab must be true")

    def vocab_transfer(self):
        """Return index translation tables between the two vocabularies.

        Returns:
            ``(glo2loc, loc2glo)``: the topic index of every word in
            ``word_list`` and the word index of every topic in ``topic_list``
            (entries missing on the other side map to its ``'[UNK]'`` index).
        """
        glo2loc = [self.topic2index(word) for word in self.word_list]
        loc2glo = [self.word2index(topic) for topic in self.topic_list]
        return glo2loc, loc2glo

    def get_word_pad(self):
        """Return the word index of ``'[PAD]'``."""
        return self.word2index('[PAD]')

    def get_topic_pad(self):
        """Return the topic index of ``'[PAD]'``."""
        return self.topic2index('[PAD]')

    def topic_num(self):
        """Return the size of the topic vocabulary."""
        return self.topic_len

    def movie_num(self):
        """Return the number of topic entries located after ``'<movie>'``."""
        first_movie = self.topic2index('<movie>') + 1
        return self.topic_num() - first_movie