# DataProcessor.py

import json
import os
import pickle
import json
import copy
from tqdm import tqdm
import re
import collections
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
_lemmatizer = WordNetLemmatizer()

def tokenize(example, ppln):
    """Feed ``example`` through every callable of ``ppln`` in order.

    Each step receives the output of the previous one; the output of the
    last step is returned. An empty pipeline returns ``example`` unchanged.
    """
    result = example
    for step in ppln:
        result = step(result)
    return result


def kw_tokenize(string):
    """Tokenize ``string`` for keyword extraction.

    Pipeline: word tokenization, lower-casing, POS tagging, then reduction
    of each tagged word to its basic form via ``to_basic_form``.
    """
    keyword_pipeline = [nltk_tokenize, lower, pos_tag, to_basic_form]
    return tokenize(string, keyword_pipeline)


def simp_tokenize(string):
    """Tokenize ``string`` into lower-cased word tokens."""
    simple_pipeline = [nltk_tokenize, lower]
    return tokenize(string, simple_pipeline)


def nltk_tokenize(string):
    """Split ``string`` into tokens using ``nltk.word_tokenize``."""
    word_tokens = nltk.word_tokenize(string)
    return word_tokens


def lower(tokens):
    """Lower-case a string, or recursively every string of a nested iterable.

    A ``str`` is returned lower-cased; any other input is iterated and a
    list of the lower-cased items is returned.
    """
    if isinstance(tokens, str):
        return tokens.lower()
    return [lower(item) for item in tokens]


def pos_tag(tokens):
    """Tag ``tokens`` with part-of-speech labels using ``nltk.pos_tag``."""
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens


def to_basic_form(tokens):
    """Lemmatize a ``(word, tag)`` tuple, or every item of an iterable of them.

    Words tagged as nouns (``NN*``), verbs (``VB*``) or adjectives (``JJ*``)
    are lemmatized with the module-level WordNet lemmatizer; words with any
    other tag are returned unchanged. A non-tuple input is iterated and a
    list of results is returned.
    """
    if not isinstance(tokens, tuple):
        return [to_basic_form(item) for item in tokens]
    word, tag = tokens
    # Tag prefix -> WordNet part-of-speech code.
    for tag_prefix, wordnet_pos in (('NN', 'n'), ('VB', 'v'), ('JJ', 'a')):
        if tag.startswith(tag_prefix):
            return _lemmatizer.lemmatize(word, wordnet_pos)
    return word


class DataSet():
    def __init__(self, args, vocab):
        """Store the configuration and load topic resources.

        Args:
            args: configuration object; this method reads ``args.dataset``
                (and ``get_topics`` reads ``args.topic_file``).
            vocab: vocabulary object, stored as ``self.vocab``.

        ``self.final_topic`` is loaded from
        ``./dataset/<dataset>/final_topic.json`` when that file exists and is
        ``None`` otherwise.
        """
        super(DataSet, self).__init__()
        self.args = args
        self.vocab = vocab
        self.userSet = set()
        self.topics = self.get_topics()
        final_topic_path = './dataset/{}/final_topic.json'.format(self.args.dataset)
        if os.path.exists(final_topic_path):
            # Context manager so the handle is closed right after parsing.
            with open(final_topic_path, 'r') as final_topic_file:
                self.final_topic = json.load(final_topic_file)
        else:
            self.final_topic = None

    def get_dialog(self, task):
        """Build the train/valid/test data loaders.

        When ``self.args.processed`` is truthy the three
        ``./dataset/<dataset>/{train,valid,test}_topic.pkl`` files are loaded
        and wrapped in the loader class selected by ``task`` (``'topic'`` ->
        ``DataLoaderTopic``, ``'gene'`` -> ``DataLoaderResp``).

        Otherwise the raw data of ``self.args.dataset`` (``'TG-ReDial'`` or
        ``'PersonaChat'``) is converted, the three pickles are written, and
        ``DataLoaderTopic`` loaders are returned.
        NOTE(review): this branch ignores ``task`` and reports users
        differently from the processed branch; both kept as in the original
        code - confirm whether that is intended.

        Returns:
            ``(train_set, valid_set, test_set, users, user_count)``. In the
            processed branch ``users`` is a list of unique int user ids and
            ``user_count`` is ``max(users) + 1``; in the unprocessed branch
            they are ``self.userSet`` and its size.

        Raises:
            ValueError: for an unknown ``task`` (processed branch) or an
                unsupported dataset name (unprocessed branch).
        """
        # Imported at call time, exactly as the original code did.
        from DataLoaderTopic import DataLoaderTopic
        from DataLoaderResp import DataLoaderResp

        dataset_name = self.args.dataset
        if self.args.processed:
            train, valid, test = [
                self._load_pickle('./dataset/{}/{}_topic.pkl'.format(dataset_name, split))
                for split in ('train', 'valid', 'test')
            ]
            users = self._unique_user_ids([train, valid, test])
            user_cont = max(users) + 1
            if task == 'topic':
                loader_cls = DataLoaderTopic
            elif task == 'gene':
                loader_cls = DataLoaderResp
            else:
                raise ValueError("unknown task {!r}; expected 'topic' or 'gene'".format(task))
            train_set = loader_cls(self.args, train, self.vocab)
            valid_set = loader_cls(self.args, valid, self.vocab)
            test_set = loader_cls(self.args, test, self.vocab)
            return train_set, valid_set, test_set, users, user_cont

        if dataset_name == 'TG-ReDial':
            raw_splits = [
                self._load_pickle('dataset/TG-ReDial/{}_data.pkl'.format(split))
                for split in ('train', 'valid', 'test')
            ]
            train, valid, test = [self._convert_tgredial(raw) for raw in raw_splits]
        elif dataset_name == 'PersonaChat':
            train_data, valid_data, test_data = self._prepare_personachat_splits()
            train = self._convert_personachat(train_data)
            valid = self._convert_personachat(valid_data)
            test = self._convert_personachat(test_data)
        else:
            raise ValueError('unsupported dataset {!r}'.format(dataset_name))

        for split, convs in (('train', train), ('valid', valid), ('test', test)):
            with open('./dataset/{}/{}_topic.pkl'.format(dataset_name, split), 'wb') as f:
                pickle.dump(convs, f)
        train_set = DataLoaderTopic(self.args, train, self.vocab)
        valid_set = DataLoaderTopic(self.args, valid, self.vocab)
        test_set = DataLoaderTopic(self.args, test, self.vocab)
        return train_set, valid_set, test_set, self.userSet, len(self.userSet)

    @staticmethod
    def _load_pickle(path):
        """Load one pickle file, closing the handle afterwards."""
        # NOTE: pickle executes arbitrary code on load; only use trusted dataset files.
        with open(path, 'rb') as f:
            return pickle.load(f)

    @staticmethod
    def _unique_user_ids(datasets):
        """Return the int user ids (first item of each conversation), unique, in first-seen order."""
        users = []
        seen = set()
        for dataset in datasets:
            for data in dataset:
                # Convert BEFORE the membership test so ids stored as strings
                # are de-duplicated against the ints already collected.
                user_id = int(data[0])
                if user_id not in seen:
                    seen.add(user_id)
                    users.append(user_id)
        return users

    @staticmethod
    def _ordered_unique(items, skip_none):
        """Return ``items`` without duplicates, keeping order; optionally drop ``None``."""
        unique = []
        for item in items:
            if skip_none and item is None:
                continue
            if item not in unique:
                unique.append(item)
        return unique

    def _convert_tgredial(self, conversations):
        """Convert raw TG-ReDial conversations to the nested-list format that gets pickled.

        Each result is ``[user_id, <one entry per utterance>, contents_word,
        alltopic, conv_id]`` where an utterance entry is
        ``[tokens, states + final topics, action, [round]]``.
        """
        convs = []
        for conversation in tqdm(conversations):
            conv_id = conversation['conv_id']
            user_id = conversation['user_id']
            utterances = conversation['messages']
            topic_thread = conversation['goal_path']
            movies = conversation['mentionMovies']
            self.userSet.add(user_id)
            conv = [user_id]
            contents_word = []
            states = []
            ks = 1
            for utterance in utterances:
                utter_round = int(utterance['local_id'])
                content = utterance['content']
                first_round = utter_round == 1
                # The first round has no goal entry; [0] makes get_action return nothing.
                goal = [0] if first_round else topic_thread[utter_round]
                action, topics = self.get_action(goal, movies, utter_round)
                if '推荐电影' in action or '反馈，结束' in action:
                    action, topics = [], []
                if first_round:
                    action = []
                    final_states = list(states)
                else:
                    final_states = states + list(self.get_final_topic(conv_id, utter_round))
                word_level, _, _, ks = self.tokenize_sentence(content, movies, utter_round, ks)
                contents_word.append(word_level)
                conv.append([word_level, final_states, action, [utter_round]])
                states.extend(topics)
            conv.append(contents_word)
            conv.append(self._ordered_unique(states, skip_none=True))
            conv.append(conv_id)
            convs.append(conv)
        return convs

    def _prepare_personachat_splits(self):
        """Read raw PersonaChat text, annotate topics, and return ``(train, valid, test)`` conversation lists.

        Side effects (same as the original inline code): writes the vocabulary
        file via ``get_vocab``, sets ``self.kg_1hop`` and ``self.final_topic``.
        """
        raw_lines = []
        for path in ('dataset/PersonaChat/ConvAI2/train_both_original_no_cands.txt',
                     'dataset/PersonaChat/ConvAI2/valid_both_original_no_cands.txt'):
            with open(path, 'r') as f:
                raw_lines.extend(f.readlines())
        all_data = self.process_raw_data(raw_lines, 0)
        self.get_vocab(all_data)
        idf_dict = self.cal_idf(all_data)

        # Keep only the tail of every (relation, tail) pair: head -> [tails].
        kg_1hop_triples = self._load_pickle('dataset/PersonaChat/dict_file_1hop.pkl')
        self.kg_1hop = {
            head: [tail for relation, tail in tails]
            for head, tails in kg_1hop_triples.items()
        }
        all_data = self.get_topic(all_data, idf_dict)
        self.final_topic = self.get_all_final_topic(all_data, target_len='kg')

        # Relative frequency of every topic over the whole corpus.
        kw_counter = collections.Counter()
        for data in all_data:
            kw_counter.update(data['all_topics'])
        kw_sum = sum(kw_counter.values())
        kw_freq = {k: v / kw_sum for k, v in kw_counter.most_common()}

        # Score = mean frequency of the conversation's distinct topics.
        for data in all_data:
            distinct_topics = set(data['all_topics'])
            data['score'] = 0.
            for kw in distinct_topics:
                data['score'] += kw_freq[kw]
            # A conversation without any topic keeps score 0 instead of
            # raising ZeroDivisionError.
            if distinct_topics:
                data['score'] /= len(distinct_topics)
        all_data.sort(key=lambda x: x['score'], reverse=True)

        # Highest-scoring 500 -> test, next 5% of the remainder -> valid, rest -> train.
        test_end_id = 500
        valid_end_id = 500 + int((len(all_data) - 500) * 0.05)
        test_data = all_data[:test_end_id]
        valid_data = all_data[test_end_id:valid_end_id]
        train_data = all_data[valid_end_id:]
        return train_data, valid_data, test_data

    def _convert_personachat(self, conversations):
        """Convert annotated PersonaChat conversations to the nested-list format that gets pickled.

        Same layout as ``_convert_tgredial`` except that the user id is stored
        as a string, ``None`` topics become ``'[UNK]'`` and utterances are
        tokenized with ``chinese_tokenize_sentence``.
        """
        convs = []
        for conversation in tqdm(conversations):
            user_id = conversation['user_id']
            conv_id = conversation['conv_id']
            utterances = conversation['messages']
            topic_thread = conversation['goal_path']
            movies = conversation['mentionMovies']
            self.userSet.add(user_id)
            conv = [str(user_id)]
            contents_word = []
            states = []
            for utterance in utterances:
                utter_round = int(utterance['local_id'])
                goal = topic_thread[utter_round]
                action, topics = self.get_action(goal, movies, utter_round)
                final_states = states + list(self.get_final_topic(conv_id, utter_round))
                word_level = self.chinese_tokenize_sentence(utterance['content'])
                contents_word.append(word_level)
                conv.append([word_level, final_states, action, [utter_round]])
                states.extend('[UNK]' if topic is None else topic for topic in topics)
            conv.append(contents_word)
            conv.append(self._ordered_unique(states, skip_none=False))
            conv.append(conv_id)
            convs.append(conv)
        return convs

    def get_vocab(self, dataset):
        """Count tokens over all utterances and write the vocabulary file.

        The file ``./dataset/<dataset>/tpvocab.txt`` gets one entry per line:
        first the special tokens, then every corpus token ordered by
        descending frequency (ties broken alphabetically).
        """
        token_counts = collections.Counter()
        for data in dataset:
            for uttr in data['messages']:
                token_counts.update(simp_tokenize(uttr['content']))
        print('total vocab count: ', len(token_counts))
        sepetial_vocab = ['[PAD]', '[s_context]', '[ / s_context]', '[s_response >]', '[ / s_response]', '[sent]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
        ranked = sorted(token_counts.items(), key=lambda pair: (-pair[1], pair[0]))
        vocab = sepetial_vocab + [token for token, _ in ranked]
        with open('./dataset/{}/tpvocab.txt'.format(self.args.dataset), 'w') as f:
            f.writelines(word + '\n' for word in vocab)
        print('save vocab in vocab.txt')

    def cal_idf(self, dataset):
        """Compute an IDF score for every keyword token, treating each utterance as a document.

        Returns:
            dict mapping token -> ``log10(n_utterances / (doc_freq + 1))``.
        """
        print('get topic cal idf')
        doc_freq = collections.Counter()
        n_utterances = 0.
        for data in tqdm(dataset):
            for uttr in data['messages']:
                n_utterances += 1
                # set(): a token counts once per utterance.
                doc_freq.update(set(kw_tokenize(uttr['content'])))
        return {
            token: np.log10(n_utterances / (count + 1.))
            for token, count in doc_freq.items()
        }

    def get_topic(self, dataset, idf_dict):
        """Annotate every utterance of ``dataset`` with an extracted topic (in place).

        For each conversation this fills ``data['goal_path'][local_id]`` with
        ``[role, '谈论', topic]`` and collects the non-``None`` topics in
        ``data['all_topics']``. The topic of the previous utterance is passed
        to the extractor as ``his_topic``.

        Returns:
            The same ``dataset`` object.
        """
        print('extract topic')
        extractor = KeywordExtractor(self.args, idf_dict, self.kg_1hop)
        for data in tqdm(dataset):
            dialog = data['messages']
            found_topics = []
            data['all_topics'] = found_topics
            previous_topic = None
            for uttr in dialog:
                topic = extractor.idf_extract(uttr['content'], his_topic=previous_topic)
                previous_topic = topic
                data['goal_path'][uttr['local_id']] = [uttr['role'], '谈论', topic]
                if topic is not None:
                    found_topics.append(topic)
        return dataset

    def process_raw_data(self, raw_data: list, conv_id: int):
        """Parse raw persona-chat text lines into a list of conversation dicts.

        A line starting with ``'1 '`` (other than the very first line) opens a
        new conversation. Persona lines are collected in ``'user_profile'``;
        every other line is split on tabs into two utterances that are
        appended to ``'messages'`` with consecutive ``local_id`` values. When
        a conversation is closed its ``'user_id'`` is looked up in
        ``self.vocab`` from the set of its persona sentences.

        Args:
            raw_data: list of text lines.
            conv_id: id given to the first conversation; later ones count up.

        Returns:
            List of conversation dicts.
        """
        print('process_raw_data')

        def _new_conversation(cid, user_id):
            # Key order matches the original literal.
            return {
                'conv_id': cid,
                'messages': [],
                'goal_path': {},
                'mentionMovies': {},
                'user_id': user_id,
                'user_profile': []
            }

        def _close(conversation):
            # Resolve the user from the persona sentences, then store the conversation.
            sent_ids = frozenset([self.vocab.userSent_to_idx[sent] for sent in conversation['user_profile']])
            conversation['user_id'] = self.vocab.Sentset_to_user[sent_ids]
            data_list.append(conversation)

        data_list = []
        role = ['Recommender', 'Seeker']
        current = _new_conversation(conv_id, conv_id)
        local_id = 1
        for idx, line in enumerate(tqdm(raw_data)):
            line = line.strip()
            if idx != 0 and line[:2] == '1 ':
                _close(current)
                conv_id += 1
                local_id = 1
                current = _new_conversation(conv_id, None)
            for persona_prefix in ('your persona: ', "partner's persona: "):
                if persona_prefix in line:
                    # Keep everything after the first occurrence of the prefix.
                    current['user_profile'].append(line.partition(persona_prefix)[2])
                    break
            else:
                # Drop the leading line number, then split the two turns on the tab.
                turns = line[line.find(" ") + 1:].split('\t')
                for position in (0, 1):
                    current['messages'].append({
                        'local_id': local_id,
                        'role': role[local_id % 2],
                        'content': turns[position]
                    })
                    local_id += 1
        _close(current)
        return data_list

    def chinese_tokenize_sentence(self, sentence: str):
        """Tokenize ``sentence`` with ``simp_tokenize`` (word tokenization plus lower-casing)."""
        word_tokens = simp_tokenize(sentence)
        return word_tokens

    def tokenize_sentence(self, sentence: str, movies, turn, ks):
        """Split ``sentence`` into topic words and single characters, and number its sub-tokens.

        If ``turn`` is a key of ``movies`` the ``《...》`` span is first replaced
        by the placeholder ``<movie>``. The sentence is then consumed from the
        left: every entry of ``self.topics`` (tried in list order) or the
        movie key ``movies[turn][0]`` found at the current position becomes
        one token; otherwise a single character is taken.

        Args:
            sentence: text to tokenize.
            movies: mapping of turn -> sequence whose first item is the movie key.
            turn: round number of this utterance.
            ks: first free sub-token position.

        Returns:
            ``(processed_sentence, word2token_pad, leng, ks)`` where
            ``word2token_pad[i]`` holds the consecutive positions of token
            ``i`` right-padded with zeros to width 10 (longer tokens are not
            truncated), ``leng[i]`` is its sub-token count (``<movie>``
            counts as 3) and ``ks`` is the next free position. For
            ``turn == 1`` ``processed_sentence`` is replaced by the plain
            character list of the original sentence.
        """
        raw_sentence = sentence
        movie_key = None
        if turn in movies:
            assert "《" in sentence and "》" in sentence
            movie_key = movies[turn][0]
            con = re.sub(r'《(.*)》', '<movie>', sentence)
            split_content = con.split('<movie>')
            sentence = split_content[0] + '<movie>' + split_content[1]

        processed_sentence = []
        while sentence:
            matched = False
            for topic in self.topics:
                # Empty entries (e.g. from a blank line in the topic file) are
                # skipped: they match everywhere without consuming anything,
                # which produced empty tokens and could loop forever.
                if topic and sentence.startswith(topic):
                    matched = True
                    processed_sentence.append(topic)
                    sentence = sentence[len(topic):]
            if movie_key and sentence.startswith(movie_key):
                matched = True
                processed_sentence.append(movie_key)
                sentence = sentence[len(movie_key):]
            if not matched:
                # Nothing known starts here: emit one character.
                processed_sentence.append(sentence[0])
                sentence = sentence[1:]

        word2token = []
        for word in processed_sentence:
            length = 3 if word == '<movie>' else len(word)
            word2token.append([ks + j for j in range(length)])
            ks += length
        leng = [len(positions) for positions in word2token]
        word2token_pad = [positions + [0] * (10 - len(positions)) for positions in word2token]
        if turn == 1:
            processed_sentence = list(raw_sentence)
        return processed_sentence, word2token_pad, leng, ks

    def get_action(self, goals, movies, utter_round):
        """Turn one goal entry into a flat action list and the topics it introduces.

        ``goals[0]`` is dropped. A goal starting with ``'反馈'`` is reduced to
        its third and fourth items; a goal containing both ``'谈论'`` and
        ``'请求推荐'`` is reduced to its first two items. The remaining goal
        is read as one or two ``(action_type, topics)`` pairs.

        Returns:
            ``(action, topic_path)``: ``action`` alternates action type and
            topic (``'<movie>'`` for movie recommendations, ``'[UNK]'`` for a
            ``None`` topic); ``topic_path`` lists the topics added by the
            goal (see the branches for when rejected topics are skipped).
        """
        actions, path = [], []
        goal = goals[1:]
        if '反馈' in goal:
            assert goal[0] == '反馈'
            goal = goal[2:4]
        if '谈论' in goal and '请求推荐' in goal:
            goal = goal[:2]

        n_items = len(goal)
        if n_items == 2:
            kind, target = goal[0], goal[1]
            if '推荐电影' in kind:
                accepted = '拒绝' not in kind
                if isinstance(target, str):
                    # A single recommendation: the movie comes from `movies`.
                    movie = movies[utter_round][0]
                    actions.extend((kind, '<movie>'))
                    if accepted:
                        path.append(movie)
                elif isinstance(target, list):
                    for item in target:
                        actions.extend((kind, '<movie>'))
                        if accepted:
                            path.append(item)
            elif isinstance(target, str):
                actions.extend((kind, target))
                path.append(target)
            elif isinstance(target, list):
                for item in target:
                    actions.extend((kind, item))
                    path.append(item)
            elif target is None:
                actions.extend((kind, '[UNK]'))
                path.append(target)
        elif n_items == 4:
            for start in (0, 2):
                kind, target = goal[start], goal[start + 1]
                if '推荐电影' in kind:
                    continue
                accepted = '拒绝' not in kind
                if isinstance(target, str):
                    actions.extend((kind, target))
                    if accepted:
                        path.append(target)
                if isinstance(target, list):
                    for item in target:
                        actions.extend((kind, item))
                        if accepted:
                            path.append(item)
        return actions, path

    def get_state(self, action):
        """Split a flat ``[type, topic, type, topic, ...]`` action list by acceptance.

        Returns:
            ``(state, delete_state)``: topics whose action type does not
            contain ``'拒绝'``, and topics whose action type does.
        """
        kept, rejected = [], []
        for position in range(0, len(action), 2):
            action_type, topic = action[position], action[position + 1]
            bucket = rejected if '拒绝' in action_type else kept
            bucket.append(topic)
        return kept, rejected

    def get_final_topic(self, conv_id, utter_id):
        """Look up the final-topic list stored under ``'<conv_id>/<utter_id>'``.

        Returns:
            The entry of ``self.final_topic`` for that key, or an empty list
            when the key is absent.
        """
        identity = '/'.join((str(conv_id), str(utter_id)))
        if identity not in self.final_topic:
            return []
        return self.final_topic[identity]

    def get_all_final_topic(self, dataset, target_len=None):
        """Compute the target topic of every utterance after the first and save the result as JSON.

        Args:
            dataset: iterable of conversation dicts with ``'conv_id'``,
                ``'messages'`` and ``'goal_path'``; the last item of each
                goal-path value is that turn's topic (possibly ``None``).
            target_len: selects the strategy.
                ``None``: the last non-``None`` topic of the conversation
                (``'[UNK]'`` if there is none).
                ``'kg'``: starting at the utterance, advance while the next
                turn's topic is a ``self.kg_1hop`` neighbour of the turn
                before it; the topic where the walk stops is the target.
                Any other value is used as an integer look-ahead: the topic
                ``target_len`` turns later, or the conversation-level final
                topic when that is past the end.

        Returns:
            dict mapping ``'<conv_id>/<local_id>'`` to a one-element list. The
            same dict is written to ``./dataset/<dataset>/final_topic.json``.
        """
        print('get_all_final_topic')
        all_trans = 0.
        all_num = 0.
        all_final_topic = {}
        for data in tqdm(dataset):
            conv_id = str(data['conv_id'])
            goal_path = data['goal_path']
            # Needed by both the None and the look-ahead strategy; previously
            # it was only assigned for None, so the look-ahead fallback hit an
            # unassigned variable.
            if target_len != 'kg':
                known_topics = [topic[-1] for topic in goal_path.values() if topic[-1] is not None]
                final_topic = known_topics[-1] if known_topics else '[UNK]'
            dialog = data['messages']
            n_turns = len(dialog)
            for uttr in dialog[1:]:
                local_id = uttr['local_id']
                identity = conv_id + '/' + str(local_id)
                if target_len is None:
                    all_final_topic[identity] = [final_topic]
                elif target_len == 'kg':
                    # Pre-set so `j` is defined even if the range below is empty.
                    j = local_id
                    for j in range(local_id, n_turns + 1):
                        current_topic = goal_path[j][-1]
                        if current_topic is None:
                            break
                        previous_topic = goal_path[j - 1][-1]
                        if previous_topic not in self.kg_1hop or current_topic not in self.kg_1hop[previous_topic]:
                            break
                    all_final_topic[identity] = [goal_path[min(n_turns, j)][-1]]
                    all_trans += max(1, j + 1 - local_id)
                    all_num += 1
                else:
                    target_id = local_id + target_len
                    if target_id > n_turns:
                        all_final_topic[identity] = [final_topic]
                    else:
                        all_final_topic[identity] = [goal_path[target_id][-1]]
        with open('./dataset/{}/final_topic.json'.format(self.args.dataset), 'w') as f:
            json.dump(all_final_topic, f)
        if all_num != 0:
            print('avg trans hop is ', all_trans / all_num)
        return all_final_topic

    def get_topics(self):
        """Read the topic vocabulary, one topic per line.

        The path is ``self.args.topic_file`` formatted with
        ``self.args.dataset``; the file is read as UTF-8. Only the trailing
        newline of each line is removed.

        Returns:
            List of topic strings in file order.
        """
        topic_path = self.args.topic_file.format(self.args.dataset)
        # Context manager: the original never closed this handle.
        with open(topic_path, encoding='utf-8') as topic_file:
            return [line.strip('\n') for line in topic_file]

    def get_sparsity(self):
        """Print the sparsity of the user-topic interaction matrix.

        Loads the three processed pickles, marks every (user, topic) pair
        taken from each conversation's first and second-to-last items, and
        prints ``1 - interactions / (n_distinct_users * topic_len)``.
        """
        splits = []
        for path_template in ('./dataset/{}/train_topic.pkl',
                              './dataset/{}/valid_topic.pkl',
                              './dataset/{}/test_topic.pkl'):
            with open(path_template.format(self.args.dataset), 'rb+') as split_file:
                splits.append(pickle.load(split_file))
        data_all = splits[0] + splits[1] + splits[2]
        user_set = {int(data[0]) for data in data_all}
        user2topic = np.zeros((self.vocab.n_user + 1, self.vocab.topic_len))
        for data in tqdm(data_all):
            topic_ids = self.vocab.topic2index(data[-2])
            user2topic[int(data[0]), topic_ids] = 1
        all_interacions_num = user2topic.sum()
        n_cells = len(user_set) * self.vocab.topic_len
        Sparsity = 1 - all_interacions_num / n_cells
        print('Sparsity is ', Sparsity)

    def get_co_topic(self, datasets):
        """Count topic co-occurrences per conversation and pickle the result.

        For every conversation, each unordered pair of positions in its topic
        list (second-to-last item) increments the symmetric global matrix and,
        for every persona-sentence index of the conversation's user, that
        persona's matrix. The result is written to
        ``./dataset/<dataset>/processed_data/co_topic.pkl``.

        Args:
            datasets: iterable of conversation lists.

        Returns:
            None.
        """
        co_topic_path = './dataset/{}/processed_data/co_topic.pkl'.format(self.args.dataset)
        # NOTE(review): `and False` keeps the cache branch disabled, so the
        # matrices are always rebuilt. Kept as in the original - confirm.
        if os.path.exists(co_topic_path) and False:
            print('load co-occurrence topic')
            with open(co_topic_path, 'rb') as f:
                co_topic = pickle.load(f)
            co_topic_graph = co_topic['co_topic_graph']
            persona_co_topic = co_topic['persona_co_topic']
        else:
            print('create co-occurrence topic')
            topic_len = self.vocab.topic_len
            topic2idx = self.vocab.topic2idx
            co_topic_graph = np.zeros([topic_len, topic_len], dtype=np.int32)
            # NOTE(review): int8 counters wrap silently above 127 - confirm
            # that per-persona counts stay below that.
            persona_co_topic = np.zeros([self.vocab.n_character, topic_len, topic_len], dtype=np.int8)
            for dataset in datasets:
                for conv in tqdm(dataset):
                    user_idx = int(conv[0])
                    personas = self.vocab.user_to_Sentidx[str(user_idx)]
                    topic_ids = [topic2idx[topic] for topic in conv[-2]]
                    for i, idx in enumerate(topic_ids):
                        # Starts at i itself, so a topic is also paired with
                        # itself (diagonal cells are incremented twice).
                        for jdx in topic_ids[i:]:
                            co_topic_graph[idx, jdx] += 1
                            co_topic_graph[jdx, idx] += 1
                            for pid in personas:
                                persona_co_topic[pid, idx, jdx] += 1
                                persona_co_topic[pid, jdx, idx] += 1
            co_topic = {'co_topic_graph': co_topic_graph, 'persona_co_topic': persona_co_topic}
            # Context manager so the pickle is flushed and closed deterministically.
            with open(co_topic_path, 'wb') as f:
                pickle.dump(co_topic, f)

class KeywordExtractor():
    """Pick keywords/topics from an utterance using POS tags, IDF scores and a 1-hop graph."""

    def __init__(self, args, idf_dict=None, kg_1hop=None):
        """Load the candidate keyword list from ``args.topic_file``.

        Args:
            args: configuration object; ``args.topic_file`` is the path of a
                text file with one candidate keyword per line.
            idf_dict: mapping keyword -> IDF score, used by ``idf_extract``.
            kg_1hop: mapping keyword -> collection of neighbouring keywords.
        """
        self.args = args
        self.idf_dict = idf_dict
        candi_keyword_path = args.topic_file
        # Closed after reading; UTF-8 to match how DataSet.get_topics reads
        # the topic file.
        with open(candi_keyword_path, encoding='utf-8') as candi_file:
            self.candiwords = [x.strip() for x in candi_file]
        self.kg_1hop = kg_1hop

    @staticmethod
    def is_keyword_tag(tag):
        """Return True for verb (``VB*``), noun (``NN*``) and adjective (``JJ*``) tags."""
        return tag.startswith(('VB', 'NN', 'JJ'))

    @staticmethod
    def cal_tag_score(tag):
        """Return the weight of a POS tag: noun 2.0, verb 1.0, adjective 0.5, anything else 0.0."""
        for prefix, weight in (('VB', 1.), ('NN', 2.), ('JJ', 0.5)):
            if tag.startswith(prefix):
                return weight
        return 0.

    def idf_extract(self, string, con_kw=None, his_topic=None):
        """Return the best-scoring candidate keyword of ``string``, or ``None``.

        A token qualifies when its basic form is in ``self.candiwords``, its
        tag has a non-zero weight and it is not listed in ``con_kw``. Its
        score is ``tag weight * occurrences / sentence length * idf``.

        Args:
            string: the utterance text.
            con_kw: optional collection of keywords to exclude.
            his_topic: optional previous topic; candidates that are its
                neighbours in ``self.kg_1hop`` are preferred when any exist.

        Returns:
            The chosen keyword (first one on ties) or ``None`` when no token
            qualifies.
        """
        tokens = simp_tokenize(string)
        seq_len = len(tokens)
        tokens = pos_tag(tokens)
        source = kw_tokenize(string)
        candi = []
        for i, (word, tag) in enumerate(tokens):
            score = self.cal_tag_score(tag)
            lemma = source[i]
            # Cheap tag test first, list scan second.
            if score == 0. or lemma not in self.candiwords:
                continue
            if con_kw is not None and lemma in con_kw:
                continue
            score *= source.count(lemma)
            score *= 1 / seq_len
            score *= self.idf_dict[lemma]
            candi.append((lemma, score))

        if not candi:
            return None
        if his_topic is not None:
            # A previous topic without graph entry (or no graph at all) simply
            # has no neighbours; this used to raise KeyError / TypeError.
            if self.kg_1hop is not None and his_topic in self.kg_1hop:
                neighbours = self.kg_1hop[his_topic]
            else:
                neighbours = ()
            kg_candi = [(kw, score) for (kw, score) in candi if kw in neighbours]
            if kg_candi:
                candi = kg_candi
        max_idx = np.argmax([score for _, score in candi])
        return candi[max_idx][0]

    def extract(self, string):
        """Return the distinct non-empty basic forms of all noun/verb/adjective tokens of ``string``."""
        tokens = pos_tag(simp_tokenize(string))
        source = kw_tokenize(string)
        keywords = {
            source[position]
            for position, (word, tag) in enumerate(tokens)
            if source[position] and self.is_keyword_tag(tag)
        }
        return list(keywords)

def clip_pad_sentence(sentence, max_len, pad, sos=None, eos=None, save_prefix=False, pad_suffix=True, return_length=True):
    """Clip ``sentence`` to ``max_len`` items, optionally wrap it in sos/eos, and pad it.

    Args:
        sentence: list of tokens; it is not modified.
        max_len: target length of the returned list.
        pad: padding token.
        sos, eos: when ``eos`` is not ``None`` two slots are reserved and the
            clipped sentence is wrapped as ``[sos] + sentence + [eos]``.
        save_prefix: keep the first tokens when clipping (default keeps the last).
        pad_suffix: pad on the right (default) or on the left.
        return_length: also return the length before padding.

    Returns:
        ``(padded, length)`` or just ``padded`` when ``return_length`` is False.
    """
    budget = max_len - 2 if eos is not None else max_len
    # A zero budget must yield an empty sentence: `sentence[-0:]` would keep
    # everything, and a negative budget would slice from the wrong end.
    budget = max(budget, 0)
    if save_prefix:
        sentence = sentence[:budget]
    elif budget:
        sentence = sentence[-budget:]
    else:
        sentence = sentence[:0]
    if eos is not None:
        sentence = [sos] + sentence + [eos]
    length = len(sentence) if return_length else None
    padding = [pad] * (max_len - len(sentence))
    sentence = sentence + padding if pad_suffix else padding + sentence
    if not return_length:
        return sentence
    return sentence, length

def clip_pad_context(context, max_len, pad, sent, pad_suffix=True):
    """Flatten dialogue turns into one sequence of exactly ``max_len`` items.

    Every turn is followed by the separator ``sent``. A sequence longer than
    ``max_len`` keeps its last ``max_len`` items; a shorter one is padded with
    ``pad`` on the right (default) or on the left.

    Returns:
        ``(sequence, real_len)`` where ``real_len`` is the flattened length
        before clipping or padding.
    """
    flattened = []
    for turn in context:
        # In-place growth instead of rebuilding the list for every turn.
        flattened.extend(turn)
        flattened.append(sent)
    real_len = len(flattened)
    if real_len > max_len:
        # Guard max_len <= 0: `flattened[-0:]` would keep the whole sequence.
        flattened = flattened[-max_len:] if max_len > 0 else []
    else:
        padding = [pad] * (max_len - real_len)
        flattened = flattened + padding if pad_suffix else padding + flattened
    return flattened, real_len