# -*- coding: utf-8 -*-
"""
Created on Fri Jun 21 10:48:01 2019
@author: Yan
"""
# follows loosely https://medium.com/coinmonks/language-prediction-using-deep-neural-networks-42eb131444a5
import wikipedia as wiki  # only needed by the commented-out get_words helper below
import string  # only needed by the commented-out clean_word helper below
import unicodedata  # only needed by the commented-out clean_word helper below
import numpy as np
# def get_words(dict, remove_duplicates = True):
# ''' Search articles on wikipedia and output words in the article. Words are cleaned
# of punctuation, numbers and accents.
# Input: dict {language: [word_to_search, word_to_search,...],...}
# Output: dict {language: [word_in_article, word_in_article,...],...} '''
# new_dict = {}
# for language, queries in dict.items():
# wiki.set_lang(language)
# words = []
# for query in queries:
# text = wiki.WikipediaPage(title = query).content # extract text from wiki page
# text_words = text.split() # split the text into words
# text_words_clean = map(clean_word, text_words) # clean every word
# words += text_words_clean # add all the extracted words to the list
# if remove_duplicates:
# new_dict[language] = list(set(words)) # remove duplicate words in a given language
# else:
# new_dict[language] = words # don't remove duplicate words
# return new_dict
#
#
# def clean_word(word):
# ''' Clean a given word by putting it in lower case, removing punctuation, numbers and accents.
# Input: str word
# Output str cleaned_word '''
# word = word.lower() # convert everything to lowercase
# word = word.translate(str.maketrans('', '', string.punctuation)) # remove any punctuation
# word = word.translate(str.maketrans('', '', string.digits)) # remove any number
# word = unicodedata.normalize('NFKD', word).encode('ASCII', 'ignore').decode('utf-8') # remove accents
# return word
#
#
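# If the helpers above were re-enabled, clean_word would behave like this (a sketch;
# the inputs are illustrative, not from the original script):
#   clean_word('Älteste,') -> 'alteste'   (lowercased, punctuation stripped, accents removed)
# and get_words({'en': ['Moon']}) would map 'en' to the cleaned words of the 'Moon' article.
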
def word_to_array(word, max_len=12):
    ''' Convert word to numpy array, each column is a letter written in one-hot encoding.
    It only takes the first max_len letters of the word.
    Inputs: str word
            int max_len (optional)
    Output: array of size (26, max_len) encoded_word '''
    word_array = np.zeros((26, max_len))
    for i, char in enumerate(word):
        char_num = ord(char) - 97
        if 0 <= char_num < 26 and i < max_len:  # keep a-z only; index 26 would be out of bounds
            word_array[char_num, i] = 1
    return word_array
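

# A minimal sanity check for word_to_array (a hypothetical helper, not part of the
# original script); it is wired into the demo block at the bottom of the file.
def _check_word_to_array():
    arr = word_to_array('cat')
    assert arr.shape == (26, 12)
    # 'c' -> row 2, 'a' -> row 0, 't' -> row 19, in columns 0, 1, 2
    assert arr[2, 0] == 1 and arr[0, 1] == 1 and arr[19, 2] == 1
    assert arr.sum() == 3  # exactly one hot entry per encoded letter
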
def beautifyOutput(word, sentence):
    ''' Capitalize the sentence when the seed word was capitalized, and glue the
    trailing punctuation mark to the last word.
    Inputs: str word
            str sentence
    Output: str final '''
    print("sentence: ", sentence)
    sentence = sentence.rstrip()  # generation leaves trailing whitespace; drop it
    if word[:1].isupper():  # mirror the capitalization of the seed word
        capitalizeFirst = sentence[:1].upper() + sentence[1:]
    else:
        capitalizeFirst = sentence
    print("capitalizeFirst: ", capitalizeFirst)
    # connect the last punctuation mark: "word ." -> "word."
    final = capitalizeFirst
    if len(capitalizeFirst) >= 2 and capitalizeFirst[-1] in '.!?' \
            and capitalizeFirst[-2] == ' ':
        final = capitalizeFirst[:-2] + capitalizeFirst[-1]
    print("output: ", final)
    return final
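

# A minimal sanity check for beautifyOutput (a hypothetical helper, not part of the
# original script): the seed word 'The' is capitalized, so the sentence gets
# capitalized too, and the dangling period is glued to the last word.
def _check_beautifyOutput():
    assert beautifyOutput('The', 'hello world . ') == 'Hello world.'
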
# def create_data(query_dict, max_len = 12, min_len = 3, remove_duplicates = True):
# ''' Create data for words in different languages in form of numpy array to feed to a NN.
# It only keeps the words of length in (min_len,max_len) from the wikipedia pages and cleans them.
# Input: dict {language: [word_to_search, word_to_search,...],...}
# int max_len (optional)
# int min_len (optional)
# Outputs: array of size (num_words, 26, max_len) containing all the one-hot encoded words
# array of size (num_words, num_languages) containing the one-hot encoded language for each word '''
# word_dict = get_words(query_dict, remove_duplicates)
# features = []
# labels = []
# for i, (language, word_list) in enumerate(word_dict.items()):
# for word in word_list:
# if len(word) >= min_len and len(word) <= max_len:
# features.append(word_to_array(word, max_len))
# label = np.zeros(len(word_dict))
# label[i] = 1
# labels.append(label)
# features = np.stack(features)
# labels = np.stack(labels)
# return features, labels
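# If re-enabled with two languages, create_data would return arrays of shape
# (num_words, 26, 12) and (num_words, 2), ready to feed a classifier (a sketch;
# the queries are illustrative, not from the original script):
#   features, labels = create_data({'en': ['Moon'], 'fr': ['Lune']})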
# def array_to_word(arr):
# ''' Convert one-hot encoded array for a word to a string word.
# Inputs: array of size (26,max_len) one-hot encoded word '''
# word = ''
# word_len = arr.shape[1]
# for j in range(word_len):
# for i in range(26):
# if arr[i,j] == 1:
# word += chr(97 + i)
# return word
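# If re-enabled, array_to_word would invert word_to_array for padded words (a sketch):
#   array_to_word(word_to_array('cat')) -> 'cat'   (all-zero padding columns add nothing)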
# def count_words(data, labels):
# ''' Count number of words of each language in a dataset of words. '''
# for i in range(labels.shape[1]):
# num = np.sum(labels[:,i] == 1)
# print('Number of words in language ', i, ': ', num)
# print('Total number of words: ', data.shape[0])
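

# Hypothetical smoke-test entry point, not part of the original script: it runs the
# small checks defined above when the file is executed directly.
if __name__ == '__main__':
    _check_word_to_array()
    _check_beautifyOutput()
    print('All checks passed.')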