Skip to content

Commit dc097c6

Browse files
author
Klaus Strauch
committed
Retrieving cluster for a word is now more efficient
1 parent e97f436 commit dc097c6

File tree

2 files changed

+15
-35
lines changed

2 files changed

+15
-35
lines changed

load.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ def _load_tweets_pos_dependecy(dep_file):
3838
:return:
3939
tweet_data (list) : list of tweets to be parsed
4040
"""
41-
42-
tweet_data = open(dep_file).read().split('\n\n')
41+
# please leave the encoding parameter. It doesn't work on a Windows system without it.
42+
tweet_data = open(dep_file, encoding="utf-8").read().split('\n\n')
4343
tweet_data.pop()
4444

4545
return tweet_data

tweets_feature_extractor.py

+13-33
Original file line numberDiff line numberDiff line change
@@ -107,16 +107,16 @@ def get_clusters(cluster_path, stem):
107107
stem (bool) : stem word if true
108108
109109
:returns:
110-
clusters (dict) : key = cluster id, value = set of words
110+
clusters (dict) : key = word, value = cluster
111111
"""
112112

113-
clusters = defaultdict(set)
114-
115-
with open(cluster_path) as infile:
113+
clusters = {}
114+
# also doesn't work on Windows without the encoding parameter
115+
with open(cluster_path, encoding="utf-8") as infile:
116116
for line in infile:
117-
c,w,i = line.split('\t')
118-
w = stem_word(w,stem)
119-
clusters[c].add(w)
117+
cluster, word, i = line.split('\t')
118+
word = stem_word(word, stem)
119+
clusters[word] = cluster
120120

121121
return clusters
122122

@@ -137,17 +137,16 @@ def tweetclusterfeatures(texts, clusters):
137137

138138
vec = CountVectorizer(preprocessor = ' '.join, tokenizer = str.split)
139139

140-
mapped_texts = []
140+
mapped_document = []
141141

142142
for text in texts:
143143
mapped_text = []
144-
for w in text:
145-
for k,v in clusters.items():
146-
if w in v:
147-
mapped_text.append(k)
148-
mapped_texts.append(mapped_text)
144+
for word in text:
145+
if word in clusters:
146+
mapped_text.append(clusters[word])
147+
mapped_document.append(mapped_text)
149148

150-
cluster_features = vec.fit_transform(mapped_texts)
149+
cluster_features = vec.fit_transform(mapped_document)
151150

152151
return cluster_features
153152

@@ -309,24 +308,6 @@ def process_subjectivity_file(filename,stem):
309308
scores[stem_word(word,stem)] = score
310309

311310
return scores
312-
313-
# WHAT IS THIS FUNCTION DOING EXACTLY?
314-
def coalesce(token):
315-
new_tokens = []
316-
for char in token:
317-
if len(new_tokens) < 2 or char != new_tokens[-1] or char != new_tokens[-2]:
318-
new_tokens.append(char)
319-
return ''.join(new_tokens)
320-
321-
def clusters(tokens, cluster_lookup):
322-
323-
clusters = []
324-
325-
for token in tokens:
326-
if token in cluster_lookup:
327-
clusters.append(cluster_lookup[token])
328-
329-
return clusters
330311

331312
def tweet_wordnet(tweets):
332313
"""
@@ -431,5 +412,4 @@ def get_sents_dependency(texts,deps,pos_vocab,neg_vocab):
431412
dep_features = vec.fit_transform(dep_tweets)
432413

433414
return dep_features
434-
435415

0 commit comments

Comments
 (0)