@@ -107,16 +107,16 @@ def get_clusters(cluster_path, stem):
     stem (bool) : stem word if true

     :returns:
-        clusters (dict) : key = cluster id, value = set of words
+        clusters (dict) : key = word, value = cluster
     """

-    clusters = defaultdict(set)
-
-    with open(cluster_path) as infile:
+    clusters = {}
+    # also doesn't work on Windows without the encoding parameter
+    with open(cluster_path, encoding="utf-8") as infile:
         for line in infile:
-            c, w, i = line.split('\t')
-            w = stem_word(w, stem)
-            clusters[c].add(w)
+            cluster, word, i = line.split('\t')
+            word = stem_word(word, stem)
+            clusters[word] = cluster


     return clusters
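For context: each line of the cluster file is expected to carry three tab-separated fields, cluster id, word, and a count (unpacked into `i` and ignored). A minimal usage sketch of the new word -> cluster direction, assuming a hypothetical clusters.txt in that format and stemming disabled:

    # Hypothetical clusters.txt, tab-separated <cluster id> <word> <count>:
    #   01100    happy    1520
    #   01100    glad     310
    #   10111    sad      987
    clusters = get_clusters("clusters.txt", stem=False)
    print(clusters["glad"])  # "01100"
    print(clusters["sad"])   # "10111"

The old cluster id -> set-of-words layout forced a scan over every cluster per token; the inverted dict is what lets tweetclusterfeatures below do a single lookup per word.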
@@ -137,17 +137,16 @@ def tweetclusterfeatures(texts, clusters):

     vec = CountVectorizer(preprocessor=' '.join, tokenizer=str.split)

-    mapped_texts = []
+    mapped_document = []


     for text in texts:
         mapped_text = []
-        for w in text:
-            for k, v in clusters.items():
-                if w in v:
-                    mapped_text.append(k)
-        mapped_texts.append(mapped_text)
+        for word in text:
+            if word in clusters:
+                mapped_text.append(clusters[word])
+        mapped_document.append(mapped_text)

-    cluster_features = vec.fit_transform(mapped_texts)
+    cluster_features = vec.fit_transform(mapped_document)

     return cluster_features
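The nested scan over clusters.items() cost a full pass over all clusters for every token; with the inverted dict each token is one membership test. A rough end-to-end sketch under the assumptions above (tokenised tweets in, sparse bag-of-clusters matrix out):

    texts = [["glad", "today"], ["sad", "sad", "day"]]
    X = tweetclusterfeatures(texts, clusters)
    # ["glad", "today"] maps to ["01100"] (unknown words are dropped),
    # ["sad", "sad", "day"] maps to ["10111", "10111"];
    # CountVectorizer(preprocessor=' '.join, tokenizer=str.split) joins each
    # mapped tweet back into a string and counts cluster ids per tweet,
    # giving a 2 x 2 sparse count matrix here: [[1, 0], [0, 2]].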
@@ -309,24 +308,6 @@ def process_subjectivity_file(filename, stem):
         scores[stem_word(word, stem)] = score


     return scores
-
-# WHAT IS THIS FUNCTION DOING, EXACTLY?
-def coalesce(token):
-    new_tokens = []
-    for char in token:
-        if len(new_tokens) < 2 or char != new_tokens[-1] or char != new_tokens[-2]:
-            new_tokens.append(char)
-    return ''.join(new_tokens)
-
-def clusters(tokens, cluster_lookup):
-
-    clusters = []
-
-    for token in tokens:
-        if token in cluster_lookup:
-            clusters.append(cluster_lookup[token])
-
-    return clusters

 def tweet_wordnet(tweets):
     """
@@ -431,5 +412,4 @@ def get_sents_dependency(texts, deps, pos_vocab, neg_vocab):
     dep_features = vec.fit_transform(dep_tweets)

     return dep_features
-