Skip to content

Commit

Permalink
Base_Classifier wasn't unicode-ready.
Browse files Browse the repository at this point in the history
Fixed a bug where _word_set was built from the raw train_set argument even when
train_set is a file-like object rather than an iterable; it is now built from self.train_set.
emmyzero committed May 7, 2017
1 parent 7505da4 commit 61c7e47
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions textblob/classifiers.py
Original file line number | Diff line number | Diff line change
@@ -83,11 +83,11 @@ def basic_extractor(document, train_set):
el_zero = iter(train_set).next() #Infer input from first element.
except StopIteration:
return {}
if isinstance(el_zero, str):
if isinstance(el_zero, basestring):
word_features = [w for w in chain([el_zero],train_set)]
else:
try:
assert(isinstance(el_zero[0], str))
assert(isinstance(el_zero[0], basestring))
word_features = _get_words_from_dataset(chain([el_zero],train_set))
except:
raise ValueError('train_set is proabably malformed.')
@@ -136,7 +136,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **
self.train_set = self._read_data(train_set, format)
else: # train_set is a list of tuples
self.train_set = train_set
self._word_set = _get_words_from_dataset(train_set) #Keep a hidden set of unique words.
self._word_set = _get_words_from_dataset(self.train_set) #Keep a hidden set of unique words.
self.train_features = None

def _read_data(self, dataset, format=None):

0 comments on commit 61c7e47

Please sign in to comment.