diff --git a/notebook-widget/wizmap/wizmap.py b/notebook-widget/wizmap/wizmap.py index 7126381..7949848 100644 --- a/notebook-widget/wizmap/wizmap.py +++ b/notebook-widget/wizmap/wizmap.py @@ -519,6 +519,7 @@ def generate_topic_dict( svg_width=1000, svg_height=1000, ideal_tile_width=35, + stop_words: list[str] = "english", ): """Generate a topic dictionary object that encodes the topics of different regions in the embedding map across scales. @@ -530,6 +531,7 @@ def generate_topic_dict( max_zoom_scale (float): The maximal zoom scale (default to zoom x 30) svg_width (float): The approximate size of the wizmap window svg_height (float): The approximate size of the wizmap window + stop_words (str or list[str]): Stop words passed to the count vectorizer: either the string "english" or an explicit list of words. Defaults to "english". Returns: dict: A dictionary object encodes the contour plot. @@ -552,7 +554,7 @@ def generate_topic_dict( # Build the count matrix root = tree.get_node_representation() - cv = CountVectorizer(stop_words="english", ngram_range=(1, 1)) + cv = CountVectorizer(stop_words=stop_words, ngram_range=(1, 1)) count_mat = cv.fit_transform(texts) ngrams = cv.get_feature_names_out() @@ -623,6 +625,7 @@ def generate_grid_dict( image_label=None, image_url_prefix=None, opacity=None, + stop_words: list[str] = "english", ): """Generate a grid dictionary object that encodes the contour plot and the associated topics of different regions on the projected embedding space. @@ -651,6 +654,7 @@ def generate_grid_dict( image_url_prefix (str): The url prefix for all image texts opacity (float): The opacity of data points. If it is None, WizMap will dynamically adjust the opacity values. Defaults to None. + stop_words (str or list[str]): Stop words to filter out when generating topics: either the string "english" or an explicit list of words. Defaults to "english". Returns: dict: A dictionary object encodes the grid data. @@ -677,6 +681,7 @@ def generate_grid_dict( svg_width=svg_width, svg_height=svg_height, ideal_tile_width=ideal_tile_width, + stop_words=stop_words, ) # Add meta data to the final output