Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional custom stopwords. #11

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion notebook-widget/wizmap/wizmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,7 @@ def generate_topic_dict(
svg_width=1000,
svg_height=1000,
ideal_tile_width=35,
stop_words: list[str] = "english",
):
"""Generate a topic dictionary object that encodes the topics of different
regions in the embedding map across scales.
Expand All @@ -530,6 +531,7 @@ def generate_topic_dict(
max_zoom_scale (float): The maximal zoom scale (default to zoom x 30)
svg_width (float): The approximate size of the wizmap window
svg_height (float): The approximate size of the wizmap window
stop_words (str or list[str]): Stop words passed to the count vectorizer, either the string "english" (default) or a custom list of words

Returns:
dict: A dictionary object that encodes the contour plot.
Expand All @@ -552,7 +554,7 @@ def generate_topic_dict(
# Build the count matrix
root = tree.get_node_representation()

cv = CountVectorizer(stop_words="english", ngram_range=(1, 1))
cv = CountVectorizer(stop_words=stop_words, ngram_range=(1, 1))
count_mat = cv.fit_transform(texts)
ngrams = cv.get_feature_names_out()

Expand Down Expand Up @@ -623,6 +625,7 @@ def generate_grid_dict(
image_label=None,
image_url_prefix=None,
opacity=None,
stop_words: list[str] = "english",
):
"""Generate a grid dictionary object that encodes the contour plot and the
associated topics of different regions on the projected embedding space.
Expand Down Expand Up @@ -651,6 +654,7 @@ def generate_grid_dict(
image_url_prefix (str): The url prefix for all image texts
opacity (float): The opacity of data points. If it is None, WizMap will
dynamically adjust the opacity values. Defaults to None.
stop_words (str or list[str]): Stop words to filter out when generating topics, either the string "english" (default) or a custom list of words

Returns:
dict: A dictionary object that encodes the grid data.
Expand All @@ -677,6 +681,7 @@ def generate_grid_dict(
svg_width=svg_width,
svg_height=svg_height,
ideal_tile_width=ideal_tile_width,
stop_words=stop_words,
)

# Add meta data to the final output
Expand Down