Commit f36ea6e

Author: florian
Commit message: cleanup
1 parent f1f15d8, commit f36ea6e

22 files changed: +95, -248 lines

.gitignore (+1, -1)

@@ -166,5 +166,5 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

-data
+/data
 .env

CONTRIBUTING.md (-133): This file was deleted.

codecov.yaml (-9): This file was deleted.

docs/index.md (-8): This file was deleted.

docs/modules.md (-1): This file was deleted.

frontend/.gitignore (+2)

@@ -34,3 +34,5 @@ yarn-error.log*
 # typescript
 *.tsbuildinfo
 next-env.d.ts
+
+.next

frontend/.next/server/app-paths-manifest.json (-1): This file was deleted.

frontend/.next/server/interception-route-rewrite-manifest.js (-1): This file was deleted.

frontend/.next/server/middleware-manifest.json (-6): This file was deleted.

frontend/.next/server/pages-manifest.json (-1): This file was deleted.

frontend/.next/server/server-reference-manifest.js (-2): This file was deleted.

frontend/.next/server/server-reference-manifest.json (-5): This file was deleted.

frontend/.next/types/package.json (-3): This file was deleted.

mkdocs.yml (-54): This file was deleted.

poetry.toml (-2): This file was deleted.

pypi_llm/api/main.py (+4, -1)

@@ -34,7 +34,10 @@

 # Initialize vector database interface
 vector_database_interface = VectorDatabaseInterface(
-    pinecone_token=config.PINECONE_TOKEN, pinecone_index_name=config.PINECONE_INDEX_NAME, embeddings_model=model
+    pinecone_token=config.PINECONE_TOKEN,
+    pinecone_index_name=config.PINECONE_INDEX_NAME,
+    embeddings_model=model,
+    pinecone_namespace=config.PINECONE_NAMESPACE,
 )

pypi_llm/config.py (+1)

@@ -7,6 +7,7 @@
 class Config:
     DATA_DIR: Path = Path("data")
     PINECONE_INDEX_NAME = "pypi"
+    PINECONE_NAMESPACE = "ns1"
     PINECONE_TOKEN: str = field(default_factory=lambda: os.getenv("PINECONE_TOKEN"))
     EMBEDDINGS_MODEL_NAME = "all-mpnet-base-v2"
     EMBEDDINGS_DIMENSION = 768

pypi_llm/data/description_cleaner.py (+62)

@@ -0,0 +1,62 @@
+import re
+from dataclasses import dataclass
+
+import polars as pl
+from bs4 import BeautifulSoup
+
+CLEANING_FAILED = "cleaning failed!"
+
+
+@dataclass
+class DescriptionCleaner:
+    def clean(self, df: pl.DataFrame, input_col: str, output_col: str) -> pl.DataFrame:
+        df = df.with_columns(pl.col(input_col).apply(self._clean_text).alias(output_col))
+        return df
+
+    def _clean_text(self, text: str) -> str:
+        try:
+            text = self._remove_html_tags(text)
+            text = self._remove_markdown_image_links(text)
+            text = self._remove_markdown_badges(text)
+            text = self._remove_markdown_links(text)
+            text = self._remove_urls(text)
+            text = self._remove_special_markdown_characters(text)
+            text = self._remove_markdown_headers(text)
+            text = self._remove_extra_whitespaces(text)
+        except: # noqa: E722
+            return CLEANING_FAILED
+
+        return text
+
+    @staticmethod
+    def _remove_html_tags(text: str) -> str:
+        soup = BeautifulSoup(text, "lxml")
+        return soup.get_text(separator=" ")
+
+    @staticmethod
+    def _remove_markdown_image_links(text: str) -> str:
+        return re.sub(r"!\[.*?\]\(.*?\)", "", text)
+
+    @staticmethod
+    def _remove_markdown_badges(text: str) -> str:
+        return re.sub(r"\[!\[.*?\]\(.*?\)\]", "", text)
+
+    @staticmethod
+    def _remove_markdown_links(text: str) -> str:
+        return re.sub(r"\[.*?\]\(.*?\)", "", text)
+
+    @staticmethod
+    def _remove_urls(text: str) -> str:
+        return re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
+
+    @staticmethod
+    def _remove_special_markdown_characters(text: str) -> str:
+        return re.sub(r"[#*=_`]", "", text)
+
+    @staticmethod
+    def _remove_markdown_headers(text: str) -> str:
+        return re.sub(r"\n\s*#{1,6}\s*", " ", text)
+
+    @staticmethod
+    def _remove_extra_whitespaces(text: str) -> str:
+        return " ".join(text.split())

pypi_llm/data/reader.py (+18)

@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+from pathlib import Path
+
+import polars as pl
+
+
+@dataclass
+class DataReader:
+    data_dir: Path
+
+    def read(self):
+        df = pl.read_csv(self.data_dir / "pypi_dataset.csv")
+        df = df.with_columns(weekly_downloads=(pl.col("number_of_downloads") / 4).round().cast(pl.Int32))
+        df = df.drop("number_of_downloads")
+        df = df.unique(subset="name")
+        df = df.filter(~pl.col("description").is_null())
+        df = df.sort("weekly_downloads", descending=True)
+        return df
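
Usage sketch (not part of this commit), assuming Config can be instantiated directly and config.DATA_DIR contains pypi_dataset.csv:

from pypi_llm.config import Config
from pypi_llm.data.reader import DataReader

config = Config()
df = DataReader(config.DATA_DIR).read()  # deduplicated, sorted by derived weekly_downloads
print(df.select(["name", "weekly_downloads"]).head())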

pypi_llm/scripts/upsert_data.py (+1)

@@ -23,6 +23,7 @@
     pinecone_token=config.PINECONE_TOKEN,
     pinecone_index_name=config.PINECONE_INDEX_NAME,
     embeddings_model=SentenceTransformer(config.EMBEDDINGS_MODEL_NAME),
+    pinecone_namespace=config.PINECONE_NAMESPACE,
 )

 df = df.with_columns(
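
End-to-end sketch (not part of this commit; the actual script appears to do additional column handling before upserting): read the dataset, clean the descriptions, and upsert them into the configured Pinecone namespace. Column names are illustrative.

from sentence_transformers import SentenceTransformer

from pypi_llm.config import Config
from pypi_llm.data.description_cleaner import DescriptionCleaner
from pypi_llm.data.reader import DataReader
from pypi_llm.vector_database.interface import VectorDatabaseInterface

config = Config()
df = DataReader(config.DATA_DIR).read()
df = DescriptionCleaner().clean(df, input_col="description", output_col="description_cleaned")

vector_database_interface = VectorDatabaseInterface(
    pinecone_token=config.PINECONE_TOKEN,
    pinecone_index_name=config.PINECONE_INDEX_NAME,
    embeddings_model=SentenceTransformer(config.EMBEDDINGS_MODEL_NAME),
    pinecone_namespace=config.PINECONE_NAMESPACE,
)
vector_database_interface.upsert_polars(df, key_column="name", text_column="description_cleaned")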

pypi_llm/vector_database/interface.py (+6, -2)

@@ -9,13 +9,15 @@ def __init__(
         self,
         pinecone_token: str,
         pinecone_index_name: str,
+        pinecone_namespace: str,
         embeddings_model: SentenceTransformer,
         batch_size: int = 250,
     ):
         self.batch_size = batch_size
         self.model = embeddings_model
         pc = Pinecone(api_key=pinecone_token)
         self.index = pc.Index(pinecone_index_name)
+        self.pinecone_namespace = pinecone_namespace

     def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):
         df_chunks = self._split_dataframe_in_batches(df)
@@ -24,15 +26,17 @@ def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):

     def find_similar(self, query: str, top_k: int = 25) -> pl.DataFrame:
         embeddings = self.model.encode(query)
-        matches = self.index.query(namespace="ns1", vector=embeddings.tolist(), top_k=top_k, include_values=False)
+        matches = self.index.query(
+            namespace=self.pinecone_namespace, vector=embeddings.tolist(), top_k=top_k, include_values=False
+        )
         return pl.from_dicts([{"name": x["id"], "similarity": x["score"]} for x in matches["matches"]])

     def _upsert_chunk(self, chunk: pl.DataFrame, key_column: str, text_column: str):
         embeddings = self.model.encode(list(chunk[text_column]))
         vectors = [
             {"id": project_name, "values": embedding} for project_name, embedding in zip(chunk[key_column], embeddings)
         ]
-        self.index.upsert(vectors=vectors, namespace="ns1")
+        self.index.upsert(vectors=vectors, namespace=self.pinecone_namespace)

     def _split_dataframe_in_batches(self, df):
         n_chunks = (df.height + self.batch_size - 1) // self.batch_size
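
Query-side sketch (not part of this commit): find_similar now queries the namespace supplied at construction time instead of the previously hard-coded "ns1".

from sentence_transformers import SentenceTransformer

from pypi_llm.config import Config
from pypi_llm.vector_database.interface import VectorDatabaseInterface

config = Config()
vector_database_interface = VectorDatabaseInterface(
    pinecone_token=config.PINECONE_TOKEN,
    pinecone_index_name=config.PINECONE_INDEX_NAME,
    embeddings_model=SentenceTransformer(config.EMBEDDINGS_MODEL_NAME),
    pinecone_namespace=config.PINECONE_NAMESPACE,
)
df_matches = vector_database_interface.find_similar("a library for plotting charts", top_k=10)
print(df_matches)  # polars DataFrame with "name" and "similarity" columns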
