Commit f36ea6e

Author: florian
Commit message: cleanup
1 parent f1f15d8, commit f36ea6e

22 files changed: +95, -248 lines

.gitignore (+1, -1)

@@ -166,5 +166,5 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

-data
+/data
 .env

CONTRIBUTING.md (-133): This file was deleted.

codecov.yaml (-9): This file was deleted.

docs/index.md (-8): This file was deleted.

docs/modules.md (-1): This file was deleted.

frontend/.gitignore (+2)

@@ -34,3 +34,5 @@ yarn-error.log*
 # typescript
 *.tsbuildinfo
 next-env.d.ts
+
+.next

frontend/.next/server/app-paths-manifest.json (-1): This file was deleted.

frontend/.next/server/interception-route-rewrite-manifest.js (-1): This file was deleted.

frontend/.next/server/middleware-manifest.json (-6): This file was deleted.

frontend/.next/server/pages-manifest.json (-1): This file was deleted.

frontend/.next/server/server-reference-manifest.js (-2): This file was deleted.

frontend/.next/server/server-reference-manifest.json (-5): This file was deleted.

frontend/.next/types/package.json (-3): This file was deleted.

mkdocs.yml (-54): This file was deleted.

poetry.toml (-2): This file was deleted.

pypi_llm/api/main.py (+4, -1)

@@ -34,7 +34,10 @@

 # Initialize vector database interface
 vector_database_interface = VectorDatabaseInterface(
-    pinecone_token=config.PINECONE_TOKEN, pinecone_index_name=config.PINECONE_INDEX_NAME, embeddings_model=model
+    pinecone_token=config.PINECONE_TOKEN,
+    pinecone_index_name=config.PINECONE_INDEX_NAME,
+    embeddings_model=model,
+    pinecone_namespace=config.PINECONE_NAMESPACE,
 )

pypi_llm/config.py (+1)

@@ -7,6 +7,7 @@
 class Config:
     DATA_DIR: Path = Path("data")
     PINECONE_INDEX_NAME = "pypi"
+    PINECONE_NAMESPACE = "ns1"
     PINECONE_TOKEN: str = field(default_factory=lambda: os.getenv("PINECONE_TOKEN"))
     EMBEDDINGS_MODEL_NAME = "all-mpnet-base-v2"
     EMBEDDINGS_DIMENSION = 768

pypi_llm/data/description_cleaner.py (+62)

@@ -0,0 +1,62 @@
+import re
+from dataclasses import dataclass
+
+import polars as pl
+from bs4 import BeautifulSoup
+
+CLEANING_FAILED = "cleaning failed!"
+
+
+@dataclass
+class DescriptionCleaner:
+    def clean(self, df: pl.DataFrame, input_col: str, output_col: str) -> pl.DataFrame:
+        df = df.with_columns(pl.col(input_col).apply(self._clean_text).alias(output_col))
+        return df
+
+    def _clean_text(self, text: str) -> str:
+        try:
+            text = self._remove_html_tags(text)
+            text = self._remove_markdown_image_links(text)
+            text = self._remove_markdown_badges(text)
+            text = self._remove_markdown_links(text)
+            text = self._remove_urls(text)
+            text = self._remove_special_markdown_characters(text)
+            text = self._remove_markdown_headers(text)
+            text = self._remove_extra_whitespaces(text)
+        except: # noqa: E722
+            return CLEANING_FAILED
+
+        return text
+
+    @staticmethod
+    def _remove_html_tags(text: str) -> str:
+        soup = BeautifulSoup(text, "lxml")
+        return soup.get_text(separator=" ")
+
+    @staticmethod
+    def _remove_markdown_image_links(text: str) -> str:
+        return re.sub(r"!\[.*?\]\(.*?\)", "", text)
+
+    @staticmethod
+    def _remove_markdown_badges(text: str) -> str:
+        return re.sub(r"\[!\[.*?\]\(.*?\)\]", "", text)
+
+    @staticmethod
+    def _remove_markdown_links(text: str) -> str:
+        return re.sub(r"\[.*?\]\(.*?\)", "", text)
+
+    @staticmethod
+    def _remove_urls(text: str) -> str:
+        return re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
+
+    @staticmethod
+    def _remove_special_markdown_characters(text: str) -> str:
+        return re.sub(r"[#*=_`]", "", text)
+
+    @staticmethod
+    def _remove_markdown_headers(text: str) -> str:
+        return re.sub(r"\n\s*#{1,6}\s*", " ", text)
+
+    @staticmethod
+    def _remove_extra_whitespaces(text: str) -> str:
+        return " ".join(text.split())

pypi_llm/data/reader.py (+18)

@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+from pathlib import Path
+
+import polars as pl
+
+
+@dataclass
+class DataReader:
+    data_dir: Path
+
+    def read(self):
+        df = pl.read_csv(self.data_dir / "pypi_dataset.csv")
+        df = df.with_columns(weekly_downloads=(pl.col("number_of_downloads") / 4).round().cast(pl.Int32))
+        df = df.drop("number_of_downloads")
+        df = df.unique(subset="name")
+        df = df.filter(~pl.col("description").is_null())
+        df = df.sort("weekly_downloads", descending=True)
+        return df
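
Usage sketch (not part of this commit), assuming Config can be instantiated directly and config.DATA_DIR contains pypi_dataset.csv:

from pypi_llm.config import Config
from pypi_llm.data.reader import DataReader

config = Config()
df = DataReader(config.DATA_DIR).read()  # deduplicated, sorted by derived weekly_downloads
print(df.select(["name", "weekly_downloads"]).head())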

pypi_llm/scripts/upsert_data.py (+1)

@@ -23,6 +23,7 @@
     pinecone_token=config.PINECONE_TOKEN,
     pinecone_index_name=config.PINECONE_INDEX_NAME,
     embeddings_model=SentenceTransformer(config.EMBEDDINGS_MODEL_NAME),
+    pinecone_namespace=config.PINECONE_NAMESPACE,
 )

 df = df.with_columns(
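
End-to-end sketch (not part of this commit; the actual script appears to do additional column handling before upserting): read the dataset, clean the descriptions, and upsert them into the configured Pinecone namespace. Column names are illustrative.

from sentence_transformers import SentenceTransformer

from pypi_llm.config import Config
from pypi_llm.data.description_cleaner import DescriptionCleaner
from pypi_llm.data.reader import DataReader
from pypi_llm.vector_database.interface import VectorDatabaseInterface

config = Config()
df = DataReader(config.DATA_DIR).read()
df = DescriptionCleaner().clean(df, input_col="description", output_col="description_cleaned")

vector_database_interface = VectorDatabaseInterface(
    pinecone_token=config.PINECONE_TOKEN,
    pinecone_index_name=config.PINECONE_INDEX_NAME,
    embeddings_model=SentenceTransformer(config.EMBEDDINGS_MODEL_NAME),
    pinecone_namespace=config.PINECONE_NAMESPACE,
)
vector_database_interface.upsert_polars(df, key_column="name", text_column="description_cleaned")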

pypi_llm/vector_database/interface.py (+6, -2)

@@ -9,13 +9,15 @@ def __init__(
         self,
         pinecone_token: str,
         pinecone_index_name: str,
+        pinecone_namespace: str,
         embeddings_model: SentenceTransformer,
         batch_size: int = 250,
     ):
         self.batch_size = batch_size
         self.model = embeddings_model
         pc = Pinecone(api_key=pinecone_token)
         self.index = pc.Index(pinecone_index_name)
+        self.pinecone_namespace = pinecone_namespace

     def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):
         df_chunks = self._split_dataframe_in_batches(df)
@@ -24,15 +26,17 @@ def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):

     def find_similar(self, query: str, top_k: int = 25) -> pl.DataFrame:
         embeddings = self.model.encode(query)
-        matches = self.index.query(namespace="ns1", vector=embeddings.tolist(), top_k=top_k, include_values=False)
+        matches = self.index.query(
+            namespace=self.pinecone_namespace, vector=embeddings.tolist(), top_k=top_k, include_values=False
+        )
         return pl.from_dicts([{"name": x["id"], "similarity": x["score"]} for x in matches["matches"]])

     def _upsert_chunk(self, chunk: pl.DataFrame, key_column: str, text_column: str):
         embeddings = self.model.encode(list(chunk[text_column]))
         vectors = [
             {"id": project_name, "values": embedding} for project_name, embedding in zip(chunk[key_column], embeddings)
         ]
-        self.index.upsert(vectors=vectors, namespace="ns1")
+        self.index.upsert(vectors=vectors, namespace=self.pinecone_namespace)

     def _split_dataframe_in_batches(self, df):
         n_chunks = (df.height + self.batch_size - 1) // self.batch_size
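
Query-side sketch (not part of this commit): find_similar now queries the namespace supplied at construction time instead of the previously hard-coded "ns1".

from sentence_transformers import SentenceTransformer

from pypi_llm.config import Config
from pypi_llm.vector_database.interface import VectorDatabaseInterface

config = Config()
vector_database_interface = VectorDatabaseInterface(
    pinecone_token=config.PINECONE_TOKEN,
    pinecone_index_name=config.PINECONE_INDEX_NAME,
    embeddings_model=SentenceTransformer(config.EMBEDDINGS_MODEL_NAME),
    pinecone_namespace=config.PINECONE_NAMESPACE,
)
df_matches = vector_database_interface.find_similar("a library for plotting charts", top_k=10)
print(df_matches)  # polars DataFrame with "name" and "similarity" columns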
