Commit d77543b

remove start script, add blob backend
1 parent d83bb6a commit d77543b

16 files changed: +174 −144 lines

.DS_Store

6 KB (binary file not shown)

Dockerfile

+1 −5

@@ -22,14 +22,10 @@ RUN poetry install --no-interaction --no-ansi --no-root --no-dev && \
 # Copy Python code to the Docker image
 COPY pypi_scout /code/pypi_scout/

-# Copy the start script and make executable
-COPY start.sh /start.sh
-RUN chmod +x /start.sh
-
 # Make empty data directory
 RUN mkdir -p /code/data

 ENV PYTHONPATH=/code

 # Use the script as the entrypoint
-ENTRYPOINT ["/start.sh"]
+CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"]

DockerfileCPU

+1 −5

@@ -23,14 +23,10 @@ RUN pip install --no-cache-dir -r requirements-cpu.txt
 # Copy the rest of the application code
 COPY pypi_scout /code/pypi_scout/

-# Copy the start script and make it executable
-COPY start.sh /start.sh
-RUN chmod +x /start.sh
-
 # Make empty data directory
 RUN mkdir -p /code/data

 ENV PYTHONPATH=/code

 # Use the script as the entrypoint
-ENTRYPOINT ["/start.sh"]
+CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"]

docker-compose.yml

+2 −3

@@ -5,11 +5,10 @@ services:
     build:
       context: .
       dockerfile: Dockerfile
-    command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000
     ports:
       - "8000:8000"
     volumes:
-      - ./data:/data
+      - ./data:/code/data
     env_file:
       - .env

@@ -18,7 +17,7 @@ services:
       context: ./frontend
       dockerfile: Dockerfile
       args:
-        NEXT_PUBLIC_API_URL: http://localhost:8000
+        NEXT_PUBLIC_API_URL: http://localhost:8000/api
     ports:
       - "3000:3000"
     depends_on:

pypi_scout/api/main.py

+1 −1

@@ -28,7 +28,7 @@
     allow_headers=["*"],
 )

-df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+df = load_dataset(config)

 model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)

pypi_scout/api/utils.py

+29 −4

@@ -1,12 +1,37 @@
 import logging
-from pathlib import Path
+import sys

 import polars as pl

+from pypi_scout.config import Config, StorageBackend
+from pypi_scout.utils.blob_io import BlobIO
+
+
+def load_dataset(config: Config) -> pl.DataFrame:
+    dataset_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME
+
+    if dataset_path.exists():
+        logging.info(f"Found local dataset. Reading dataset from `{dataset_path}`...")
+        df = pl.read_csv(dataset_path)
+
+    elif config.STORAGE_BACKEND == StorageBackend.BLOB:
+        logging.info(
+            f"Downloading `{config.PROCESSED_DATASET_CSV_NAME}` from container `{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`..."
+        )
+        blob_io = BlobIO(
+            config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
+            config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
+            config.STORAGE_BACKEND_BLOB_KEY,
+        )
+        df = blob_io.download_csv(config.PROCESSED_DATASET_CSV_NAME)
+        logging.info("Finished downloading.")
+
+    else:
+        logging.error(
+            f"Dataset {dataset_path} not found, and config.STORAGE_BACKEND is not `BLOB`, so the dataset can't be downloaded from Azure. Terminating."
+        )
+        sys.exit(1)

-def load_dataset(path_to_dataset: Path):
-    logging.info("Loading the processed dataset...")
-    df = pl.read_csv(path_to_dataset)
     logging.info(f"Finished loading the processed dataset. Number of rows: {len(df):,}")
     logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}")
     logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}")

pypi_scout/config.py

+8 −2

@@ -27,6 +27,9 @@ class Config:
     # Dimension of the vector embeddings produced by the model. Should match the output of the model above.
     EMBEDDINGS_DIMENSION = 768

+    # Boolean to overwrite existing files. e.g. re-download the raw dataset, upload processed dataset to blob, etc.
+    OVERWRITE: bool = True
+
     # Directory where dataset files are stored.
     DATA_DIR: Path = Path("data")

@@ -53,7 +56,10 @@ class Config:
     WEIGHT_SIMILARITY = 0.8
     WEIGHT_WEEKLY_DOWNLOADS = 0.2

-    # Storage backend
+    # Storage backend configuration. Can be either StorageBackend.LOCAL or StorageBackend.BLOB.
+    # If StorageBackend.BLOB, the processed dataset will be uploaded to Blob, and the backend API
+    # will read the data from there, rather than from a local data directory. In order to use StorageBackend.BLOB,
+    # the other `STORAGE_BACKEND_BLOB_` variables need to be set as environment variables.
     STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL
     STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None
     STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None

@@ -76,4 +82,4 @@ def __post_init__(self) -> None:
                 self.STORAGE_BACKEND_BLOB_KEY,
             ]
         ):
-            raise OSError("One or more BLOB storage environment variables are missing!")
+            raise OSError("One or more BLOB storage environment variables are missing!")  # noqa: TRY003
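The StorageBackend enum itself and the way Config picks up the STORAGE_BACKEND_BLOB_* values from the environment fall outside these hunks. A hypothetical sketch of that wiring, assuming the fields are populated with os.getenv and validated in __post_init__; the field names match the diff, but the mechanism shown here is an assumption, not the actual file:

# Hypothetical sketch (not part of this commit): env-var wiring behind Config.
import os
from dataclasses import dataclass, field
from enum import Enum


class StorageBackend(Enum):
    LOCAL = "LOCAL"
    BLOB = "BLOB"


@dataclass
class Config:
    # Backend selection; STORAGE_BACKEND=BLOB in the environment switches to Azure Blob storage.
    STORAGE_BACKEND: StorageBackend = field(
        default_factory=lambda: StorageBackend(os.getenv("STORAGE_BACKEND", "LOCAL"))
    )
    STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = field(
        default_factory=lambda: os.getenv("STORAGE_BACKEND_BLOB_ACCOUNT_NAME")
    )
    STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = field(
        default_factory=lambda: os.getenv("STORAGE_BACKEND_BLOB_CONTAINER_NAME")
    )
    STORAGE_BACKEND_BLOB_KEY: str | None = field(
        default_factory=lambda: os.getenv("STORAGE_BACKEND_BLOB_KEY")
    )

    def __post_init__(self) -> None:
        # Fail fast when the blob backend is selected but credentials are incomplete.
        if self.STORAGE_BACKEND == StorageBackend.BLOB and not all(
            [
                self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
                self.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
                self.STORAGE_BACKEND_BLOB_KEY,
            ]
        ):
            raise OSError("One or more BLOB storage environment variables are missing!")

With wiring like this, switching to the blob backend amounts to exporting STORAGE_BACKEND=BLOB plus the three STORAGE_BACKEND_BLOB_* variables before running the setup script.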

pypi_scout/data/reader.py renamed to pypi_scout/data/raw_data_reader.py

+1 −1

@@ -5,7 +5,7 @@


 @dataclass
-class DataReader:
+class RawDataReader:
     """
     A class for reading and processing data from a raw PyPI dataset.
     """

pypi_scout/scripts/download_dataset.py

−61

This file was deleted.

pypi_scout/scripts/download_raw_dataset.py

+35
@@ -0,0 +1,35 @@
+import logging
+
+import gdown
+from dotenv import load_dotenv
+
+from pypi_scout.config import Config
+from pypi_scout.utils.logging import setup_logging
+
+
+def download_raw_dataset():
+    """
+    Downloads the dataset from a Google Drive link using the gdown library.
+    """
+    load_dotenv()
+    config = Config()
+
+    target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
+    if target_path.exists():
+        if not config.OVERWRITE:
+            logging.info(f"🔹 Raw dataset {target_path} from Google Drive already exists! Skipping download.")
+            return
+        else:
+            logging.info(
+                f"⤵️ Raw dataset {target_path} from Google Drive exists, but config.OVERWRITE is `true`. Overwriting..."
+            )
+
+    logging.info(f"⬇️ Downloading raw dataset from Google Drive to {target_path}...")
+    url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
+    gdown.download(url, str(target_path), quiet=False)
+    logging.info("✅ Done!")
+
+
+if __name__ == "__main__":
+    setup_logging()
+    download_raw_dataset()
pypi_scout/scripts/process_dataset.py renamed to pypi_scout/scripts/process_raw_dataset.py
@@ -1,19 +1,17 @@
 import logging
-from pathlib import Path

 import polars as pl
 from dotenv import load_dotenv

-from pypi_scout.config import Config, StorageBackend
+from pypi_scout.config import Config
 from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
-from pypi_scout.data.reader import DataReader
-from pypi_scout.utils.blob_io import BlobIO
+from pypi_scout.data.raw_data_reader import RawDataReader
 from pypi_scout.utils.logging import setup_logging


 def read_raw_dataset(path_to_raw_dataset):
     logging.info("📂 Reading the raw dataset...")
-    df = DataReader(path_to_raw_dataset).read()
+    df = RawDataReader(path_to_raw_dataset).read()
     logging.info("📊 Number of rows in the raw dataset: %s", len(df))
     logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}")
     logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}")

@@ -44,61 +42,22 @@ def clean_descriptions(df):
     return df


-def store_processed_dataset_local(df: pl.DataFrame, processed_dataset_path: Path):
+def store_processed_dataset(df, processed_dataset_path):
     logging.info("Storing the processed dataset...")
     df.write_csv(processed_dataset_path)
     logging.info("✅ Done!")


-def store_processed_dataset_blob(df: pl.DataFrame, blob_io: BlobIO, blob_name: str):
-    logging.info(f"Storing the processed dataset as {blob_name} in container '{blob_io.container_name}'...")
-    blob_io.upload_csv(df, blob_name)
-    logging.info("✅ Done!")
-
-
-def handle_for_local_backend(config: Config):
-    if (config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME).exists():
-        logging.info(f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists! Skipping.")
-        return
-
-    df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
-    if config.FRAC_DATA_TO_INCLUDE < 1.0:
-        df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
-    df = clean_descriptions(df)
-
-    store_processed_dataset_local(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
-
-
-def handle_for_blob_backend(config: Config):
-    blob_io = BlobIO(
-        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
-        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
-        config.STORAGE_BACKEND_BLOB_KEY,
-    )
-
-    if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME):
-        logging.info(
-            f"✔️ Raw dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download."
-        )
-        return
-
+def process_raw_dataset():
+    load_dotenv()
+    config = Config()
     df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
     if config.FRAC_DATA_TO_INCLUDE < 1.0:
         df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
     df = clean_descriptions(df)
-
-    store_processed_dataset_blob(df, blob_io, config.PROCESSED_DATASET_CSV_NAME)
-
-
-def process_dataset():
-    load_dotenv()
-    config = Config()
-    if config.STORAGE_BACKEND == StorageBackend.LOCAL:
-        handle_for_local_backend(config)
-    else:
-        handle_for_blob_backend(config)
+    store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)


 if __name__ == "__main__":
     setup_logging()
-    process_dataset()
+    process_raw_dataset()

pypi_scout/scripts/setup.py

+16 −4

@@ -1,18 +1,30 @@
 import argparse
+import logging

-from pypi_scout.scripts.download_dataset import download_dataset
-from pypi_scout.scripts.process_dataset import process_dataset
+from pypi_scout.scripts.download_raw_dataset import download_raw_dataset
+from pypi_scout.scripts.process_raw_dataset import process_raw_dataset
 from pypi_scout.scripts.setup_pinecone import setup_pinecone
+from pypi_scout.scripts.upload_processed_dataset import upload_processed_dataset
 from pypi_scout.scripts.upsert_data import upsert_data
 from pypi_scout.utils.logging import setup_logging


 def main(no_upsert):
     setup_logging()
+
+    logging.info("\n\nSETTING UP PINECONE -------------\n")
     setup_pinecone()
-    download_dataset()
-    process_dataset()
+
+    logging.info("\n\nDOWNLOADING RAW DATASET -------------\n")
+    download_raw_dataset()
+
+    logging.info("\n\nPROCESSING RAW DATASET -------------\n")
+    process_raw_dataset()
+
+    logging.info("\n\nUPLOADING PROCESSED DATASET -------------\n")
+    upload_processed_dataset()
     if not no_upsert:
+        logging.info("\n\nUPSERTING DATA TO PINECONE -------------\n")
         upsert_data()
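upload_processed_dataset, imported and called above, is another new script in this commit that is not included in the excerpt. Given that the blob-upload branch was removed from the processing script and that setup.py now runs this step between processing and upserting, a plausible sketch follows; it is assumed, not the actual file, and reuses Config, StorageBackend, and BlobIO from the diffs:

# Hypothetical sketch of pypi_scout/scripts/upload_processed_dataset.py (not shown in this excerpt).
import logging

import polars as pl
from dotenv import load_dotenv

from pypi_scout.config import Config, StorageBackend
from pypi_scout.utils.blob_io import BlobIO
from pypi_scout.utils.logging import setup_logging


def upload_processed_dataset():
    load_dotenv()
    config = Config()

    if config.STORAGE_BACKEND != StorageBackend.BLOB:
        logging.info("🔹 Storage backend is not BLOB. Skipping upload.")
        return

    blob_io = BlobIO(
        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
        config.STORAGE_BACKEND_BLOB_KEY,
    )

    if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME) and not config.OVERWRITE:
        logging.info("🔹 Processed dataset already exists in the container. Skipping upload.")
        return

    # Read the locally processed dataset and push it to the blob container.
    df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
    logging.info(f"⬆️ Uploading {config.PROCESSED_DATASET_CSV_NAME} to container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'...")
    blob_io.upload_csv(df, config.PROCESSED_DATASET_CSV_NAME)
    logging.info("✅ Done!")


if __name__ == "__main__":
    setup_logging()
    upload_processed_dataset()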

pypi_scout/scripts/setup_pinecone.py

+1 −1

@@ -33,7 +33,7 @@ def setup_pinecone():
         logging.info("✅ Pinecone index created successfully.")
     except PineconeApiException as e:
         if e.status == 409:
-            logging.warning(f"✔️ Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
+            logging.warning(f"🔹 Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
         else:
             logging.exception("❌ An error occurred while creating the Pinecone index.")
