Skip to content

Commit d83bb6a

Browse files
committed
wip; download blob backend
1 parent d9cc11e commit d83bb6a

File tree

7 files changed

+256
-9
lines changed

7 files changed

+256
-9
lines changed

poetry.lock

Lines changed: 109 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pypi_scout/config.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
import os
22
from dataclasses import dataclass, field
3+
from enum import Enum
34
from pathlib import Path
45

56

7+
class StorageBackend(Enum):
    """Where dataset CSV files are stored.

    LOCAL keeps files on the local filesystem under the configured data
    directory; BLOB keeps them in an Azure Blob Storage container.
    """

    LOCAL = "LOCAL"
    BLOB = "BLOB"
612
@dataclass
713
class Config:
814
# Name of the Pinecone index used for storing vector representations of the package descriptions.
@@ -47,6 +53,27 @@ class Config:
4753
WEIGHT_SIMILARITY = 0.8
4854
WEIGHT_WEEKLY_DOWNLOADS = 0.2
4955

56+
# Storage backend
57+
STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL
58+
STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None
59+
STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None
60+
STORAGE_BACKEND_BLOB_KEY: str | None = None
61+
5062
def __post_init__(self) -> None:
    """Validate environment-driven configuration after dataclass init.

    Reads ``STORAGE_BACKEND`` from the environment; when it is ``"BLOB"``,
    switches the backend and pulls the three blob-storage settings from the
    environment as well.

    Raises:
        OSError: if ``PINECONE_TOKEN`` is not set, or if the BLOB backend is
            selected but any required blob-storage variable is missing.
    """
    if not self.PINECONE_TOKEN:
        raise OSError("PINECONE_TOKEN not found in environment variables")  # noqa: TRY003

    if os.getenv("STORAGE_BACKEND") == "BLOB":
        self.STORAGE_BACKEND = StorageBackend.BLOB
        self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME = os.getenv("STORAGE_BACKEND_BLOB_ACCOUNT_NAME")
        self.STORAGE_BACKEND_BLOB_CONTAINER_NAME = os.getenv("STORAGE_BACKEND_BLOB_CONTAINER_NAME")
        self.STORAGE_BACKEND_BLOB_KEY = os.getenv("STORAGE_BACKEND_BLOB_KEY")

        # Name the missing variables explicitly so the operator does not have
        # to guess which of the three settings is absent.
        missing = [
            name
            for name, value in [
                ("STORAGE_BACKEND_BLOB_ACCOUNT_NAME", self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME),
                ("STORAGE_BACKEND_BLOB_CONTAINER_NAME", self.STORAGE_BACKEND_BLOB_CONTAINER_NAME),
                ("STORAGE_BACKEND_BLOB_KEY", self.STORAGE_BACKEND_BLOB_KEY),
            ]
            if not value
        ]
        if missing:
            raise OSError(f"Missing BLOB storage environment variables: {', '.join(missing)}")  # noqa: TRY003

pypi_scout/scripts/download_dataset.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
import gdown
44
from dotenv import load_dotenv
55

6-
from pypi_scout.config import Config
6+
from pypi_scout.config import Config, StorageBackend
7+
from pypi_scout.utils.blob_io import BlobIO
78
from pypi_scout.utils.logging import setup_logging
89

910

@@ -14,6 +15,13 @@ def download_dataset():
1415
load_dotenv()
1516
config = Config()
1617

18+
if config.STORAGE_BACKEND == StorageBackend.LOCAL:
19+
handle_for_local_backend(config)
20+
else:
21+
handle_for_blob_backend(config)
22+
23+
24+
def handle_for_local_backend(config: Config):
1725
target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
1826
if target_path.exists():
1927
logging.info(f"✔️ Raw dataset {target_path} from Google Drive already exists! Skipping download.")
@@ -25,6 +33,29 @@ def download_dataset():
2533
logging.info("✅ Done!")
2634

2735

36+
def handle_for_blob_backend(config: Config):
    """Download the raw dataset from Google Drive and upload it to Azure Blob Storage.

    Skips all work when the blob already exists in the configured container.
    The dataset is staged in a local file under ``config.DATA_DIR`` and then
    uploaded.
    """
    blob_io = BlobIO(
        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
        config.STORAGE_BACKEND_BLOB_KEY,
    )

    if blob_io.exists(config.RAW_DATASET_CSV_NAME):
        logging.info(
            f"✔️ Raw dataset {config.RAW_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download."
        )
        return

    # Make sure the staging directory exists before gdown tries to write into
    # it; gdown does not create parent directories itself.
    config.DATA_DIR.mkdir(parents=True, exist_ok=True)
    temp_target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
    logging.info("⬇️ Downloading raw dataset from Google Drive to temporary file...")
    url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
    gdown.download(url, str(temp_target_path), quiet=False)

    logging.info("Downloading done, now uploading to Blob...")
    blob_io.upload_local_csv(temp_target_path, config.RAW_DATASET_CSV_NAME)
    # NOTE(review): the local staging file is intentionally left on disk —
    # the processing step appears to read the raw CSV from DATA_DIR even in
    # BLOB mode; confirm before adding cleanup here.
    logging.info("✅ Done!")
57+
58+
2859
if __name__ == "__main__":
2960
setup_logging()
3061
download_dataset()

pypi_scout/scripts/process_dataset.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import logging
2+
from pathlib import Path
23

34
import polars as pl
45
from dotenv import load_dotenv
56

6-
from pypi_scout.config import Config
7+
from pypi_scout.config import Config, StorageBackend
78
from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
89
from pypi_scout.data.reader import DataReader
10+
from pypi_scout.utils.blob_io import BlobIO
911
from pypi_scout.utils.logging import setup_logging
1012

1113

@@ -42,20 +44,59 @@ def clean_descriptions(df):
4244
return df
4345

4446

45-
def store_processed_dataset(df, processed_dataset_path):
47+
def store_processed_dataset_local(df: pl.DataFrame, processed_dataset_path: Path):
    """Write the processed dataset to a CSV file at *processed_dataset_path*."""
    logging.info("Storing the processed dataset...")
    df.write_csv(processed_dataset_path)
    logging.info("✅ Done!")
4951

5052

51-
def process_dataset():
52-
load_dotenv()
53-
config = Config()
53+
def store_processed_dataset_blob(df: pl.DataFrame, blob_io: BlobIO, blob_name: str):
    """Upload the processed dataset to blob storage under *blob_name*."""
    logging.info(f"Storing the processed dataset as {blob_name} in container '{blob_io.container_name}'...")
    blob_io.upload_csv(df, blob_name)
    logging.info("✅ Done!")
57+
58+
59+
def handle_for_local_backend(config: Config):
    """Process the raw dataset and write the result to the local data directory.

    Does nothing when the processed CSV is already present on disk.
    """
    processed_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME
    if processed_path.exists():
        logging.info(f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists! Skipping.")
        return

    df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
    if config.FRAC_DATA_TO_INCLUDE < 1.0:
        df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
    df = clean_descriptions(df)

    store_processed_dataset_local(df, processed_path)
70+
71+
72+
def handle_for_blob_backend(config: Config):
    """Process the raw dataset and upload the result to Azure Blob Storage.

    Skips all work when the processed dataset already exists in the container.
    """
    blob_io = BlobIO(
        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
        config.STORAGE_BACKEND_BLOB_KEY,
    )

    if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME):
        # Fixed copy-pasted message from the raw-dataset check: this guards
        # the *processed* dataset and nothing is downloaded here.
        logging.info(
            f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping."
        )
        return

    # NOTE(review): the raw dataset is read from the *local* data directory
    # even in BLOB mode — this relies on the download step leaving its local
    # staging copy behind; confirm this is intended rather than fetching the
    # raw blob here.
    df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
    if config.FRAC_DATA_TO_INCLUDE < 1.0:
        df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
    df = clean_descriptions(df)

    store_processed_dataset_blob(df, blob_io, config.PROCESSED_DATASET_CSV_NAME)
91+
92+
93+
def process_dataset():
    """Entry point: clean the raw dataset using the configured storage backend."""
    load_dotenv()
    config = Config()

    # Dispatch on the configured backend; anything other than LOCAL is BLOB.
    if config.STORAGE_BACKEND == StorageBackend.LOCAL:
        handler = handle_for_local_backend
    else:
        handler = handle_for_blob_backend
    handler(config)
59100

60101

61102
if __name__ == "__main__":

pypi_scout/utils/blob_io.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from io import BytesIO
2+
3+
import polars as pl
4+
from azure.storage.blob import BlobServiceClient
5+
6+
7+
class BlobIO:
    """Thin wrapper around the Azure Blob Storage SDK for CSV blobs.

    All operations target one container on one storage account, authenticated
    with an account key.
    """

    def __init__(self, account_name: str, container_name: str, account_key: str):
        self.account_name = account_name
        self.container_name = container_name
        self.account_key = account_key
        self.service_client = BlobServiceClient(
            account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key
        )
        self.container_client = self.service_client.get_container_client(container_name)

    def upload_csv(self, data_frame: pl.DataFrame, blob_name: str) -> None:
        """Serialize *data_frame* to CSV in memory and upload it as *blob_name*."""
        csv_buffer = BytesIO()
        data_frame.write_csv(csv_buffer)
        csv_buffer.seek(0)  # Reset buffer position to the beginning
        blob_client = self.container_client.get_blob_client(blob_name)
        blob_client.upload_blob(csv_buffer, overwrite=True)

    def upload_local_csv(self, local_file_path: str, blob_name: str) -> None:
        """Upload the file at *local_file_path* as *blob_name*, replacing any existing blob."""
        with open(local_file_path, "rb") as data:
            blob_client = self.container_client.get_blob_client(blob_name)
            blob_client.upload_blob(data, overwrite=True)

    def download_csv(self, blob_name: str) -> pl.DataFrame:
        """Download *blob_name* and parse it into a polars DataFrame.

        Bug fix: the original built a ``StringIO`` without ever importing it,
        which raised ``NameError`` at runtime. Reading the raw bytes into the
        already-imported ``BytesIO`` fixes that and skips the intermediate
        bytes -> str decode.
        """
        blob_client = self.container_client.get_blob_client(blob_name)
        download_stream = blob_client.download_blob()
        csv_buffer = BytesIO(download_stream.readall())
        return pl.read_csv(csv_buffer)

    def exists(self, blob_name: str) -> bool:
        """Return True if a blob named *blob_name* exists in the container."""
        blob_client = self.container_client.get_blob_client(blob_name)
        return blob_client.exists()

pypi_scout/utils/logging.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33

44
def setup_logging() -> None:
5+
logging.getLogger("azure").setLevel(logging.WARNING)
6+
57
logging.basicConfig(
68
level=logging.INFO,
79
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ fastapi = "^0.111.0"
2323
pydantic = "^2.7.4"
2424
uvicorn = "^0.30.1"
2525
gdown = "^5.2.0"
26+
azure-storage-blob = "^12.20.0"
2627

2728
[tool.poetry.group.dev.dependencies]
2829
pytest = "^7.2.0"

0 commit comments

Comments
 (0)