Skip to content

Commit d83bb6a

Browse files
committed
wip; download blob backend
1 parent d9cc11e commit d83bb6a

File tree

7 files changed

+256
-9
lines changed

7 files changed

+256
-9
lines changed

poetry.lock

Lines changed: 109 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pypi_scout/config.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
import os
22
from dataclasses import dataclass, field
3+
from enum import Enum
34
from pathlib import Path
45

56

7+
class StorageBackend(Enum):
    """Where dataset CSV files are stored.

    LOCAL keeps files on the local filesystem under the configured data
    directory; BLOB keeps them in an Azure Blob Storage container.
    """

    LOCAL = "LOCAL"
    BLOB = "BLOB"
612
@dataclass
713
class Config:
814
# Name of the Pinecone index used for storing vector representations of the package descriptions.
@@ -47,6 +53,27 @@ class Config:
4753
WEIGHT_SIMILARITY = 0.8
4854
WEIGHT_WEEKLY_DOWNLOADS = 0.2
4955

56+
# Storage backend
57+
STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL
58+
STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None
59+
STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None
60+
STORAGE_BACKEND_BLOB_KEY: str | None = None
61+
5062
def __post_init__(self) -> None:
    """Validate environment-driven configuration after dataclass init.

    Reads ``STORAGE_BACKEND`` from the environment; when it is ``"BLOB"``,
    switches the backend and pulls the three blob-storage settings from the
    environment as well.

    Raises:
        OSError: if ``PINECONE_TOKEN`` is not set, or if the BLOB backend is
            selected but any required blob-storage variable is missing.
    """
    if not self.PINECONE_TOKEN:
        raise OSError("PINECONE_TOKEN not found in environment variables")  # noqa: TRY003

    if os.getenv("STORAGE_BACKEND") == "BLOB":
        self.STORAGE_BACKEND = StorageBackend.BLOB
        self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME = os.getenv("STORAGE_BACKEND_BLOB_ACCOUNT_NAME")
        self.STORAGE_BACKEND_BLOB_CONTAINER_NAME = os.getenv("STORAGE_BACKEND_BLOB_CONTAINER_NAME")
        self.STORAGE_BACKEND_BLOB_KEY = os.getenv("STORAGE_BACKEND_BLOB_KEY")

        # Name the missing variables explicitly so the operator does not have
        # to guess which of the three settings is absent.
        missing = [
            name
            for name, value in [
                ("STORAGE_BACKEND_BLOB_ACCOUNT_NAME", self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME),
                ("STORAGE_BACKEND_BLOB_CONTAINER_NAME", self.STORAGE_BACKEND_BLOB_CONTAINER_NAME),
                ("STORAGE_BACKEND_BLOB_KEY", self.STORAGE_BACKEND_BLOB_KEY),
            ]
            if not value
        ]
        if missing:
            raise OSError(f"Missing BLOB storage environment variables: {', '.join(missing)}")  # noqa: TRY003

pypi_scout/scripts/download_dataset.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
import gdown
44
from dotenv import load_dotenv
55

6-
from pypi_scout.config import Config
6+
from pypi_scout.config import Config, StorageBackend
7+
from pypi_scout.utils.blob_io import BlobIO
78
from pypi_scout.utils.logging import setup_logging
89

910

@@ -14,6 +15,13 @@ def download_dataset():
1415
load_dotenv()
1516
config = Config()
1617

18+
if config.STORAGE_BACKEND == StorageBackend.LOCAL:
19+
handle_for_local_backend(config)
20+
else:
21+
handle_for_blob_backend(config)
22+
23+
24+
def handle_for_local_backend(config: Config):
1725
target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
1826
if target_path.exists():
1927
logging.info(f"✔️ Raw dataset {target_path} from Google Drive already exists! Skipping download.")
@@ -25,6 +33,29 @@ def download_dataset():
2533
logging.info("✅ Done!")
2634

2735

36+
def handle_for_blob_backend(config: Config):
    """Download the raw dataset from Google Drive and upload it to Azure Blob Storage.

    Skips all work when the blob already exists in the configured container.
    The dataset is staged in a local file under ``config.DATA_DIR`` and then
    uploaded.
    """
    blob_io = BlobIO(
        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
        config.STORAGE_BACKEND_BLOB_KEY,
    )

    if blob_io.exists(config.RAW_DATASET_CSV_NAME):
        logging.info(
            f"✔️ Raw dataset {config.RAW_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download."
        )
        return

    # Make sure the staging directory exists before gdown tries to write into
    # it; gdown does not create parent directories itself.
    config.DATA_DIR.mkdir(parents=True, exist_ok=True)
    temp_target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
    logging.info("⬇️ Downloading raw dataset from Google Drive to temporary file...")
    url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
    gdown.download(url, str(temp_target_path), quiet=False)

    logging.info("Downloading done, now uploading to Blob...")
    blob_io.upload_local_csv(temp_target_path, config.RAW_DATASET_CSV_NAME)
    # NOTE(review): the local staging file is intentionally left on disk —
    # the processing step appears to read the raw CSV from DATA_DIR even in
    # BLOB mode; confirm before adding cleanup here.
    logging.info("✅ Done!")
57+
58+
2859
if __name__ == "__main__":
2960
setup_logging()
3061
download_dataset()

pypi_scout/scripts/process_dataset.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import logging
2+
from pathlib import Path
23

34
import polars as pl
45
from dotenv import load_dotenv
56

6-
from pypi_scout.config import Config
7+
from pypi_scout.config import Config, StorageBackend
78
from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
89
from pypi_scout.data.reader import DataReader
10+
from pypi_scout.utils.blob_io import BlobIO
911
from pypi_scout.utils.logging import setup_logging
1012

1113

@@ -42,20 +44,59 @@ def clean_descriptions(df):
4244
return df
4345

4446

45-
def store_processed_dataset(df, processed_dataset_path):
47+
def store_processed_dataset_local(df: pl.DataFrame, processed_dataset_path: Path):
    """Write the processed dataset to a CSV file at *processed_dataset_path*."""
    logging.info("Storing the processed dataset...")
    df.write_csv(processed_dataset_path)
    logging.info("✅ Done!")
4951

5052

51-
def process_dataset():
52-
load_dotenv()
53-
config = Config()
53+
def store_processed_dataset_blob(df: pl.DataFrame, blob_io: BlobIO, blob_name: str):
    """Upload the processed dataset to blob storage under *blob_name*."""
    logging.info(f"Storing the processed dataset as {blob_name} in container '{blob_io.container_name}'...")
    blob_io.upload_csv(df, blob_name)
    logging.info("✅ Done!")
57+
58+
59+
def handle_for_local_backend(config: Config):
    """Process the raw dataset and write the result to the local data directory.

    Does nothing when the processed CSV is already present on disk.
    """
    processed_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME
    if processed_path.exists():
        logging.info(f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists! Skipping.")
        return

    df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
    if config.FRAC_DATA_TO_INCLUDE < 1.0:
        df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
    df = clean_descriptions(df)

    store_processed_dataset_local(df, processed_path)
70+
71+
72+
def handle_for_blob_backend(config: Config):
    """Process the raw dataset and upload the result to Azure Blob Storage.

    Skips all work when the processed dataset already exists in the container.
    """
    blob_io = BlobIO(
        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
        config.STORAGE_BACKEND_BLOB_KEY,
    )

    if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME):
        # Fixed copy-pasted message from the raw-dataset check: this guards
        # the *processed* dataset and nothing is downloaded here.
        logging.info(
            f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping."
        )
        return

    # NOTE(review): the raw dataset is read from the *local* data directory
    # even in BLOB mode — this relies on the download step leaving its local
    # staging copy behind; confirm this is intended rather than fetching the
    # raw blob here.
    df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
    if config.FRAC_DATA_TO_INCLUDE < 1.0:
        df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
    df = clean_descriptions(df)

    store_processed_dataset_blob(df, blob_io, config.PROCESSED_DATASET_CSV_NAME)
91+
92+
93+
def process_dataset():
    """Entry point: clean the raw dataset using the configured storage backend."""
    load_dotenv()
    config = Config()

    # Dispatch on the configured backend; anything other than LOCAL is BLOB.
    if config.STORAGE_BACKEND == StorageBackend.LOCAL:
        handler = handle_for_local_backend
    else:
        handler = handle_for_blob_backend
    handler(config)
59100

60101

61102
if __name__ == "__main__":

pypi_scout/utils/blob_io.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from io import BytesIO
2+
3+
import polars as pl
4+
from azure.storage.blob import BlobServiceClient
5+
6+
7+
class BlobIO:
    """Thin wrapper around the Azure Blob Storage SDK for CSV blobs.

    All operations target one container on one storage account, authenticated
    with an account key.
    """

    def __init__(self, account_name: str, container_name: str, account_key: str):
        self.account_name = account_name
        self.container_name = container_name
        self.account_key = account_key
        self.service_client = BlobServiceClient(
            account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key
        )
        self.container_client = self.service_client.get_container_client(container_name)

    def upload_csv(self, data_frame: pl.DataFrame, blob_name: str) -> None:
        """Serialize *data_frame* to CSV in memory and upload it as *blob_name*."""
        csv_buffer = BytesIO()
        data_frame.write_csv(csv_buffer)
        csv_buffer.seek(0)  # Reset buffer position to the beginning
        blob_client = self.container_client.get_blob_client(blob_name)
        blob_client.upload_blob(csv_buffer, overwrite=True)

    def upload_local_csv(self, local_file_path: str, blob_name: str) -> None:
        """Upload the file at *local_file_path* as *blob_name*, replacing any existing blob."""
        with open(local_file_path, "rb") as data:
            blob_client = self.container_client.get_blob_client(blob_name)
            blob_client.upload_blob(data, overwrite=True)

    def download_csv(self, blob_name: str) -> pl.DataFrame:
        """Download *blob_name* and parse it into a polars DataFrame.

        Bug fix: the original built a ``StringIO`` without ever importing it,
        which raised ``NameError`` at runtime. Reading the raw bytes into the
        already-imported ``BytesIO`` fixes that and skips the intermediate
        bytes -> str decode.
        """
        blob_client = self.container_client.get_blob_client(blob_name)
        download_stream = blob_client.download_blob()
        csv_buffer = BytesIO(download_stream.readall())
        return pl.read_csv(csv_buffer)

    def exists(self, blob_name: str) -> bool:
        """Return True if a blob named *blob_name* exists in the container."""
        blob_client = self.container_client.get_blob_client(blob_name)
        return blob_client.exists()

pypi_scout/utils/logging.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33

44
def setup_logging() -> None:
5+
logging.getLogger("azure").setLevel(logging.WARNING)
6+
57
logging.basicConfig(
68
level=logging.INFO,
79
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ fastapi = "^0.111.0"
2323
pydantic = "^2.7.4"
2424
uvicorn = "^0.30.1"
2525
gdown = "^5.2.0"
26+
azure-storage-blob = "^12.20.0"
2627

2728
[tool.poetry.group.dev.dependencies]
2829
pytest = "^7.2.0"

0 commit comments

Comments
 (0)