
Commit d77543b
remove start script, add blob backend
1 parent d83bb6a

16 files changed: +174 / -144 lines

.DS_Store (6 KB)

Binary file not shown.

Dockerfile

Lines changed: 1 addition & 5 deletions
@@ -22,14 +22,10 @@ RUN poetry install --no-interaction --no-ansi --no-root --no-dev && \
 # Copy Python code to the Docker image
 COPY pypi_scout /code/pypi_scout/
 
-# Copy the start script and make executable
-COPY start.sh /start.sh
-RUN chmod +x /start.sh
-
 # Make empty data directory
 RUN mkdir -p /code/data
 
 ENV PYTHONPATH=/code
 
 # Use the script as the entrypoint
-ENTRYPOINT ["/start.sh"]
+CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"]

DockerfileCPU

Lines changed: 1 addition & 5 deletions
@@ -23,14 +23,10 @@ RUN pip install --no-cache-dir -r requirements-cpu.txt
 # Copy the rest of the application code
 COPY pypi_scout /code/pypi_scout/
 
-# Copy the start script and make it executable
-COPY start.sh /start.sh
-RUN chmod +x /start.sh
-
 # Make empty data directory
 RUN mkdir -p /code/data
 
 ENV PYTHONPATH=/code
 
 # Use the script as the entrypoint
-ENTRYPOINT ["/start.sh"]
+CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"]

docker-compose.yml

Lines changed: 2 additions & 3 deletions
@@ -5,11 +5,10 @@ services:
     build:
       context: .
       dockerfile: Dockerfile
-    command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000
     ports:
       - "8000:8000"
     volumes:
-      - ./data:/data
+      - ./data:/code/data
     env_file:
       - .env
 
@@ -18,7 +17,7 @@ services:
       context: ./frontend
       dockerfile: Dockerfile
       args:
-        NEXT_PUBLIC_API_URL: http://localhost:8000
+        NEXT_PUBLIC_API_URL: http://localhost:8000/api
     ports:
       - "3000:3000"
     depends_on:
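Two fixes here: the command override is gone now that the Dockerfile's CMD starts uvicorn itself, and the volume mounts to /code/data, matching the RUN mkdir -p /code/data in the Dockerfile and the relative DATA_DIR = Path("data") in the config. The frontend's NEXT_PUBLIC_API_URL gaining an /api suffix suggests the backend serves its routes under an /api prefix. A minimal FastAPI sketch of that layout; the actual router setup in pypi_scout/api/main.py is not part of this diff, and the endpoint name is hypothetical:

# Sketch, assuming the backend groups its routes under an /api prefix so that
# NEXT_PUBLIC_API_URL=http://localhost:8000/api resolves correctly.
from fastapi import APIRouter, FastAPI

app = FastAPI()
router = APIRouter(prefix="/api")


@router.get("/search")  # hypothetical endpoint name, for illustration only
def search(query: str) -> dict:
    return {"query": query}


app.include_router(router)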

pypi_scout/api/main.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
     allow_headers=["*"],
 )
 
-df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+df = load_dataset(config)
 
 model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)
 

pypi_scout/api/utils.py

Lines changed: 29 additions & 4 deletions
@@ -1,12 +1,37 @@
 import logging
-from pathlib import Path
+import sys
 
 import polars as pl
 
+from pypi_scout.config import Config, StorageBackend
+from pypi_scout.utils.blob_io import BlobIO
+
+
+def load_dataset(config: Config) -> pl.DataFrame:
+    dataset_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME
+
+    if dataset_path.exists():
+        logging.info(f"Found local dataset. Reading dataset from `{dataset_path}`...")
+        df = pl.read_csv(dataset_path)
+
+    elif config.STORAGE_BACKEND == StorageBackend.BLOB:
+        logging.info(
+            f"Downloading `{config.PROCESSED_DATASET_CSV_NAME}` from container `{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`..."
+        )
+        blob_io = BlobIO(
+            config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
+            config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
+            config.STORAGE_BACKEND_BLOB_KEY,
+        )
+        df = blob_io.download_csv(config.PROCESSED_DATASET_CSV_NAME)
+        logging.info("Finished downloading.")
+
+    else:
+        logging.error(
+            f"Dataset {dataset_path} not found, and config.StorageBackend is not `BLOB` so can't download the dataset from Azure. Terminating."
+        )
+        sys.exit(1)
 
-def load_dataset(path_to_dataset: Path):
-    logging.info("Loading the processed dataset...")
-    df = pl.read_csv(path_to_dataset)
     logging.info(f"Finished loading the processed dataset. Number of rows: {len(df):,}")
     logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}")
    logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}")

pypi_scout/config.py

Lines changed: 8 additions & 2 deletions
@@ -27,6 +27,9 @@ class Config:
     # Dimension of the vector embeddings produced by the model. Should match the output of the model above.
     EMBEDDINGS_DIMENSION = 768
 
+    # Boolean to overwrite existing files. e.g. re-download the raw dataset, upload processed dataset to blob, etc.
+    OVERWRITE: bool = True
+
     # Directory where dataset files are stored.
     DATA_DIR: Path = Path("data")
 
@@ -53,7 +56,10 @@ class Config:
     WEIGHT_SIMILARITY = 0.8
     WEIGHT_WEEKLY_DOWNLOADS = 0.2
 
-    # Storage backend
+    # Storage backend configuration. Can be either StorageBackend.LOCAL or StorageBackend.BLOB.
+    # If StorageBackend.BLOB, the processed dataset will be uploaded to Blob, and the backend API
+    # will read the data from there, rather than from a local data directory. In order to use StorageBackend.BLOB,
+    # the other `STORAGE_BACKEND_BLOB_` variables need to be set as environment variables.
     STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL
     STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None
     STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None
@@ -76,4 +82,4 @@ def __post_init__(self) -> None:
                 self.STORAGE_BACKEND_BLOB_KEY,
             ]
         ):
-            raise OSError("One or more BLOB storage environment variables are missing!")
+            raise OSError("One or more BLOB storage environment variables are missing!")  # noqa: TRY003
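The new utils.py imports StorageBackend from pypi_scout.config, but its definition predates this diff. A sketch of the likely shape; the environment parsing shown here is an assumption rather than code from the repository:

# Sketch of pieces of pypi_scout/config.py that this diff does not show.
import os
from enum import Enum


class StorageBackend(Enum):
    LOCAL = "LOCAL"
    BLOB = "BLOB"


# Hypothetical helper: the diff only shows the field defaults and the
# __post_init__ check that all STORAGE_BACKEND_BLOB_* variables are set.
def storage_backend_from_env() -> StorageBackend:
    return StorageBackend(os.environ.get("STORAGE_BACKEND", "LOCAL"))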

pypi_scout/data/reader.py renamed to pypi_scout/data/raw_data_reader.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 
 
 @dataclass
-class DataReader:
+class RawDataReader:
     """
     A class for reading and processing data from a raw PyPI dataset.
     """

pypi_scout/scripts/download_dataset.py

Lines changed: 0 additions & 61 deletions
This file was deleted and replaced by the new download script below.

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+import logging
+
+import gdown
+from dotenv import load_dotenv
+
+from pypi_scout.config import Config
+from pypi_scout.utils.logging import setup_logging
+
+
+def download_raw_dataset():
+    """
+    Downloads the dataset from a Google Drive link using the gdown library.
+    """
+    load_dotenv()
+    config = Config()
+
+    target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
+    if target_path.exists():
+        if not config.OVERWRITE:
+            logging.info(f"🔹 Raw dataset {target_path} from Google Drive already exists! Skipping download.")
+            return
+        else:
+            logging.info(
+                f"⤵️ Raw dataset {target_path} from Google Drive exists, but config.OVERWRITE is `true`. Overwriting..."
+            )
+
+    logging.info(f"⬇️ Downloading raw dataset from Google Drive to {target_path}...")
+    url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
+    gdown.download(url, str(target_path), quiet=False)
+    logging.info("✅ Done!")
+
+
+if __name__ == "__main__":
+    setup_logging()
+    download_raw_dataset()
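Re-running the script is safe: it returns early when the file already exists and OVERWRITE is False, and logs before overwriting otherwise. A hypothetical programmatic invocation, with the module path assumed from the deleted file's location:

# Hypothetical: run the downloader from Python instead of the command line.
# The module path is assumed; only the function name appears in the diff.
from pypi_scout.scripts.download_dataset import download_raw_dataset
from pypi_scout.utils.logging import setup_logging

setup_logging()
download_raw_dataset()  # skips or overwrites based on Config.OVERWRITE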
