Commit d77543b

remove start script, add blob backend
1 parent d83bb6a commit d77543b

16 files changed: +174 −144 lines

.DS_Store

6 KB (binary file not shown)

Dockerfile

+1 −5

@@ -22,14 +22,10 @@ RUN poetry install --no-interaction --no-ansi --no-root --no-dev && \
 # Copy Python code to the Docker image
 COPY pypi_scout /code/pypi_scout/

-# Copy the start script and make executable
-COPY start.sh /start.sh
-RUN chmod +x /start.sh
-
 # Make empty data directory
 RUN mkdir -p /code/data

 ENV PYTHONPATH=/code

 # Use the script as the entrypoint
-ENTRYPOINT ["/start.sh"]
+CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"]

DockerfileCPU

+1 −5

@@ -23,14 +23,10 @@ RUN pip install --no-cache-dir -r requirements-cpu.txt
 # Copy the rest of the application code
 COPY pypi_scout /code/pypi_scout/

-# Copy the start script and make it executable
-COPY start.sh /start.sh
-RUN chmod +x /start.sh
-
 # Make empty data directory
 RUN mkdir -p /code/data

 ENV PYTHONPATH=/code

 # Use the script as the entrypoint
-ENTRYPOINT ["/start.sh"]
+CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"]

docker-compose.yml

+2 −3

@@ -5,11 +5,10 @@ services:
     build:
       context: .
       dockerfile: Dockerfile
-    command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000
     ports:
       - "8000:8000"
     volumes:
-      - ./data:/data
+      - ./data:/code/data
     env_file:
       - .env

@@ -18,7 +17,7 @@ services:
       context: ./frontend
       dockerfile: Dockerfile
       args:
-        NEXT_PUBLIC_API_URL: http://localhost:8000
+        NEXT_PUBLIC_API_URL: http://localhost:8000/api
     ports:
       - "3000:3000"
     depends_on:

pypi_scout/api/main.py

+1 −1

@@ -28,7 +28,7 @@
     allow_headers=["*"],
 )

-df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+df = load_dataset(config)

 model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)

pypi_scout/api/utils.py

+29 −4

@@ -1,12 +1,37 @@
 import logging
-from pathlib import Path
+import sys

 import polars as pl

+from pypi_scout.config import Config, StorageBackend
+from pypi_scout.utils.blob_io import BlobIO
+
+
+def load_dataset(config: Config) -> pl.DataFrame:
+    dataset_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME
+
+    if dataset_path.exists():
+        logging.info(f"Found local dataset. Reading dataset from `{dataset_path}`...")
+        df = pl.read_csv(dataset_path)
+
+    elif config.STORAGE_BACKEND == StorageBackend.BLOB:
+        logging.info(
+            f"Downloading `{config.PROCESSED_DATASET_CSV_NAME}` from container `{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`..."
+        )
+        blob_io = BlobIO(
+            config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
+            config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
+            config.STORAGE_BACKEND_BLOB_KEY,
+        )
+        df = blob_io.download_csv(config.PROCESSED_DATASET_CSV_NAME)
+        logging.info("Finished downloading.")
+
+    else:
+        logging.error(
+            f"Dataset {dataset_path} not found, and config.STORAGE_BACKEND is not `BLOB`, so the dataset can't be downloaded from Azure. Terminating."
+        )
+        sys.exit(1)

-def load_dataset(path_to_dataset: Path):
-    logging.info("Loading the processed dataset...")
-    df = pl.read_csv(path_to_dataset)
     logging.info(f"Finished loading the processed dataset. Number of rows: {len(df):,}")
     logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}")
     logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}")

pypi_scout/config.py

+8 −2

@@ -27,6 +27,9 @@ class Config:
     # Dimension of the vector embeddings produced by the model. Should match the output of the model above.
     EMBEDDINGS_DIMENSION = 768

+    # Boolean to overwrite existing files. e.g. re-download the raw dataset, upload processed dataset to blob, etc.
+    OVERWRITE: bool = True
+
     # Directory where dataset files are stored.
     DATA_DIR: Path = Path("data")

@@ -53,7 +56,10 @@ class Config:
     WEIGHT_SIMILARITY = 0.8
     WEIGHT_WEEKLY_DOWNLOADS = 0.2

-    # Storage backend
+    # Storage backend configuration. Can be either StorageBackend.LOCAL or StorageBackend.BLOB.
+    # If StorageBackend.BLOB, the processed dataset will be uploaded to Blob, and the backend API
+    # will read the data from there, rather than from a local data directory. In order to use StorageBackend.BLOB,
+    # the other `STORAGE_BACKEND_BLOB_` variables need to be set as environment variables.
     STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL
     STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None
     STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None

@@ -76,4 +82,4 @@ def __post_init__(self) -> None:
                 self.STORAGE_BACKEND_BLOB_KEY,
             ]
         ):
-            raise OSError("One or more BLOB storage environment variables are missing!")
+            raise OSError("One or more BLOB storage environment variables are missing!")  # noqa: TRY003
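The StorageBackend enum itself and the way Config picks up the STORAGE_BACKEND_BLOB_* values from the environment fall outside these hunks. A hypothetical sketch of that wiring, assuming the fields are populated with os.getenv and validated in __post_init__; the field names match the diff, but the mechanism shown here is an assumption, not the actual file:

# Hypothetical sketch (not part of this commit): env-var wiring behind Config.
import os
from dataclasses import dataclass, field
from enum import Enum


class StorageBackend(Enum):
    LOCAL = "LOCAL"
    BLOB = "BLOB"


@dataclass
class Config:
    # Backend selection; STORAGE_BACKEND=BLOB in the environment switches to Azure Blob storage.
    STORAGE_BACKEND: StorageBackend = field(
        default_factory=lambda: StorageBackend(os.getenv("STORAGE_BACKEND", "LOCAL"))
    )
    STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = field(
        default_factory=lambda: os.getenv("STORAGE_BACKEND_BLOB_ACCOUNT_NAME")
    )
    STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = field(
        default_factory=lambda: os.getenv("STORAGE_BACKEND_BLOB_CONTAINER_NAME")
    )
    STORAGE_BACKEND_BLOB_KEY: str | None = field(
        default_factory=lambda: os.getenv("STORAGE_BACKEND_BLOB_KEY")
    )

    def __post_init__(self) -> None:
        # Fail fast when the blob backend is selected but credentials are incomplete.
        if self.STORAGE_BACKEND == StorageBackend.BLOB and not all(
            [
                self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
                self.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
                self.STORAGE_BACKEND_BLOB_KEY,
            ]
        ):
            raise OSError("One or more BLOB storage environment variables are missing!")

With wiring like this, switching to the blob backend amounts to exporting STORAGE_BACKEND=BLOB plus the three STORAGE_BACKEND_BLOB_* variables before running the setup script.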

pypi_scout/data/reader.py renamed to pypi_scout/data/raw_data_reader.py

+1 −1

@@ -5,7 +5,7 @@


 @dataclass
-class DataReader:
+class RawDataReader:
     """
     A class for reading and processing data from a raw PyPI dataset.
     """

pypi_scout/scripts/download_dataset.py

−61

This file was deleted.

pypi_scout/scripts/download_raw_dataset.py

+35
@@ -0,0 +1,35 @@
+import logging
+
+import gdown
+from dotenv import load_dotenv
+
+from pypi_scout.config import Config
+from pypi_scout.utils.logging import setup_logging
+
+
+def download_raw_dataset():
+    """
+    Downloads the dataset from a Google Drive link using the gdown library.
+    """
+    load_dotenv()
+    config = Config()
+
+    target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
+    if target_path.exists():
+        if not config.OVERWRITE:
+            logging.info(f"🔹 Raw dataset {target_path} from Google Drive already exists! Skipping download.")
+            return
+        else:
+            logging.info(
+                f"⤵️ Raw dataset {target_path} from Google Drive exists, but config.OVERWRITE is `true`. Overwriting..."
+            )
+
+    logging.info(f"⬇️ Downloading raw dataset from Google Drive to {target_path}...")
+    url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
+    gdown.download(url, str(target_path), quiet=False)
+    logging.info("✅ Done!")
+
+
+if __name__ == "__main__":
+    setup_logging()
+    download_raw_dataset()
pypi_scout/scripts/process_dataset.py renamed to pypi_scout/scripts/process_raw_dataset.py
@@ -1,19 +1,17 @@
 import logging
-from pathlib import Path

 import polars as pl
 from dotenv import load_dotenv

-from pypi_scout.config import Config, StorageBackend
+from pypi_scout.config import Config
 from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
-from pypi_scout.data.reader import DataReader
-from pypi_scout.utils.blob_io import BlobIO
+from pypi_scout.data.raw_data_reader import RawDataReader
 from pypi_scout.utils.logging import setup_logging


 def read_raw_dataset(path_to_raw_dataset):
     logging.info("📂 Reading the raw dataset...")
-    df = DataReader(path_to_raw_dataset).read()
+    df = RawDataReader(path_to_raw_dataset).read()
     logging.info("📊 Number of rows in the raw dataset: %s", len(df))
     logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}")
     logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}")

@@ -44,61 +42,22 @@ def clean_descriptions(df):
     return df


-def store_processed_dataset_local(df: pl.DataFrame, processed_dataset_path: Path):
+def store_processed_dataset(df, processed_dataset_path):
     logging.info("Storing the processed dataset...")
     df.write_csv(processed_dataset_path)
     logging.info("✅ Done!")


-def store_processed_dataset_blob(df: pl.DataFrame, blob_io: BlobIO, blob_name: str):
-    logging.info(f"Storing the processed dataset as {blob_name} in container '{blob_io.container_name}'...")
-    blob_io.upload_csv(df, blob_name)
-    logging.info("✅ Done!")
-
-
-def handle_for_local_backend(config: Config):
-    if (config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME).exists():
-        logging.info(f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists! Skipping.")
-        return
-
-    df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
-    if config.FRAC_DATA_TO_INCLUDE < 1.0:
-        df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
-    df = clean_descriptions(df)
-
-    store_processed_dataset_local(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
-
-
-def handle_for_blob_backend(config: Config):
-    blob_io = BlobIO(
-        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
-        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
-        config.STORAGE_BACKEND_BLOB_KEY,
-    )
-
-    if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME):
-        logging.info(
-            f"✔️ Raw dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download."
-        )
-        return
-
+def process_raw_dataset():
+    load_dotenv()
+    config = Config()
     df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
     if config.FRAC_DATA_TO_INCLUDE < 1.0:
         df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
     df = clean_descriptions(df)
-
-    store_processed_dataset_blob(df, blob_io, config.PROCESSED_DATASET_CSV_NAME)
-
-
-def process_dataset():
-    load_dotenv()
-    config = Config()
-    if config.STORAGE_BACKEND == StorageBackend.LOCAL:
-        handle_for_local_backend(config)
-    else:
-        handle_for_blob_backend(config)
+    store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)


 if __name__ == "__main__":
     setup_logging()
-    process_dataset()
+    process_raw_dataset()

pypi_scout/scripts/setup.py

+16 −4

@@ -1,18 +1,30 @@
 import argparse
+import logging

-from pypi_scout.scripts.download_dataset import download_dataset
-from pypi_scout.scripts.process_dataset import process_dataset
+from pypi_scout.scripts.download_raw_dataset import download_raw_dataset
+from pypi_scout.scripts.process_raw_dataset import process_raw_dataset
 from pypi_scout.scripts.setup_pinecone import setup_pinecone
+from pypi_scout.scripts.upload_processed_dataset import upload_processed_dataset
 from pypi_scout.scripts.upsert_data import upsert_data
 from pypi_scout.utils.logging import setup_logging


 def main(no_upsert):
     setup_logging()
+
+    logging.info("\n\nSETTING UP PINECONE -------------\n")
     setup_pinecone()
-    download_dataset()
-    process_dataset()
+
+    logging.info("\n\nDOWNLOADING RAW DATASET -------------\n")
+    download_raw_dataset()
+
+    logging.info("\n\nPROCESSING RAW DATASET -------------\n")
+    process_raw_dataset()
+
+    logging.info("\n\nUPLOADING PROCESSED DATASET -------------\n")
+    upload_processed_dataset()
     if not no_upsert:
+        logging.info("\n\nUPSERTING DATA TO PINECONE -------------\n")
         upsert_data()
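upload_processed_dataset, imported and called above, is another new script in this commit that is not included in the excerpt. Given that the blob-upload branch was removed from the processing script and that setup.py now runs this step between processing and upserting, a plausible sketch follows; it is assumed, not the actual file, and reuses Config, StorageBackend, and BlobIO from the diffs:

# Hypothetical sketch of pypi_scout/scripts/upload_processed_dataset.py (not shown in this excerpt).
import logging

import polars as pl
from dotenv import load_dotenv

from pypi_scout.config import Config, StorageBackend
from pypi_scout.utils.blob_io import BlobIO
from pypi_scout.utils.logging import setup_logging


def upload_processed_dataset():
    load_dotenv()
    config = Config()

    if config.STORAGE_BACKEND != StorageBackend.BLOB:
        logging.info("🔹 Storage backend is not BLOB. Skipping upload.")
        return

    blob_io = BlobIO(
        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
        config.STORAGE_BACKEND_BLOB_KEY,
    )

    if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME) and not config.OVERWRITE:
        logging.info("🔹 Processed dataset already exists in the container. Skipping upload.")
        return

    # Read the locally processed dataset and push it to the blob container.
    df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
    logging.info(f"⬆️ Uploading {config.PROCESSED_DATASET_CSV_NAME} to container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'...")
    blob_io.upload_csv(df, config.PROCESSED_DATASET_CSV_NAME)
    logging.info("✅ Done!")


if __name__ == "__main__":
    setup_logging()
    upload_processed_dataset()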

pypi_scout/scripts/setup_pinecone.py

+1 −1

@@ -33,7 +33,7 @@ def setup_pinecone():
         logging.info("✅ Pinecone index created successfully.")
     except PineconeApiException as e:
         if e.status == 409:
-            logging.warning(f"✔️ Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
+            logging.warning(f"🔹 Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
         else:
             logging.exception("❌ An error occurred while creating the Pinecone index.")
