@@ -1,19 +1,17 @@
 import logging
-from pathlib import Path
 
 import polars as pl
 from dotenv import load_dotenv
 
-from pypi_scout.config import Config, StorageBackend
+from pypi_scout.config import Config
 from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
-from pypi_scout.data.reader import DataReader
-from pypi_scout.utils.blob_io import BlobIO
+from pypi_scout.data.raw_data_reader import RawDataReader
 from pypi_scout.utils.logging import setup_logging
 
 
 def read_raw_dataset(path_to_raw_dataset):
     logging.info("📂 Reading the raw dataset...")
-    df = DataReader(path_to_raw_dataset).read()
+    df = RawDataReader(path_to_raw_dataset).read()
     logging.info("📊 Number of rows in the raw dataset: %s", len(df))
     logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}")
     logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}")
@@ -44,61 +42,22 @@ def clean_descriptions(df):
     return df
 
 
-def store_processed_dataset_local(df: pl.DataFrame, processed_dataset_path: Path):
+def store_processed_dataset(df, processed_dataset_path):
     logging.info("Storing the processed dataset...")
     df.write_csv(processed_dataset_path)
     logging.info("✅ Done!")
 
 
-def store_processed_dataset_blob(df: pl.DataFrame, blob_io: BlobIO, blob_name: str):
-    logging.info(f"Storing the processed dataset as {blob_name} in container '{blob_io.container_name}'...")
-    blob_io.upload_csv(df, blob_name)
-    logging.info("✅ Done!")
-
-
-def handle_for_local_backend(config: Config):
-    if (config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME).exists():
-        logging.info(f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists! Skipping.")
-        return
-
-    df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
-    if config.FRAC_DATA_TO_INCLUDE < 1.0:
-        df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
-    df = clean_descriptions(df)
-
-    store_processed_dataset_local(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
-
-
-def handle_for_blob_backend(config: Config):
-    blob_io = BlobIO(
-        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
-        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
-        config.STORAGE_BACKEND_BLOB_KEY,
-    )
-
-    if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME):
-        logging.info(
-            f"✔️ Raw dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download."
-        )
-        return
-
+def process_raw_dataset():
+    load_dotenv()
+    config = Config()
     df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
     if config.FRAC_DATA_TO_INCLUDE < 1.0:
         df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
     df = clean_descriptions(df)
-
-    store_processed_dataset_blob(df, blob_io, config.PROCESSED_DATASET_CSV_NAME)
-
-
-def process_dataset():
-    load_dotenv()
-    config = Config()
-    if config.STORAGE_BACKEND == StorageBackend.LOCAL:
-        handle_for_local_backend(config)
-    else:
-        handle_for_blob_backend(config)
+    store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
 
 
 if __name__ == "__main__":
     setup_logging()
-    process_dataset()
+    process_raw_dataset()
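
The bodies of `filter_top_packages` and `clean_descriptions` sit in the part of the file this diff does not touch, so only their call sites are visible above. For readers skimming the diff, here is a minimal sketch of what a fraction-based filter on `weekly_downloads` could look like in polars; the function name and the `FRAC_DATA_TO_INCLUDE` fraction come from the diff, but the implementation below is an assumption, not the repository's actual code.

```python
import polars as pl


def filter_top_packages(df: pl.DataFrame, frac_data_to_include: float) -> pl.DataFrame:
    # Hypothetical sketch: keep only the most-downloaded share of packages.
    # The real implementation in pypi_scout may differ.
    n_rows_to_keep = int(len(df) * frac_data_to_include)
    return df.sort("weekly_downloads", descending=True).head(n_rows_to_keep)
```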