open-pack · getty708 · Jun 9, 2024 · Jun 9, 2024 · Jun 9, 2024 · Aug 11, 2024
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -18,8 +18,8 @@ jobs:
         os:
           - macos-latest
           - ubuntu-latest
-        python-version: [3.8, 3.9]
-        poetry-version: ["1.2.0"]
+        # Quote the versions: unquoted 3.10 is parsed by YAML as the float 3.1.
+        python-version: ["3.8", "3.10"]
+        poetry-version: ["1.8.3"]
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2

diff --git a/.gitignore b/.gitignore
@@ -197,7 +197,7 @@ target/
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
-# .python-version
+.python-version
 
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -345,9 +345,11 @@ $RECYCLE.BIN/
 # End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,emacs,vim,visualstudiocode,python,jupyternotebooks
 
 # ================
-#  Custom Setting 
+#  Custom Setting
 # ================
 data/
 !openpack_toolkit/data/
 !tests/data/
 *.csv
+wandb/
+outputs/
diff --git a/.secrets/.gitignore b/.secrets/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/openpack_toolkit/download/README.md b/openpack_toolkit/download/README.md
@@ -0,0 +1,41 @@
+# Python-based Data Downloader
+
+A shell-script-based data downloader is provided in [open-pack/openpack-dataset](https://github.com/open-pack/openpack-dataset/blob/main/docs/DOWNLOAD.md).
+However, it does not support downloading data from Google Drive and it is not easy to configure.
+This download tool provides another option to set up datasets in your environment.
+
+## How to use
+
+```bash
+poetry run python download_local.py
+```
+
+### Track data lineage with Weights & Biases
+
+(Optional) Create an artifact of the OpenPack Dataset on the cloud repository.
+
+```bash
+poetry run python create_wandb_artifact.py zenodo
+```
+
+Download the dataset to your local machine using WandB.
+
+```bash
+wandb login
+poetry run python download_local.py --use-wandb
+```
+
+### Workflow
+
+```mermaid
+graph LR
+    RepoZ[(Zenodo)]
+    RepoG[(GoogleDrive)]
+    LocalZipFolder[(`openpack/v.X.X.X/zip/`)]
+    LocalDsFolder[(`openpack/v.X.X.X/`)]
+    ProcDownload(Download)
+    ProcExtract(Extract)
+    RepoZ --> ProcDownload --> LocalZipFolder
+    RepoG --> ProcDownload --> LocalZipFolder
+    LocalZipFolder --> ProcExtract --> LocalDsFolder
+```
diff --git a/openpack_toolkit/download/__init__.py b/openpack_toolkit/download/__init__.py
diff --git a/openpack_toolkit/download/const.py b/openpack_toolkit/download/const.py
@@ -0,0 +1,44 @@
+# Constants shared by the dataset download tools.
+
+# Weights & Biases project names, job type and artifact type.
+WANDB_PROJECT_NAME_PUBLIC = "openpack-dataset"
+WANDB_PROJECT_NAME_LOCAL = "openpack-dataset-local"
+WANDB_JOB_TYPE_DOWNLOAD_DATASET = "download-dataset"
+WANDB_ARTIFACT_TYPE_DATASET = "dataset"
+
+# Artifact name templates; fill them in with str.format().
+OPENPACK_DATASET_NAME_ON_ZENODO_TEMPLATE = "openpack-{version}-zenodo"
+# NOTE(review): "GRDIVE" looks like a typo of "GDRIVE"; the name is kept because
+# other modules may import it.
+OPENPACK_DATASET_NAME_ON_GRDIVE_TEMPLATE = "openpack-{version}-gdrive"
+OPENPACK_DATASET_NAME_ON_LOCAL_TEMPLATE = "openpack-{version}-local-{hostname}"
+
+# NOTE(review): "OPENAPCK" looks like a typo of "OPENPACK"; the name is kept because
+# other modules may import it. The template already contains a leading "v", so
+# formatting it with a key of ZENODO_URLS (e.g. "v1.1.0") would yield "vv1.1.0".
+# TODO confirm which version format callers pass.
+OPENAPCK_ROOT_PATH_STR = "./openpack/v{version}"
+
+# Zenodo record URL for each dataset version.
+ZENODO_URLS = {
+    "v1.1.0": "https://zenodo.org/records/11059235",
+    "v1.0.0": "https://zenodo.org/records/8145223",
+}
+
+# Google Drive folder URL for each dataset version (only v1.0.0 is listed).
+GDRIVE_URLS = {
+    "v1.0.0": "https://drive.google.com/drive/folders/10hYJYkhPRgf-uTToUm5KR99EHkH2v9GB",
+}
+
+
+# User IDs in the dataset; download_local.py fetches one "<user_id>.zip" per entry.
+OPENPACK_USERS = (
+    "U0101",
+    "U0102",
+    "U0103",
+    "U0104",
+    "U0105",
+    "U0106",
+    "U0107",
+    "U0108",
+    "U0109",
+    "U0110",
+    "U0111",
+    "U0201",
+    "U0202",
+    "U0203",
+    "U0204",
+    "U0205",
+    "U0206",
+    "U0207",
+    "U0208",
+    "U0209",
+    "U0210",
+)
diff --git a/openpack_toolkit/download/download_local.py b/openpack_toolkit/download/download_local.py
@@ -0,0 +1,154 @@
+import socket
+import zipfile
+from pathlib import Path
+
+import click
+import requests
+import wandb
+from loguru import logger
+from tqdm import tqdm
+
+from tools.download.const import (
+    OPENPACK_DATASET_NAME_ON_LOCAL_TEMPLATE,
+    OPENPACK_DATASET_NAME_ON_ZENODO_TEMPLATE,
+    OPENPACK_USERS,
+    WANDB_ARTIFACT_TYPE_DATASET,
+    WANDB_JOB_TYPE_DOWNLOAD_DATASET,
+    WANDB_PROJECT_NAME_LOCAL,
+    WANDB_PROJECT_NAME_PUBLIC,
+    ZENODO_URLS,
+)
+
+# _DEFAULT_OUTPUT_DIR = "../../data/datasets/openpack/"
+_DEFAULT_OPENPACK_DIR = Path().cwd() / "data" / "datasets" / "openpack"
+
+
+def download_file_with_progress_bar(src_uri: str, dest_path: Path):
+    if dest_path.exists():
+        logger.warning(f"Zip file ({dest_path}) already exists. Skip downloading.")
+        return
+
+    logger.info(f"Download zip file from {src_uri} and save it to {dest_path}.")
+    response = requests.get(src_uri, stream=True)
+    total_size = int(response.headers.get("content-length", 0))
+
+    dest_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(dest_path, "wb") as f:
+        with tqdm(
+            desc=str(dest_path), total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
+        ) as bar:
+            for data in response.iter_content(chunk_size=1024):
+                size = f.write(data)
+                bar.update(size)
+    logger.info(f"Finish downloading zip file to {dest_path}.")
+
+
+def extract_zip(zip_file_path: Path, extract_path: Path):
+    logger.info(f"Extract {zip_file_path} to {extract_path}.")
+    extract_path.mkdir(parents=True, exist_ok=True)
+    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
+        zip_ref.extractall(extract_path)
+
+
+def download_from_zenodo(
+    version: str, user_id: str, zip_dir: Path, remote_artifact: wandb.Artifact = None
+):
+    """Download a zip file from Zenodo. If the wandb is active, download using the artifact."""
+    if remote_artifact is not None:
+        zip_path = remote_artifact.get_entry(f"{user_id}.zip").download(
+            root=zip_dir, skip_cache=True
+        )
+    else:
+        base_uri = ZENODO_URLS[version]
+        uri = f"{base_uri}/files/{user_id}.zip?download=1"
+        zip_path = zip_dir / f"{user_id}.zip"
+        download_file_with_progress_bar(uri, zip_path)
+    return zip_path
+
+
+def extract_zip_from_zenodo(zip_path: Path, user_dir: Path, local_artifact: wandb.Artifact = None):
+    """Extract a zip file downloaded from zenodo to the user_directory."""
+    user_id = user_dir.name
+    logger.info(f"Extract zip file for {user_id}.")
+    extract_zip(zip_path, user_dir)
+
+    # Log local data to WandB.
+    # Create artifact references for each data stream.
+    if local_artifact is not None:
+        logger.info(f"Log streams of {user_id} to WandB.")
+        for sensor_dir in user_dir.iterdir():
+            for stream_dir in sensor_dir.iterdir():
+                local_artifact.add_reference(
+                    f"file://{stream_dir.absolute()}",
+                    name=f"{user_id}/{sensor_dir.name}/{stream_dir.name}",
+                )
+
+
+@click.command()
+@click.option(
+    "-v",
+    "--version",
+    type=click.Choice(ZENODO_URLS.keys()),
+    default="v1.1.0",
+    help="Version of the dataset to download.",
+)
+@click.option(
+    "-o",
+    "--openpack-dir",
+    type=click.Path(exists=True),
+    default=_DEFAULT_OPENPACK_DIR,
+    help="a root directory to download datasets.",
+)
+@click.option(
+    "--use-wandb",
+    "--wandb",
+    is_flag=True,
+    show_default=True,
+    default=True,
+    help="Log artifact with wandb.",
+)
+@click.option("--debug", is_flag=True, show_default=True, default=False, help="Run in debug mode.")
+def main(version: str, openpack_dir: Path, use_wandb: bool = True, debug: bool = False):
+    # Init wandb.
+    if use_wandb is not None:
+        project_name = WANDB_PROJECT_NAME_LOCAL
+        if debug:
+            project_name += "-debug"
+        wandb_run = wandb.init(project=project_name, job_type=WANDB_JOB_TYPE_DOWNLOAD_DATASET)
+        remote_artifact = wandb_run.use_artifact(
+            f"{WANDB_PROJECT_NAME_PUBLIC}/{OPENPACK_DATASET_NAME_ON_ZENODO_TEMPLATE}:latest".format(
+                version=version
+            )
+        )
+        local_artifact = wandb.Artifact(
+            name=OPENPACK_DATASET_NAME_ON_LOCAL_TEMPLATE.format(
+                version=version, hostname=socket.gethostname()
+            ),
+            type=WANDB_ARTIFACT_TYPE_DATASET,
+            description=f"OpenPack Dataset ({version}) on {socket.gethostname()}",
+        )
+    else:
+        wandb_run, remote_artifact, local_artifact = None, None, None
+
+    # Init dataset directories.
+    openpack_dir = Path(openpack_dir)
+    zip_dir = openpack_dir / version / "zip" / "zenodo"
+    zip_dir.mkdir(parents=True, exist_ok=True)
+    users = OPENPACK_USERS
+    if debug:
+        users = users[:2]
+
+    # Download and extract data for the first two users.
+    for user_id in users:
+        zip_path = download_from_zenodo(version, user_id, openpack_dir, remote_artifact)
+
+        user_dir = openpack_dir / user_id
+        extract_zip_from_zenodo(zip_path, user_dir, local_artifact=local_artifact)
+
+    # Save the artifact to W&B
+    wandb_run.log_artifact(local_artifact)
+    wandb_run.finish()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/openpack_toolkit/download/metadata/README.md b/openpack_toolkit/download/metadata/README.md
@@ -0,0 +1,3 @@
+# [Admin Tool] Dataset Metadata Generator
+
+This is a tool for OpenPack admin members to generate dataset metadata for creating a release.
diff --git a/openpack_toolkit/download/metadata/__init__.py b/openpack_toolkit/download/metadata/__init__.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# [Admin Tool] Dataset Metadata Generator

		This is a tool for OpenPack admin members to generate dataset metadata for creating a release.