Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ jobs:
os:
- macos-latest
- ubuntu-latest
python-version: [3.8, 3.9]
poetry-version: ["1.2.0"]
python-version: ["3.8", "3.10"]
poetry-version: ["1.8.3"]
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
Expand Down
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ target/
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
Expand Down Expand Up @@ -345,9 +345,11 @@ $RECYCLE.BIN/
# End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,emacs,vim,visualstudiocode,python,jupyternotebooks

# ================
# Custom Setting
# Custom Setting
# ================
data/
!openpack_toolkit/data/
!tests/data/
*.csv
wandb/
outputs/
2 changes: 2 additions & 0 deletions .secrets/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
41 changes: 41 additions & 0 deletions openpack_toolkit/download/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Python-based Data Downloader

A shell-script-based data downloader is provided in [open-pack/openpack-dataset](https://github.com/open-pack/openpack-dataset/blob/main/docs/DOWNLOAD.md).
However, it does not support downloading data from Google Drive and it is not easy to configure.
This download tool provides another option to set up datasets in your environment.

## How to use

```bash
poetry run python download.py
```

### Track data lineage with Weights & Biases

(Optional) Create an artifact of the OpenPack Dataset on the cloud repository.

```bash
poetry run python create_wandb_artifact.py zenodo
```

Download the dataset to your local machine using WandB.

```bash
wandb login
poetry run python download_local.py --use-wandb
```

### Workflow

```mermaid
graph LR
RepoZ[(Zenodo)]
RepoG[(GoogleDrive)]
LocalZipFolder[(`openpack/v.X.X.X/zip/`)]
LocalDsFolder[(`openpack/v.X.X.X/`)]
ProcDownload(Download)
ProcExtract(Extract)
RepoZ --> ProcDownload --> LocalZipFolder
RepoG --> ProcDownload --> LocalZipFolder
LocalZipFolder --> ProcExtract --> LocalDsFolder
```
Empty file.
44 changes: 44 additions & 0 deletions openpack_toolkit/download/const.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# ---------------------------------------------------------------------------
# Weights & Biases identifiers used by the download tools.
# ---------------------------------------------------------------------------
WANDB_PROJECT_NAME_PUBLIC = "openpack-dataset"
WANDB_PROJECT_NAME_LOCAL = "openpack-dataset-local"
WANDB_JOB_TYPE_DOWNLOAD_DATASET = "download-dataset"
WANDB_ARTIFACT_TYPE_DATASET = "dataset"

# ---------------------------------------------------------------------------
# Artifact-name templates. Fill the placeholders with ``str.format``.
# ---------------------------------------------------------------------------
OPENPACK_DATASET_NAME_ON_ZENODO_TEMPLATE = "openpack-{version}-zenodo"
OPENPACK_DATASET_NAME_ON_GDRIVE_TEMPLATE = "openpack-{version}-gdrive"
# Backward-compatible alias: the name was originally published with a typo.
OPENPACK_DATASET_NAME_ON_GRDIVE_TEMPLATE = OPENPACK_DATASET_NAME_ON_GDRIVE_TEMPLATE
OPENPACK_DATASET_NAME_ON_LOCAL_TEMPLATE = "openpack-{version}-local-{hostname}"

# NOTE(review): the keys of ZENODO_URLS / GDRIVE_URLS already start with "v",
# so formatting this template with one of them yields "./openpack/vv1.1.0".
# Confirm which version format callers pass before changing the template.
OPENPACK_ROOT_PATH_STR = "./openpack/v{version}"
# Backward-compatible alias: the name was originally published with a typo.
OPENAPCK_ROOT_PATH_STR = OPENPACK_ROOT_PATH_STR

# Dataset version -> Zenodo record URL.
ZENODO_URLS = {
    "v1.1.0": "https://zenodo.org/records/11059235",
    "v1.0.0": "https://zenodo.org/records/8145223",
}

# Dataset version -> Google Drive folder URL.
GDRIVE_URLS = {
    "v1.0.0": "https://drive.google.com/drive/folders/10hYJYkhPRgf-uTToUm5KR99EHkH2v9GB",
}


# User IDs for which one ``<user_id>.zip`` archive is downloaded.
OPENPACK_USERS = (
    "U0101",
    "U0102",
    "U0103",
    "U0104",
    "U0105",
    "U0106",
    "U0107",
    "U0108",
    "U0109",
    "U0110",
    "U0111",
    "U0201",
    "U0202",
    "U0203",
    "U0204",
    "U0205",
    "U0206",
    "U0207",
    "U0208",
    "U0209",
    "U0210",
)
154 changes: 154 additions & 0 deletions openpack_toolkit/download/download_local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import socket
import zipfile
from pathlib import Path

import click
import requests
import wandb
from loguru import logger
from tqdm import tqdm

from tools.download.const import (
OPENPACK_DATASET_NAME_ON_LOCAL_TEMPLATE,
OPENPACK_DATASET_NAME_ON_ZENODO_TEMPLATE,
OPENPACK_USERS,
WANDB_ARTIFACT_TYPE_DATASET,
WANDB_JOB_TYPE_DOWNLOAD_DATASET,
WANDB_PROJECT_NAME_LOCAL,
WANDB_PROJECT_NAME_PUBLIC,
ZENODO_URLS,
)

# Default dataset root: ``data/datasets/openpack`` below the directory the
# script is launched from (resolved once, at import time).
_DEFAULT_OPENPACK_DIR = Path.cwd() / "data" / "datasets" / "openpack"


def download_file_with_progress_bar(src_uri: str, dest_path: Path):
    """Stream ``src_uri`` to ``dest_path`` while showing a tqdm progress bar.

    If ``dest_path`` already exists the download is skipped. The payload is
    first written to a sibling ``<name>.part`` file and only renamed to
    ``dest_path`` once the transfer has completed, so an interrupted download
    never leaves a truncated file that a later run would mistake for a
    finished one.

    Args:
        src_uri: URL of the file to download.
        dest_path: Where the downloaded file is stored. Missing parent
            directories are created.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if dest_path.exists():
        logger.warning(f"Zip file ({dest_path}) already exists. Skip downloading.")
        return

    logger.info(f"Download zip file from {src_uri} and save it to {dest_path}.")
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = dest_path.with_name(dest_path.name + ".part")
    chunk_size = 1024 * 1024  # 1 MiB keeps per-chunk overhead low for large archives.

    # The timeout bounds connection setup and each read, not the whole transfer.
    with requests.get(src_uri, stream=True, timeout=60) as response:
        # Fail loudly instead of saving an HTML error page as ``*.zip``.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))

        try:
            with open(tmp_path, "wb") as f, tqdm(
                desc=str(dest_path),
                # ``None`` lets tqdm show a plain counter when the size is unknown.
                total=total_size or None,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in response.iter_content(chunk_size=chunk_size):
                    bar.update(f.write(data))
        except BaseException:
            # Covers Ctrl-C as well: drop the partial file, then re-raise.
            tmp_path.unlink(missing_ok=True)
            raise

    tmp_path.replace(dest_path)
    logger.info(f"Finish downloading zip file to {dest_path}.")


def extract_zip(zip_file_path: Path, extract_path: Path):
    """Unpack every member of ``zip_file_path`` into ``extract_path``.

    The destination directory, including missing parents, is created before
    the archive is opened.
    """
    logger.info(f"Extract {zip_file_path} to {extract_path}.")
    extract_path.mkdir(parents=True, exist_ok=True)
    archive = zipfile.ZipFile(zip_file_path, "r")
    with archive:
        archive.extractall(path=extract_path)


def download_from_zenodo(
    version: str, user_id: str, zip_dir: Path, remote_artifact: wandb.Artifact = None
) -> Path:
    """Fetch ``<user_id>.zip`` for one user and return its local path.

    Args:
        version: Dataset version; must be a key of ``ZENODO_URLS`` when
            ``remote_artifact`` is not given.
        user_id: User whose archive is downloaded.
        zip_dir: Directory in which the zip file is stored.
        remote_artifact: When given, the archive is downloaded through this
            WandB artifact instead of directly from Zenodo.

    Returns:
        Path of the downloaded zip file, always as a ``Path`` object
        regardless of which branch produced it.
    """
    file_name = f"{user_id}.zip"

    if remote_artifact is not None:
        entry = remote_artifact.get_entry(file_name)
        # Normalise whatever path object WandB hands back so that both
        # branches return the same type.
        return Path(entry.download(root=zip_dir, skip_cache=True))

    base_uri = ZENODO_URLS[version]
    uri = f"{base_uri}/files/{file_name}?download=1"
    zip_path = Path(zip_dir) / file_name
    download_file_with_progress_bar(uri, zip_path)
    return zip_path


def extract_zip_from_zenodo(zip_path: Path, user_dir: Path, local_artifact: wandb.Artifact = None):
    """Extract a zip file downloaded from Zenodo into ``user_dir``.

    Args:
        zip_path: Archive to extract.
        user_dir: Destination directory; its last path component is used as
            the user ID in log messages and artifact entry names.
        local_artifact: When given, every entry found two levels below
            ``user_dir`` is added to this artifact as a ``file://`` reference
            named ``<user_id>/<first level>/<second level>``.
    """
    user_id = user_dir.name
    logger.info(f"Extract zip file for {user_id}.")
    extract_zip(zip_path, user_dir)

    if local_artifact is None:
        return

    # Log local data to WandB: one artifact reference per data stream.
    logger.info(f"Log streams of {user_id} to WandB.")
    for sensor_dir in user_dir.iterdir():
        if not sensor_dir.is_dir():
            # A plain file at the top level of the archive has no children;
            # calling ``iterdir()`` on it would raise ``NotADirectoryError``.
            continue
        for stream_dir in sensor_dir.iterdir():
            local_artifact.add_reference(
                f"file://{stream_dir.absolute()}",
                name=f"{user_id}/{sensor_dir.name}/{stream_dir.name}",
            )


@click.command()
@click.option(
    "-v",
    "--version",
    type=click.Choice(list(ZENODO_URLS)),
    default="v1.1.0",
    help="Version of the dataset to download.",
)
@click.option(
    "-o",
    "--openpack-dir",
    # The directory is created on demand below, so it need not exist yet.
    type=click.Path(file_okay=False),
    default=_DEFAULT_OPENPACK_DIR,
    help="a root directory to download datasets.",
)
@click.option(
    # On/off pair: a bare ``is_flag`` option that defaults to True cannot be
    # switched off from the command line.
    "--use-wandb/--no-use-wandb",
    "--wandb/--no-wandb",
    is_flag=True,
    show_default=True,
    default=True,
    help="Log artifact with wandb.",
)
@click.option("--debug", is_flag=True, show_default=True, default=False, help="Run in debug mode.")
def main(version: str, openpack_dir: Path, use_wandb: bool = True, debug: bool = False):
    """Download the OpenPack dataset from Zenodo and extract it locally.
    \f

    Zip files are stored in ``<openpack_dir>/<version>/zip/zenodo/`` and each
    user's archive is extracted to ``<openpack_dir>/<version>/<user_id>/``.

    Args:
        version: Dataset version, one of the keys of ``ZENODO_URLS``.
        openpack_dir: Root directory for all dataset versions.
        use_wandb: When true, download through the public WandB artifact and
            log the extracted streams as a new local artifact.
        debug: When true, process only the first two users and, if WandB is
            enabled, log to a ``-debug`` project.
    """
    # Init wandb.
    if use_wandb:
        project_name = WANDB_PROJECT_NAME_LOCAL
        if debug:
            project_name += "-debug"
        wandb_run = wandb.init(project=project_name, job_type=WANDB_JOB_TYPE_DOWNLOAD_DATASET)

        remote_name = OPENPACK_DATASET_NAME_ON_ZENODO_TEMPLATE.format(version=version)
        remote_artifact = wandb_run.use_artifact(
            f"{WANDB_PROJECT_NAME_PUBLIC}/{remote_name}:latest"
        )

        hostname = socket.gethostname()
        local_artifact = wandb.Artifact(
            name=OPENPACK_DATASET_NAME_ON_LOCAL_TEMPLATE.format(
                version=version, hostname=hostname
            ),
            type=WANDB_ARTIFACT_TYPE_DATASET,
            description=f"OpenPack Dataset ({version}) on {hostname}",
        )
    else:
        wandb_run, remote_artifact, local_artifact = None, None, None

    # Init dataset directories. Everything lives below a per-version folder so
    # that different dataset versions do not overwrite each other.
    version_dir = Path(openpack_dir) / version
    zip_dir = version_dir / "zip" / "zenodo"
    zip_dir.mkdir(parents=True, exist_ok=True)

    # Debug mode only processes the first two users.
    users = OPENPACK_USERS[:2] if debug else OPENPACK_USERS

    # Download and extract data for each selected user.
    for user_id in users:
        zip_path = download_from_zenodo(version, user_id, zip_dir, remote_artifact)

        user_dir = version_dir / user_id
        extract_zip_from_zenodo(zip_path, user_dir, local_artifact=local_artifact)

    # Save the artifact to W&B (only when a run was started above).
    if wandb_run is not None:
        wandb_run.log_artifact(local_artifact)
        wandb_run.finish()


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions openpack_toolkit/download/metadata/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# [Admin Tool] Dataset Metadata Generator

This is a tool for OpenPack admin members to generate dataset metadata for creating a release.
Empty file.
Loading