Commit 3caf608

Check Parquet files in together-cli, supply filetype in header (#73)
* Check Parquet files in together-cli, supply filetype in header
* Fix typing annotations
* Post-rebase fixes
* Bump minor version due to the change of behavior
* Remove files.py
* Fix typing errors
* Reduce the diff
* Address review feedback
1 parent 2e8fea4 commit 3caf608

File tree: 8 files changed (+281 -29)

.pre-commit-config.yaml (+1 -1)

@@ -25,5 +25,5 @@ repos:
       hooks:
         - id: mypy
           args: [--strict]
-          additional_dependencies: [types-requests, types-tqdm, types-tabulate, types-click, types-filelock, types-Pillow, pydantic, aiohttp]
+          additional_dependencies: [types-requests, types-tqdm, types-tabulate, types-click, types-filelock, types-Pillow, pyarrow-stubs, pydantic, aiohttp]
           exclude: ^tests/

poetry.lock (+150 -10)

Some generated files are not rendered by default.

pyproject.toml (+17 -3)

@@ -1,12 +1,20 @@
 [build-system]
-requires = ["poetry"]
+requires = [
+    "poetry",
+    # Starting with NumPy 1.25, NumPy is (by default) compatible as far
+    # back as oldest-supported-numpy was (customizable with a
+    # NPY_TARGET_VERSION define). For older Python versions (where NumPy
+    # 1.25 is not yet available) continue using oldest-supported-numpy.
+    "oldest-supported-numpy>=0.14; python_version<'3.9'",
+    "numpy>=1.25; python_version>='3.9'",
+]
 build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.0.1"
+version = "1.1.0"
 authors = [
-    "Together AI <[email protected]>"
+    "Together AI <[email protected]>"
 ]
 description = "Python client for Together's Cloud Platform!"
 readme = "README.md"

@@ -31,6 +39,11 @@ filelock = "^3.13.1"
 eval-type-backport = "^0.1.3"
 click = "^8.1.7"
 pillow = "^10.3.0"
+pyarrow = ">=10.0.1"
+numpy = [
+    { version = ">=1.23.5", python = "<3.12" },
+    { version = ">=1.26.0", python = ">=3.12" },
+]
 
 [tool.poetry.group.quality]
 optional = true

@@ -42,6 +55,7 @@ types-tqdm = "^4.65.0.0"
 types-tabulate = "^0.9.0.3"
 pre-commit = "3.5.0"
 types-requests = "^2.31.0.20240218"
+pyarrow-stubs = "^10.0.1.7"
 mypy = "^1.9.0"
 
 [tool.poetry.group.tests]
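
The requires entries and the numpy constraint above both rely on PEP 508 environment markers to select a dependency per Python version. A quick illustration of how such markers evaluate (not part of the commit; uses the packaging library):

from packaging.markers import Marker

# The same marker syntax used in the pyproject.toml entries above.
marker = Marker("python_version < '3.9'")
assert marker.evaluate({"python_version": "3.8"}) is True    # selects oldest-supported-numpy
assert marker.evaluate({"python_version": "3.11"}) is False  # selects numpy>=1.25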

src/together/constants.py (+3)

@@ -26,3 +26,6 @@
 
 # maximum number of GB sized files we support finetuning for
 MAX_FILE_SIZE_GB = 4.9
+
+# expected columns for Parquet files
+PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"]
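
For orientation, a minimal sketch of producing a Parquet file with exactly these columns (not part of the commit; the token IDs are made up):

import pyarrow as pa
from pyarrow import parquet

# One tokenized sample per row; each column holds a list of token IDs.
table = pa.table(
    {
        "input_ids": [[101, 2023, 102], [101, 2003, 102]],
        "attention_mask": [[1, 1, 1], [1, 1, 1]],
        "labels": [[101, 2023, 102], [101, 2003, 102]],
    }
)
parquet.write_table(table, "train.parquet")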

src/together/filemanager.py (+24 -4)

@@ -25,7 +25,13 @@
     FileTypeError,
 )
 from together.together_response import TogetherResponse
-from together.types import FilePurpose, FileResponse, TogetherClient, TogetherRequest
+from together.types import (
+    FilePurpose,
+    FileResponse,
+    FileType,
+    TogetherClient,
+    TogetherRequest,
+)
 
 
 def chmod_and_replace(src: Path, dst: Path) -> None:

@@ -260,12 +266,17 @@ def _redirect_error_handler(
             http_status=response.status_code,
         )
 
-    def redirect_policy(
-        self, url: str, file: Path, purpose: FilePurpose
+    def get_upload_url(
+        self,
+        url: str,
+        file: Path,
+        purpose: FilePurpose,
+        filetype: FileType,
     ) -> Tuple[str, str]:
         data = {
             "purpose": purpose.value,
             "file_name": file.name,
+            "file_type": filetype.value,
         }
 
         requestor = api_requestor.APIRequestor(

@@ -324,7 +335,16 @@ def upload(
 
         redirect_url = None
         if redirect:
-            redirect_url, file_id = self.redirect_policy(url, file, purpose)
+            if file.suffix == ".jsonl":
+                filetype = FileType.jsonl
+            elif file.suffix == ".parquet":
+                filetype = FileType.parquet
+            else:
+                raise FileTypeError(
+                    f"Unknown extension of file {file}. "
+                    "Only files with extensions .jsonl and .parquet are supported."
+                )
+            redirect_url, file_id = self.get_upload_url(url, file, purpose, filetype)
 
         file_size = os.stat(file.as_posix()).st_size
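
The practical effect in upload: the body sent when requesting an upload URL now names the file type next to the purpose and file name. A sketch with illustrative values:

# What get_upload_url now sends (example values, per the diff above):
data = {
    "purpose": "fine-tune",        # FilePurpose.FineTune.value
    "file_name": "train.parquet",  # file.name
    "file_type": "parquet",        # FileType.parquet.value, the new field
}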

src/together/types/__init__.py (+2)

@@ -18,6 +18,7 @@
     FilePurpose,
     FileRequest,
     FileResponse,
+    FileType,
 )
 from together.types.finetune import (
     FinetuneDownloadResult,

@@ -55,6 +56,7 @@
     "FileDeleteResponse",
     "FileObject",
     "FilePurpose",
+    "FileType",
     "ImageRequest",
     "ImageResponse",
     "ModelObject",

src/together/types/files.py (+6 -5)

@@ -15,6 +15,11 @@ class FilePurpose(str, Enum):
     FineTune = "fine-tune"
 
 
+class FileType(str, Enum):
+    jsonl = "jsonl"
+    parquet = "parquet"
+
+
 class FileRequest(BaseModel):
     """
     Files request type

@@ -43,21 +48,17 @@ class FileResponse(BaseModel):
     Files API response type
     """
 
-    # file id
     id: str
-    # object type
     object: Literal[ObjectType.File]
     # created timestamp
     created_at: int | None = None
-    # file purpose
+    type: FileType | None = None
     purpose: FilePurpose | None = None
-    # file-name
     filename: str | None = None
     # file byte size
     bytes: int | None = None
     # JSONL line count
     line_count: int | None = Field(None, alias="LineCount")
-    # is processed
     processed: bool | None = Field(None, alias="Processed")
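Because FileType subclasses str, its members compare equal to their raw values and serialize directly into the "file_type" request field. A small illustration (not from the commit):

from together.types import FileType

assert FileType.parquet == "parquet"        # str-enum members equal their values
assert FileType("jsonl") is FileType.jsonl  # lookup by value round-trips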

src/together/utils/files.py (+78 -6)

@@ -3,9 +3,17 @@
 import json
 import os
 from pathlib import Path
+from traceback import format_exc
 from typing import Any, Dict
 
-from together.constants import MAX_FILE_SIZE_GB, MIN_SAMPLES, NUM_BYTES_IN_GB
+from pyarrow import ArrowInvalid, parquet
+
+from together.constants import (
+    MAX_FILE_SIZE_GB,
+    MIN_SAMPLES,
+    NUM_BYTES_IN_GB,
+    PARQUET_EXPECTED_COLUMNS,
+)
 
 
 def check_file(

@@ -50,6 +58,25 @@ def check_file(
     else:
         report_dict["file_size"] = file_size
 
+    if file.suffix == ".jsonl":
+        report_dict["filetype"] = "jsonl"
+        data_report_dict = _check_jsonl(file)
+    elif file.suffix == ".parquet":
+        report_dict["filetype"] = "parquet"
+        data_report_dict = _check_parquet(file)
+    else:
+        report_dict["filetype"] = (
+            f"Unknown extension of file {file}. "
+            "Only files with extensions .jsonl and .parquet are supported."
+        )
+        report_dict["is_check_passed"] = False
+
+    report_dict.update(data_report_dict)
+    return report_dict
+
+
+def _check_jsonl(file: Path) -> Dict[str, Any]:
+    report_dict: Dict[str, Any] = {}
     # Check that the file is UTF-8 encoded. If not report where the error occurs.
     try:
         with file.open(encoding="utf-8") as f:

@@ -71,7 +98,7 @@
             if not isinstance(json_line, dict):
                 report_dict["line_type"] = False
                 report_dict["message"] = (
-                    f"Error parsing file. Invalid format on line {idx+1} of the input file. "
+                    f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
                     'Example of valid json: {"text": "my sample string"}. '
                 )

@@ -80,7 +107,7 @@
             if "text" not in json_line.keys():
                 report_dict["text_field"] = False
                 report_dict["message"] = (
-                    f"Missing 'text' field was found on line {idx+1} of the input file. "
+                    f"Missing 'text' field was found on line {idx + 1} of the input file. "
                     "Expected format: {'text': 'my sample string'}. "
                 )
                 report_dict["is_check_passed"] = False

@@ -89,7 +116,7 @@
             if not isinstance(json_line["text"], str):
                 report_dict["key_value"] = False
                 report_dict["message"] = (
-                    f'Invalid value type for "text" key on line {idx+1}. '
+                    f'Invalid value type for "text" key on line {idx + 1}. '
                     f'Expected string. Found {type(json_line["text"])}.'
                 )

@@ -99,7 +126,7 @@
         if idx + 1 < MIN_SAMPLES:
             report_dict["min_samples"] = False
             report_dict["message"] = (
-                f"Processing {file} resulted in only {idx+1} samples. "
+                f"Processing {file} resulted in only {idx + 1} samples. "
                 f"Our minimum is {MIN_SAMPLES} samples. "
             )
             report_dict["is_check_passed"] = False

@@ -118,7 +145,7 @@
             )
         else:
             report_dict["message"] = (
-                f"Error parsing json payload. Unexpected format on line {idx+1}."
+                f"Error parsing json payload. Unexpected format on line {idx + 1}."
             )
             report_dict["is_check_passed"] = False

@@ -128,5 +155,50 @@
         report_dict["line_type"] = True
     if report_dict["key_value"] is not False:
         report_dict["key_value"] = True
+    return report_dict
+
+
+def _check_parquet(file: Path) -> Dict[str, Any]:
+    report_dict: Dict[str, Any] = {}
+
+    try:
+        table = parquet.read_table(str(file), memory_map=True)
+    except ArrowInvalid:
+        report_dict["load_parquet"] = (
+            f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
+            f"Exception trace:\n{format_exc()}"
+        )
+        report_dict["is_check_passed"] = False
+        return report_dict
+
+    column_names = table.schema.names
+    if "input_ids" not in column_names:
+        report_dict["load_parquet"] = (
+            f"Parquet file {file} does not contain the `input_ids` column."
+        )
+        report_dict["is_check_passed"] = False
+        return report_dict
+
+    for column_name in column_names:
+        if column_name not in PARQUET_EXPECTED_COLUMNS:
+            report_dict["load_parquet"] = (
+                f"Parquet file {file} contains an unexpected column {column_name}. "
+                f"Only columns {PARQUET_EXPECTED_COLUMNS} are supported."
+            )
+            report_dict["is_check_passed"] = False
+            return report_dict
+
+    num_samples = len(table)
+    if num_samples < MIN_SAMPLES:
+        report_dict["min_samples"] = (
+            f"Processing {file} resulted in only {num_samples} samples. "
+            f"Our minimum is {MIN_SAMPLES} samples. "
+        )
+        report_dict["is_check_passed"] = False
+        return report_dict
+    else:
+        report_dict["num_samples"] = num_samples
+
+    report_dict["is_check_passed"] = True
 
     return report_dict
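
Putting it together, a sketch of exercising the new validation path (assumes a train.parquet with at least MIN_SAMPLES rows exists, and that check_file is called with a pathlib.Path, as its body suggests):

from pathlib import Path

from together.utils.files import check_file

report = check_file(Path("train.parquet"))
print(report["filetype"])         # "parquet"
print(report.get("num_samples"))  # row count, set only when the checks pass
print(report["is_check_passed"])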
