Merge pull request #617 from Labelbox/ms/assign-data-splits

msokoloff1 · web-flow · commit bcb23d8dfcf6 · 2022-07-01T17:31:59.000-04:00
assign data row split
diff --git a/README.md b/README.md
@@ -85,7 +85,7 @@ client = Client( endpoint = "<local deployment>")
 client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="http://localhost:8080/graphql")
 
 # Staging
-client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="https://staging-api.labelbox.com/graphql")
+client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="https://api.lb-stage.xyz/graphql")
 ```
 
 ## Contribution
@@ -122,5 +122,5 @@ make test-prod # with an optional flag: PATH_TO_TEST=tests/integration/...etc LA
 make -B {build|test-staging|test-prod}
 ```
 
-6. Testing against Delegated Access will be skipped unless the local env contains the key: 
-DA_GCP_LABELBOX_API_KEY. These tests will be included when run against a PR. If you would like to test it manually, please reach out to the Devops team for information on the key.
+6. Testing against Delegated Access will be skipped unless the local env contains the key:
+DA_GCP_LABELBOX_API_KEY. These tests will be included when run against a PR. If you would like to test it manually, please reach out to the Devops team for information on the key.
diff --git a/labelbox/__init__.py b/labelbox/__init__.py
@@ -21,7 +21,7 @@
 from labelbox.schema.role import Role, ProjectRole
 from labelbox.schema.invite import Invite, InviteLimit
 from labelbox.schema.data_row_metadata import DataRowMetadataOntology
-from labelbox.schema.model_run import ModelRun
+from labelbox.schema.model_run import ModelRun, DataSplit
 from labelbox.schema.benchmark import Benchmark
 from labelbox.schema.iam_integration import IAMIntegration
 from labelbox.schema.resource_tag import ResourceTag
diff --git a/labelbox/schema/model_run.py b/labelbox/schema/model_run.py
@@ -1,10 +1,12 @@
+# type: ignore
 from typing import TYPE_CHECKING, Dict, Iterable, Union, List, Optional, Any
 from pathlib import Path
 import os
 import time
 import logging
 import requests
 import ndjson
+from enum import Enum
 
 from labelbox.pagination import PaginatedCollection
 from labelbox.orm.query import results_query_part
@@ -17,13 +19,27 @@
 logger = logging.getLogger(__name__)
 
 
+class DataSplit(Enum):
+    """Data split that a model run data row can belong to.
+
+    Accepted by `ModelRun.assign_data_rows_to_split` (as a member or as its
+    string value) and used as the enum type of `ModelRunDataRow.data_split`.
+    """
+    TRAINING = "TRAINING"
+    TEST = "TEST"
+    VALIDATION = "VALIDATION"
+    # Rejected with a ValueError by `ModelRun.assign_data_rows_to_split`.
+    UNASSIGNED = "UNASSIGNED"
+
+
 class ModelRun(DbObject):
     name = Field.String("name")
     updated_at = Field.DateTime("updated_at")
     created_at = Field.DateTime("created_at")
     created_by_id = Field.String("created_by_id", "createdBy")
     model_id = Field.String("model_id")
 
+    class Status(Enum):
+        """Status values accepted by `ModelRun.update_status`.
+
+        `update_status` takes either a member of this enum or the matching
+        string and raises a ValueError for anything else.
+        """
+        EXPORTING_DATA = "EXPORTING_DATA"
+        PREPARING_DATA = "PREPARING_DATA"
+        TRAINING_MODEL = "TRAINING_MODEL"
+        COMPLETE = "COMPLETE"
+        FAILED = "FAILED"
+
     def upsert_labels(self, label_ids, timeout_seconds=60):
         """ Adds data rows and labels to a model run
         Args:
@@ -90,8 +106,9 @@ def upsert_data_rows(self, data_row_ids, timeout_seconds=60):
             }})['MEADataRowRegistrationTaskStatus'],
                                      timeout_seconds=timeout_seconds)
 
-    def _wait_until_done(self, status_fn, timeout_seconds=60, sleep_time=5):
+    def _wait_until_done(self, status_fn, timeout_seconds=120, sleep_time=5):
         # Do not use this function outside of the scope of upsert_data_rows or upsert_labels. It could change.
+        original_timeout = timeout_seconds
         while True:
             res = status_fn()
             if res['status'] == 'COMPLETE':
@@ -102,9 +119,8 @@ def _wait_until_done(self, status_fn, timeout_seconds=60, sleep_time=5):
             timeout_seconds -= sleep_time
             if timeout_seconds <= 0:
                 raise TimeoutError(
-                    f"Unable to complete import within {timeout_seconds} seconds."
+                    f"Unable to complete import within {original_timeout} seconds."
                 )
-
             time.sleep(sleep_time)
 
     def add_predictions(
@@ -161,7 +177,7 @@ def delete(self):
             deleteModelRuns(where: {ids: [$%s]})}""" % (ids_param, ids_param)
         self.client.execute(query_str, {ids_param: str(self.uid)})
 
-    def delete_model_run_data_rows(self, data_row_ids):
+    def delete_model_run_data_rows(self, data_row_ids: List[str]):
         """ Deletes data rows from model runs.
 
         Args:
@@ -180,22 +196,62 @@ def delete_model_run_data_rows(self, data_row_ids):
             data_row_ids_param: data_row_ids
         })
 
+    @experimental
+    def assign_data_rows_to_split(self,
+                                  data_row_ids: List[str],
+                                  split: Union[DataSplit, str],
+                                  timeout_seconds=120):
+        """ Assigns the given data rows to a data split for this model run.
+
+        Starts an assignment task and polls its status until it is done.
+
+        Args:
+            data_row_ids (list): ids of the data rows to assign.
+            split (DataSplit or str): target split, either a `DataSplit`
+                member or its string value. `UNASSIGNED` is not allowed.
+            timeout_seconds (float): Max waiting time, in seconds, passed to
+                `_wait_until_done`.
+        Returns:
+            Whatever `_wait_until_done` returns for the assignment task.
+        Raises:
+            ValueError: if `split` is `UNASSIGNED` or is not a valid split.
+            TimeoutError: raised by `_wait_until_done` when the task does not
+                finish in time.
+        """
+        split_value = split.value if isinstance(split, DataSplit) else split
+
+        if split_value == DataSplit.UNASSIGNED.value:
+            raise ValueError(
+                f"Cannot assign split value of `{DataSplit.UNASSIGNED.value}`.")
+
+        # Build a real list of member *values*: a lazy `filter` object would be
+        # consumed by the membership test below and would render as
+        # `<filter object at 0x...>` inside the error message.
+        valid_splits = [
+            member.value
+            for member in DataSplit
+            if member is not DataSplit.UNASSIGNED
+        ]
+
+        if split_value not in valid_splits:
+            raise ValueError(
+                f"`split` must be one of : `{valid_splits}`. Found : `{split}`")
+
+        task_id = self.client.execute(
+            """mutation assignDataSplitPyApi($modelRunId: ID!, $data: CreateAssignDataRowsToDataSplitTaskInput!){
+                  createAssignDataRowsToDataSplitTask(modelRun : {id: $modelRunId}, data: $data)}
+            """, {
+                'modelRunId': self.uid,
+                'data': {
+                    'assignments': [{
+                        'split': split_value,
+                        'dataRowIds': data_row_ids
+                    }]
+                }
+            },
+            experimental=True)['createAssignDataRowsToDataSplitTask']
+
+        status_query_str = """query assignDataRowsToDataSplitTaskStatusPyApi($id: ID!){
+            assignDataRowsToDataSplitTaskStatus(where: {id : $id}){status errorMessage}}
+            """
+
+        # Poll the task status until `_wait_until_done` reports completion.
+        return self._wait_until_done(lambda: self.client.execute(
+            status_query_str, {'id': task_id}, experimental=True)[
+                'assignDataRowsToDataSplitTaskStatus'],
+                                     timeout_seconds=timeout_seconds)
+
+
     @experimental
     def update_status(self,
-                      status: str,
+                      status: Union[str, "ModelRun.Status"],
                       metadata: Optional[Dict[str, str]] = None,
                       error_message: Optional[str] = None):
 
-        valid_statuses = [
-            "EXPORTING_DATA", "PREPARING_DATA", "TRAINING_MODEL", "COMPLETE",
-            "FAILED"
-        ]
-        if status not in valid_statuses:
+        status_value = status.value if isinstance(status,
+                                                  ModelRun.Status) else status
+        if status_value not in ModelRun.Status._member_names_:
             raise ValueError(
-                f"Status must be one of : `{valid_statuses}`. Found : `{status}`"
+                f"Status must be one of : `{ModelRun.Status._member_names_}`. Found : `{status_value}`"
             )
 
-        data: Dict[str, Any] = {'status': status}
+        data: Dict[str, Any] = {'status': status_value}
         if error_message:
             data['errorMessage'] = error_message
 
@@ -264,6 +320,7 @@ def export_labels(
 class ModelRunDataRow(DbObject):
     label_id = Field.String("label_id")
     model_run_id = Field.String("model_run_id")
+    data_split = Field.Enum(DataSplit, "data_split")
     data_row = Relationship.ToOne("DataRow", False, cache=True)
 
     def __init__(self, client, model_id, *args, **kwargs):
diff --git a/tests/integration/annotation_import/test_model_run.py b/tests/integration/annotation_import/test_model_run.py
@@ -2,6 +2,9 @@
 import os
 import pytest
 
+from collections import Counter
+from labelbox import DataSplit, ModelRun
+
 
 def test_model_run(client, configured_project_with_label, rand_gen):
     project, _, _, label = configured_project_with_label
@@ -119,3 +122,40 @@ def get_model_run_status():
     assert model_run_status['status'] == status
     assert model_run_status['metadata'] == {**metadata, **extra_metadata}
     assert model_run_status['errorMessage'] == errorMessage
+
+    status = ModelRun.Status.FAILED
+    model_run_with_model_run_data_rows.update_status(status, metadata,
+                                                     errorMessage)
+    model_run_status = get_model_run_status()
+    assert model_run_status['status'] == status.value
+
+    with pytest.raises(ValueError):
+        model_run_with_model_run_data_rows.update_status(
+            "INVALID", metadata, errorMessage)
+
+
+def test_model_run_split_assignment(model_run, dataset, image_url):
+    """Data rows can be assigned to every split except UNASSIGNED."""
+    n_data_rows = 10
+    payload = [{"row_data": image_url} for _ in range(n_data_rows)]
+    task = dataset.create_data_rows(payload)
+    data_row_ids = [row['id'] for row in task.result]
+
+    model_run.upsert_data_rows(data_row_ids)
+
+    # An unknown name and the UNASSIGNED split must both be rejected.
+    for rejected in ("INVALID SPLIT", DataSplit.UNASSIGNED):
+        with pytest.raises(ValueError):
+            model_run.assign_data_rows_to_split(data_row_ids, rejected)
+
+    # Exercise each assignable split in both accepted forms: the raw string
+    # and the DataSplit member.
+    assignable = ["TRAINING", "TEST", "VALIDATION"]
+    assignable += [s for s in DataSplit if s != DataSplit.UNASSIGNED]
+
+    for split in assignable:
+        model_run.assign_data_rows_to_split(data_row_ids, split)
+        counts = Counter(row.data_split.value
+                         for row in model_run.model_run_data_rows())
+        expected = split.value if isinstance(split, DataSplit) else split
+        assert counts[expected] == n_data_rows
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -145,7 +145,10 @@ def client(environ: str):
 
 @pytest.fixture(scope="session")
 def image_url(client):
-    return client.upload_data(requests.get(IMG_URL).content, sign=True)
+    # Uploads the bytes fetched from IMG_URL and returns what
+    # `client.upload_data` returns.
+    # NOTE(review): the content is uploaded with a JSON content type and a
+    # `.json` filename even though it is fetched from IMG_URL — confirm this
+    # is intentional.
+    return client.upload_data(requests.get(IMG_URL).content,
+                              content_type="application/json",
+                              filename="json_import.json",
+                              sign=True)
 
 
 @pytest.fixture
@@ -181,7 +184,7 @@ def iframe_url(environ) -> str:
     if environ in [Environ.PROD, Environ.LOCAL]:
         return 'https://editor.labelbox.com'
     elif environ == Environ.STAGING:
-        return 'https://staging.labelbox.dev/editor'
+        return 'https://editor.lb-stage.xyz'
 
 
 @pytest.fixture
@@ -290,7 +293,7 @@ def configured_project_with_label(client, rand_gen, image_url, project, dataset,
 
     def create_label():
         """ Ad-hoc function to create a LabelImport
-        
+
         Creates a LabelImport task which will create a label
         """
         upload_task = LabelImport.create_from_objects(