
Commit e6f523c

Merge pull request #615 from Labelbox/bulk-upload-convo
Ability to bulk upload conversations using Dataset.create_data_rows() + update staging URL
2 parents 8e0c042 + a8fca14 commit e6f523c

5 files changed: +186 additions, −3 deletions

labelbox/schema/dataset.py

Lines changed: 49 additions & 0 deletions
@@ -226,6 +226,7 @@ def _create_descriptor_file(self, items, max_attachments_per_data_row=None):
             >>> {DataRow.row_data:"/path/to/file1.jpg"},
             >>> "path/to/file2.jpg",
             >>> {"tileLayerUrl" : "http://", ...}
+            >>> {"conversationalData" : [...], ...}
             >>> ])

         For an example showing how to upload tiled data_rows see the following notebook:
@@ -280,6 +281,33 @@ def validate_attachments(item):
                     )
             return attachments

+        def validate_conversational_data(conversational_data: list) -> None:
+            """
+            Checks each conversational message for keys expected as per https://docs.labelbox.com/reference/text-conversational#sample-conversational-json
+
+            Args:
+                conversational_data (list): list of dictionaries.
+            """
+
+            def check_message_keys(message):
+                accepted_message_keys = set([
+                    "messageId", "timestampUsec", "content", "user", "align",
+                    "canLabel"
+                ])
+                for key in message.keys():
+                    if not key in accepted_message_keys:
+                        raise KeyError(
+                            f"Invalid {key} key found! Accepted keys in messages list is {accepted_message_keys}"
+                        )
+
+            if conversational_data and not isinstance(conversational_data,
+                                                      list):
+                raise ValueError(
+                    f"conversationalData must be a list. Found {type(conversational_data)}"
+                )
+
+            [check_message_keys(message) for message in conversational_data]
+
         def parse_metadata_fields(item):
             metadata_fields = item.get('metadata_fields')
             if metadata_fields:
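
Since validate_conversational_data is nested inside _create_descriptor_file and not importable on its own, here is a standalone re-implementation sketch of what the key check accepts and rejects (illustrative only, not part of the PR):

ACCEPTED_MESSAGE_KEYS = {
    "messageId", "timestampUsec", "content", "user", "align", "canLabel"
}


def check_message_keys(message: dict) -> None:
    # Mirrors the nested validator: reject any top-level key
    # outside the documented message schema.
    for key in message:
        if key not in ACCEPTED_MESSAGE_KEYS:
            raise KeyError(f"Invalid {key} key found!")


check_message_keys({"messageId": "message-0", "content": "hi"})  # passes

try:
    check_message_keys({"messageId": "message-0", "emoji": ":wave:"})
except KeyError as err:
    print(err)  # the unexpected "emoji" key is rejected

Note that the check is shallow: keys nested inside the user object are never inspected.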
@@ -321,6 +349,27 @@ def convert_item(item):
            if "tileLayerUrl" in item:
                validate_attachments(item)
                return item
+
+            if "conversationalData" in item:
+                messages = item.pop("conversationalData")
+                version = item.pop("version")
+                type = item.pop("type")
+                if "externalId" in item:
+                    external_id = item.pop("externalId")
+                    item["external_id"] = external_id
+                validate_conversational_data(messages)
+                one_conversation = \
+                    {
+                        "type": type,
+                        "version": version,
+                        "messages": messages
+                    }
+                conversationUrl = self.client.upload_data(
+                    json.dumps(one_conversation),
+                    content_type="application/json",
+                    filename="conversational_data.json")
+                item["row_data"] = conversationUrl
+
            # Convert all payload variations into the same dict format
            item = format_row(item)
            # Make sure required keys exist (and there are no extra keys)
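
With this change, conversational data rows go through the same bulk path as files and tiled imagery: each item's messages are uploaded as a JSON file and the hosted URL is swapped in as row_data. A minimal usage sketch, assuming an API key and an existing dataset; lb_client, my_dataset, and the inline payload are illustrative (the payload shape comes from the sample bulk_conversation.json added below):

import labelbox

lb_client = labelbox.Client(api_key="<YOUR_API_KEY>")
my_dataset = lb_client.create_dataset(name="bulk-conversations-demo")

# "type" and "version" are popped unconditionally in convert_item,
# so every conversational item must carry them.
task = my_dataset.create_data_rows([{
    "externalId": "Convo-123",
    "type": "application/vnd.labelbox.conversational",
    "version": 1,
    "conversationalData": [{
        "messageId": "message-0",
        "timestampUsec": 1530718491,
        "content": "I love iphone! i just bought new iphone!",
        "user": {"userId": "Bot 002", "name": "Bot"},
        "align": "left",
        "canLabel": False,
    }],
}])
task.wait_till_done()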

tests/integration/conftest.py

Lines changed: 9 additions & 1 deletion
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import time
@@ -190,10 +191,17 @@ def iframe_url(environ) -> str:
 @pytest.fixture
 def sample_video() -> str:
     path_to_video = 'tests/integration/media/cat.mp4'
-    assert os.path.exists(path_to_video)
     return path_to_video


+@pytest.fixture
+def sample_bulk_conversation() -> list:
+    path_to_conversation = 'tests/integration/media/bulk_conversation.json'
+    with open(path_to_conversation) as json_file:
+        conversations = json.load(json_file)
+    return conversations
+
+
 @pytest.fixture
 def organization(client):
     # Must have at least one seat open in your org to run these tests

tests/integration/media/bulk_conversation.json

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+[
+  {
+    "externalId": "Convo-123",
+    "attachments": [
+      {
+        "type": "TEXT",
+        "value": "IOWA, Zone 2232, June 2022 [Text string]"
+      },
+      {
+        "type": "IMAGE",
+        "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/disease_attachment.jpeg"
+      }
+    ],
+    "type": "application/vnd.labelbox.conversational",
+    "version": 1,
+    "conversationalData": [
+      {
+        "messageId": "message-0",
+        "timestampUsec": 1530718491,
+        "content": "I love iphone! i just bought new iphone! :smiling_face_with_3_hearts: :calling:",
+        "user": {
+          "userId": "Bot 002",
+          "name": "Bot"
+        },
+        "align": "left",
+        "canLabel": false
+      },
+      {
+        "messageId": "message-1",
+        "timestampUsec": 1530718503,
+        "content": "Thats good for you, i'm not very into new tech",
+        "user": {
+          "userId": "User 00686",
+          "name": "User"
+        },
+        "align": "right",
+        "canLabel": true
+      },
+      {
+        "messageId": "message-2",
+        "timestampUsec": 1530718516,
+        "content": "I am a college student and i am a college student :female-teacher::skin-tone-2:",
+        "user": {
+          "userId": "Bot 002",
+          "name": "Bot"
+        },
+        "align": "left",
+        "canLabel": false
+      },
+      {
+        "messageId": "message-3",
+        "timestampUsec": 1530718528,
+        "content": "I am go to gym and live on donations :woman-lifting-weights::skin-tone-6:",
+        "user": {
+          "userId": "User 00686",
+          "name": "User"
+        },
+        "align": "right",
+        "canLabel": true
+      }
+    ]
+  },
+  {
+    "externalId": "Convo-456",
+    "attachments": [
+      {
+        "type": "TEXT",
+        "value": "IOWA, Zone 1234567, July 2022 [Text string]"
+      },
+      {
+        "type": "IMAGE",
+        "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/disease_attachment.jpeg"
+      }
+    ],
+    "type": "application/vnd.labelbox.conversational",
+    "version": 1,
+    "conversationalData": [
+      {
+        "messageId": "message-0",
+        "timestampUsec": 1530718491,
+        "content": "I love iphone! i just bought new iphone! :smiling_face_with_3_hearts: :calling:",
+        "user": {
+          "userId": "Bot 002",
+          "name": "Bot"
+        },
+        "align": "left",
+        "canLabel": false
+      },
+      {
+        "messageId": "message-1",
+        "timestampUsec": 1530718503,
+        "content": "Thats good for you, i'm not very into new tech",
+        "user": {
+          "userId": "User 00686",
+          "name": "User"
+        },
+        "align": "right",
+        "canLabel": true
+      },
+      {
+        "messageId": "message-2",
+        "timestampUsec": 1530718516,
+        "content": "I am a college student and i am a college student :female-teacher::skin-tone-2:",
+        "user": {
+          "userId": "Bot 002",
+          "name": "Bot"
+        },
+        "align": "left",
+        "canLabel": false
+      }
+    ]
+  }
+]
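
This sample pairs with the sample_bulk_conversation fixture and the new validator. A quick sanity-check sketch, assuming the file lives at the path used in conftest.py:

import json

# Load the sample and confirm every message uses only the accepted keys.
with open("tests/integration/media/bulk_conversation.json") as f:
    conversations = json.load(f)

accepted = {"messageId", "timestampUsec", "content", "user", "align", "canLabel"}
for conversation in conversations:
    for message in conversation["conversationalData"]:
        assert set(message) <= accepted, f"unexpected keys in {message['messageId']}"
print(f"{len(conversations)} conversations validated")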

tests/integration/test_client_errors.py

Lines changed: 3 additions & 2 deletions
@@ -43,7 +43,6 @@ def test_semantic_error(client):


 def test_timeout_error(client, project):
-    time.sleep(60)  #Fails to connect if we don't wait
     with pytest.raises(labelbox.exceptions.TimeoutError) as excinfo:
        query_str = """query getOntology {
            project (where: {id: $%s}) {
@@ -52,7 +51,9 @@ def test_timeout_error(client, project):
                }
            }
        } """ % (project.uid)
-        client.execute(query_str, check_naming=False, timeout=0.01)
+
+        # Setting connect timeout to 30s, and read timeout to 0.01s
+        client.execute(query_str, check_naming=False, timeout=(30.0, 0.01))


 def test_query_complexity_error(client):
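
The tuple form follows the (connect, read) timeout convention from the requests library, which the SDK client appears to pass through; this is an assumption based on the diff, not confirmed here. A small sketch of the same semantics against an illustrative endpoint:

import requests

try:
    # 30s to establish the connection, then 0.01s to wait for the first
    # response byte; the tiny read timeout is what trips the test reliably.
    requests.get("https://api.labelbox.com", timeout=(30.0, 0.01))
except requests.exceptions.Timeout as err:
    print(f"timed out as expected: {err}")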

tests/integration/test_dataset.py

Lines changed: 12 additions & 0 deletions
@@ -1,3 +1,4 @@
+import json
 import pytest
 import requests
 from labelbox import Dataset
@@ -101,6 +102,17 @@ def test_upload_video_file(dataset, sample_video: str) -> None:
     assert response.headers['Content-Type'] == 'video/mp4'


+def test_bulk_conversation(dataset, sample_bulk_conversation: list) -> None:
+    """
+    Tests that bulk conversations can be uploaded.
+
+    """
+    task = dataset.create_data_rows(sample_bulk_conversation)
+    task.wait_till_done()
+
+    assert len(list(dataset.data_rows())) == len(sample_bulk_conversation)
+
+
 def test_data_row_export(dataset, image_url):
     n_data_rows = 5
     ids = set()
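
One possible hardening of the new test, sketched here as a hypothetical variant (not part of the PR), assuming the SDK's Task entity exposes status and errors fields: asserting on them surfaces server-side failures more directly than the row-count check alone.

def test_bulk_conversation_reports_errors(dataset, sample_bulk_conversation: list) -> None:
    task = dataset.create_data_rows(sample_bulk_conversation)
    task.wait_till_done()
    # If any row failed server-side, task.errors gives the reason
    # in the assertion message instead of a bare count mismatch.
    assert task.status == "COMPLETE", task.errors
    assert len(list(dataset.data_rows())) == len(sample_bulk_conversation)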
