move the docker conf around. inch towards generic model serving

NERC-CEH · Feb 13, 2025 · 4b5d21f · 4b5d21f
1 parent c81ccc0
commit 4b5d21f
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 21 deletions.
diff --git a/src/label_studio_cyto_ml/Dockerfile → Dockerfile b/src/label_studio_cyto_ml/Dockerfile → Dockerfile
@@ -4,6 +4,12 @@ ARG PYTHON_VERSION=3.12
 FROM python:${PYTHON_VERSION}-slim AS python-base
 ARG TEST_ENV
 
+# Build context is the root of the project
+# This way we can copy model state into the container.
+ARG APP_PATH=./src/label_studio_cyto_ml
+
+WORKDIR /models
+COPY ./models .
 WORKDIR /app
 
 ENV PYTHONUNBUFFERED=1 \
@@ -24,24 +30,25 @@ RUN --mount=type=cache,target="/var/cache/apt",sharing=locked \
     apt-get autoremove -y
 
 # install base requirements
-COPY requirements-base.txt .
+COPY ${APP_PATH}/requirements-base.txt .
 RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
     pip install -r requirements-base.txt
 
 # install custom requirements
-COPY requirements.txt .
+COPY ${APP_PATH}/requirements.txt .
 RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
     pip install -r requirements.txt
 
 # install test requirements if needed
-COPY requirements-test.txt .
+COPY ${APP_PATH}/requirements-test.txt .
 # build only when TEST_ENV="true"
 RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
     if [ "$TEST_ENV" = "true" ]; then \
       pip install -r requirements-test.txt; \
     fi
 
-COPY . .
+COPY ${APP_PATH} .
+
 ENV PYTHONPATH /app
 EXPOSE 9090
 

diff --git a/scripts/params.yaml b/scripts/params.yaml
@@ -1,4 +1,4 @@
 cluster:
   n_clusters: 5
 
-collection: untagged-images-wala
+collection: untagged-images-lana
diff --git a/src/cyto_ml/data/vectorstore.py b/src/cyto_ml/data/vectorstore.py
@@ -200,7 +200,6 @@ def closest(self, url: str, n_results: int = 25) -> List:
                 select embedding as first_embedding from images_vec where id = ?
             )
             select
-            images_vec.id,
             images.url,
             vec_distance_cosine(images_vec.embedding, first_embedding) as distance
             from
@@ -209,7 +208,7 @@ def closest(self, url: str, n_results: int = 25) -> List:
             order by distance limit ?"""
 
         results = self.db.execute(query, [doc_id, n_results]).fetchall()
-        return [i for j in results for i in j]
+        return results  # [i for j in results for i in j]
 
     def labelled(self, label: str, n_results: int = 50) -> List[str]:
         labelled = self.db.execute(

diff --git a/src/cyto_ml/visualisation/app.py b/src/cyto_ml/visualisation/app.py
@@ -70,7 +70,7 @@ def closest_n(url: str, n: Optional[int] = 26) -> list:
     """
     s = store(st.session_state["collection"])
     results = s.closest(url, n_results=n)
-    logging.info(results)
+    # logging.info(results)
     return results
 
 
@@ -108,10 +108,13 @@ def closest_grid(start_url: str, size: Optional[int] = 65) -> None:
     for index, _ in enumerate(rows):
         for c in rows[index]:
             try:
-                next_image = closest.pop()
+                next_image, distance = closest.pop()
             except IndexError:
                 break
-            c.image(cached_image(next_image), width=60)
+            next_image = next_image.replace(".tif", ".png")
+            next_image = next_image.replace("untagged-images-lana", "untagged-images-lana-ls")
+
+            c.image(next_image, width=60)
             c.button("this", key=next_image, on_click=pick_image, args=[next_image])
 
 
@@ -148,10 +151,12 @@ def random_image() -> str:
 
 
 def pick_image(image: str) -> None:
+    logging.info("pick " + image)
     st.session_state["random_img"] = image
 
 
 def show_random_image() -> None:
+    logging.info("show" + st.session_state["random_img"])
     if st.session_state["random_img"]:
         st.image(cached_image(st.session_state["random_img"]))
         st.write(st.session_state["random_img"])

diff --git a/src/label_studio_cyto_ml/docker-compose.yml b/src/label_studio_cyto_ml/docker-compose.yml
@@ -11,11 +11,13 @@ services:
     volumes:
       - label-studio-data:/label-studio/data
 
+
   ml-backend:
     container_name: ml-backend
     image: humansignal/ml-backend:v0
+    env_file: ".env"
     build:
-      context: .
+      context: ../..
       args:
         TEST_ENV: ${TEST_ENV}
     environment:
@@ -47,4 +49,4 @@ services:
 
 volumes:
   label-studio-data:
-    driver: local
+    driver: local
diff --git a/src/label_studio_cyto_ml/model.py b/src/label_studio_cyto_ml/model.py
@@ -61,15 +61,27 @@ def predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -
                 raise (err)
 
         # https://github.com/HumanSignal/label-studio-ml-backend/blob/master/label_studio_ml/response.py
+
         return ModelResponse(predictions=predictions)
 
     def convert_url(self, url: str) -> str:
         """Convert an s3:// URL to an https:// URL
         Set AWS_URL_ENDPOINT in .env"""
         if url.startswith("s3://"):
-            return url.replace("s3://", f"https://{os.getenv('AWS_URL_ENDPOINT')}/")
+            return url.replace("s3://", f"{os.getenv('AWS_URL_ENDPOINT')}/")
         return url
 
+    def bucket_from_url(self, url: str) -> str:
+        """Extract the bucket from an s3:// URL"""
+        try:
+            bucket = url.split("/")[2]
+        except IndexError:
+            raise ImageNotFoundError(f"Could not find bucket in {url}")
+        if bucket and "-ls" in bucket:
+            bucket = bucket.replace("-ls", "")
+
+        return bucket
+
     def predict_task(self, task: dict) -> dict:
         """Receive a single task definition as described here https://labelstud.io/guide/task_format.html
         Return the task decorated with predictions as described here
@@ -84,20 +96,33 @@ def predict_task(self, task: dict) -> dict:
 
         features = resnet50_model(load_image_from_url(self.convert_url(image_url)))
         embeddings = flat_embeddings(features)
+
         # Classify embeddings (KNN to start, many improvements possible!) and return a label
-        label = self.embeddings_predict(embeddings)
-        # TODO check what the return format should be - does ModelResponse handle this?
+        # This allows us one prediction model per bucket, but it could be an ensemble
+        bucket_name = self.bucket_from_url(image_url)
+
+        label = self.embeddings_predict(embeddings, model=bucket_name)
+
         return label
 
-    def embeddings_predict(self, embeddings: List[List[float]]) -> List[str]:
+    def embeddings_predict(self, embeddings: List[List[float]], model: Optional[str] = "") -> List[str]:
         """Predict labels from embeddings
         See cyto_ml/visualisation/pages/02_kmeans.py for usage for a collection
-        See scripts/cluster.py for the model build and save
+        See scripts/cluster.py for the model build and save.
+        Args:
+            embeddings: List of embeddings
+            model: The name of the model to use (based on bucket name)
         """
-        # TODO load this from config, add to Dockerfile
-        fitted = pickle.load(open("../models/kmeans-untagged-images-lana.pkl", "rb"))
-        label = fitted.predict(embeddings)
-        return label
+
+        # "naming convention" is {model type}-{bucket name}
+        fitted = pickle.load(open(f"./models/kmeans-{model}.pkl", "rb"))
+        label = fitted.predict([embeddings])[0]
+
+        # The prediction format should be this, a dict
+        # model_version: Optional[Any] = None
+        # score: Optional[float] = 0.00
+        # result: Optional[List[Union[Dict[str, Any], Region]]]
+        return {"result": label}
 
     def fit(
         self,