Skip to content

Commit

Permalink
move the docker conf around. inch towards generic model serving
Browse files Browse the repository at this point in the history
  • Loading branch information
metazool committed Feb 13, 2025
1 parent c81ccc0 commit 4b5d21f
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 21 deletions.
15 changes: 11 additions & 4 deletions src/label_studio_cyto_ml/Dockerfile → Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ ARG PYTHON_VERSION=3.12
FROM python:${PYTHON_VERSION}-slim AS python-base
ARG TEST_ENV

# Build context is the root of the project
# This way we can copy model state into the container.
ARG APP_PATH=./src/label_studio_cyto_ml

WORKDIR /models
COPY ./models .
WORKDIR /app

ENV PYTHONUNBUFFERED=1 \
Expand All @@ -24,24 +30,25 @@ RUN --mount=type=cache,target="/var/cache/apt",sharing=locked \
apt-get autoremove -y

# install base requirements
COPY requirements-base.txt .
COPY ${APP_PATH}/requirements-base.txt .
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
pip install -r requirements-base.txt

# install custom requirements
COPY requirements.txt .
COPY ${APP_PATH}/requirements.txt .
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
pip install -r requirements.txt

# install test requirements if needed
COPY requirements-test.txt .
COPY ${APP_PATH}/requirements-test.txt .
# build only when TEST_ENV="true"
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
if [ "$TEST_ENV" = "true" ]; then \
pip install -r requirements-test.txt; \
fi

COPY . .
COPY ${APP_PATH} .

ENV PYTHONPATH /app
EXPOSE 9090

Expand Down
2 changes: 1 addition & 1 deletion scripts/params.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cluster:
n_clusters: 5

collection: untagged-images-wala
collection: untagged-images-lana
3 changes: 1 addition & 2 deletions src/cyto_ml/data/vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,6 @@ def closest(self, url: str, n_results: int = 25) -> List:
select embedding as first_embedding from images_vec where id = ?
)
select
images_vec.id,
images.url,
vec_distance_cosine(images_vec.embedding, first_embedding) as distance
from
Expand All @@ -209,7 +208,7 @@ def closest(self, url: str, n_results: int = 25) -> List:
order by distance limit ?"""

results = self.db.execute(query, [doc_id, n_results]).fetchall()
return [i for j in results for i in j]
return results # [i for j in results for i in j]

def labelled(self, label: str, n_results: int = 50) -> List[str]:
labelled = self.db.execute(
Expand Down
11 changes: 8 additions & 3 deletions src/cyto_ml/visualisation/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def closest_n(url: str, n: Optional[int] = 26) -> list:
"""
s = store(st.session_state["collection"])
results = s.closest(url, n_results=n)
logging.info(results)
# logging.info(results)
return results


Expand Down Expand Up @@ -108,10 +108,13 @@ def closest_grid(start_url: str, size: Optional[int] = 65) -> None:
for index, _ in enumerate(rows):
for c in rows[index]:
try:
next_image = closest.pop()
next_image, distance = closest.pop()
except IndexError:
break
c.image(cached_image(next_image), width=60)
next_image = next_image.replace(".tif", ".png")
next_image = next_image.replace("untagged-images-lana", "untagged-images-lana-ls")

c.image(next_image, width=60)
c.button("this", key=next_image, on_click=pick_image, args=[next_image])


Expand Down Expand Up @@ -148,10 +151,12 @@ def random_image() -> str:


def pick_image(image: str) -> None:
    """Record *image* as the currently selected image in the session state.

    Used as the ``on_click`` callback of the per-image "this" button in the
    similarity grid.

    Args:
        image: URL of the image that was clicked.
    """
    # Lazy %-style args: nothing is concatenated up front, so an unexpected
    # non-string value is still logged instead of raising TypeError.
    logging.info("pick %s", image)
    st.session_state["random_img"] = image


def show_random_image() -> None:
logging.info("show" + st.session_state["random_img"])
if st.session_state["random_img"]:
st.image(cached_image(st.session_state["random_img"]))
st.write(st.session_state["random_img"])
Expand Down
6 changes: 4 additions & 2 deletions src/label_studio_cyto_ml/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ services:
volumes:
- label-studio-data:/label-studio/data


ml-backend:
container_name: ml-backend
image: humansignal/ml-backend:v0
env_file: ".env"
build:
context: .
context: ../..
args:
TEST_ENV: ${TEST_ENV}
environment:
Expand Down Expand Up @@ -47,4 +49,4 @@ services:

volumes:
label-studio-data:
driver: local
driver: local
43 changes: 34 additions & 9 deletions src/label_studio_cyto_ml/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,27 @@ def predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -
raise (err)

# https://github.com/HumanSignal/label-studio-ml-backend/blob/master/label_studio_ml/response.py

return ModelResponse(predictions=predictions)

def convert_url(self, url: str) -> str:
    """Convert an s3:// URL into a URL under the configured endpoint.

    The endpoint is read from the AWS_URL_ENDPOINT environment variable
    (set it in .env). It is used verbatim as the prefix, so it must include
    the scheme, e.g. ``https://host``.

    Args:
        url: Image URL; anything not starting with ``s3://`` is returned
            unchanged.

    Returns:
        The endpoint-based URL for s3:// inputs, otherwise ``url`` itself.

    Raises:
        ValueError: If an s3:// URL is given but AWS_URL_ENDPOINT is unset
            or empty.
    """
    if not url.startswith("s3://"):
        return url

    endpoint = os.getenv("AWS_URL_ENDPOINT")
    if not endpoint:
        # Without this guard the result would be the literal "None/<bucket>/..."
        # which only fails later, far from the real cause.
        raise ValueError("AWS_URL_ENDPOINT must be set to convert s3:// URLs")

    # Strip only the leading scheme (not later occurrences) and avoid a
    # double slash when the endpoint is configured with a trailing "/".
    return f"{endpoint.rstrip('/')}/{url.removeprefix('s3://')}"

def bucket_from_url(self, url: str) -> str:
    """Extract the bucket name from an ``s3://bucket/key`` style URL.

    A trailing ``-ls`` on the bucket name is dropped, so a bucket and its
    ``-ls`` counterpart resolve to the same name.

    Args:
        url: URL whose third ``/``-separated component is the bucket.

    Returns:
        The bucket name without a trailing ``-ls``.

    Raises:
        ImageNotFoundError: If the URL has no bucket component, or the
            bucket component is empty.
    """
    try:
        bucket = url.split("/")[2]
    except IndexError as err:
        # Keep the original cause attached for easier debugging.
        raise ImageNotFoundError(f"Could not find bucket in {url}") from err

    if not bucket:
        # e.g. "s3:///key" - an empty name can never match a model file.
        raise ImageNotFoundError(f"Could not find bucket in {url}")

    # Only strip the suffix; "-ls" elsewhere in the name is part of the name.
    return bucket.removesuffix("-ls")

def predict_task(self, task: dict) -> dict:
"""Receive a single task definition as described here https://labelstud.io/guide/task_format.html
Return the task decorated with predictions as described here
Expand All @@ -84,20 +96,33 @@ def predict_task(self, task: dict) -> dict:

features = resnet50_model(load_image_from_url(self.convert_url(image_url)))
embeddings = flat_embeddings(features)

# Classify embeddings (KNN to start, many improvements possible!) and return a label
label = self.embeddings_predict(embeddings)
# TODO check what the return format should be - does ModelResponse handle this?
# This allows us one prediction model per bucket, but it could be an ensemble
bucket_name = self.bucket_from_url(image_url)

label = self.embeddings_predict(embeddings, model=bucket_name)

return label

def embeddings_predict(self, embeddings: List[List[float]], model: Optional[str] = "") -> Dict:
    """Predict a label from embeddings using a pickled, pre-fitted model.

    See cyto_ml/visualisation/pages/02_kmeans.py for usage for a collection.
    See scripts/cluster.py for the model build and save.

    Args:
        embeddings: Embeddings for one image; wrapped in a single-item batch
            before being passed to the model's ``predict``.
        model: The name of the model to use (based on bucket name).

    Returns:
        A dict of the form ``{"result": label}`` where ``label`` is the
        first element of the model's prediction for the batch.
    """
    # TODO load this from config, add to Dockerfile

    # "naming convention" is {model type}-{bucket name}
    # NOTE(review): pickle.load runs arbitrary code from the file - only
    # load model files produced by this project.
    with open(f"./models/kmeans-{model}.pkl", "rb") as model_file:
        fitted = pickle.load(model_file)

    # predict() takes a batch, so send a batch of one and unwrap the answer.
    label = fitted.predict([embeddings])[0]

    # The prediction format should be this, a dict
    # model_version: Optional[Any] = None
    # score: Optional[float] = 0.00
    # result: Optional[List[Union[Dict[str, Any], Region]]]
    return {"result": label}

def fit(
self,
Expand Down

0 comments on commit 4b5d21f

Please sign in to comment.