Commit a088470

adding containerized cog inference
1 parent 7c3242f commit a088470

File tree

3 files changed (+106, -1 lines): README.md, cog.yaml, predict.py

README.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ Ishan Misra*
 
 To appear at CVPR 2023 (*Highlighted paper*)
 
-[[`Paper`](https://facebookresearch.github.io/ImageBind/paper)] [[`Blog`](https://ai.facebook.com/blog/imagebind-six-modalities-binding-ai/)] [[`Demo`](https://imagebind.metademolab.com/)] [[`Supplementary Video`](https://dl.fbaipublicfiles.com/imagebind/imagebind_video.mp4)] [[`BibTex`](#citing-imagebind)]
+[[`Paper`](https://facebookresearch.github.io/ImageBind/paper)] [[`Blog`](https://ai.facebook.com/blog/imagebind-six-modalities-binding-ai/)] [[`Demo`](https://imagebind.metademolab.com/)] [[`Supplementary Video`](https://dl.fbaipublicfiles.com/imagebind/imagebind_video.mp4)] [[`BibTex`](#citing-imagebind)] [[`Replicate Demo`](https://replicate.com/daanelson/imagebind)]
 
 PyTorch implementation and pretrained models for ImageBind. For details, see the paper: **[ImageBind: One Embedding Space To Bind Them All](https://facebookresearch.github.io/ImageBind/paper)**.
 

cog.yaml

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

build:
  # set to true if your model requires a GPU
  gpu: true
  cuda: "11.6"

  # a list of ubuntu apt packages to install
  # system_packages:
  #   - "libgl1-mesa-glx"
  #   - "libglib2.0-0"

  # python version in the form '3.8' or '3.8.12'
  python_version: "3.9"

  # a list of packages in the format <package-name>==<version>
  python_packages:
    - "torch==1.13"
    - "torchvision==0.14.0"
    - "torchaudio==0.13.0"
    - "pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d"
    - "timm==0.6.7"
    - "ftfy"
    - "regex"
    - "einops"
    - "fvcore"
    - "decord==0.6.0"

  # commands run after the environment is setup
  # run:
  #   - "echo env is ready!"
  #   - "echo another command if needed"

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
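
With this configuration, Cog builds a CUDA 11.6 container with the pinned PyTorch stack and serves the Predictor class defined in predict.py below. Assuming the Cog CLI is installed, an embedding can then be generated locally with a command along the lines of: cog predict -i input=@image.jpg -i modality=vision (the image filename here is illustrative).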

predict.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

from typing import List
from cog import BasePredictor, Input, Path

import data
import torch
from models import imagebind_model
from models.imagebind_model import ModalityType

# Map each supported modality to the matching preprocessing function from
# ImageBind's data module.
MODALITY_TO_PREPROCESSING = {
    ModalityType.TEXT: data.load_and_transform_text,
    ModalityType.VISION: data.load_and_transform_vision_data,
    ModalityType.AUDIO: data.load_and_transform_audio_data,
}


class Predictor(BasePredictor):
    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""
        model = imagebind_model.imagebind_huge(pretrained=True)
        model.eval()
        self.model = model.to("cuda")

    def predict(
        self,
        input: Path = Input(
            description="file that you want to embed. Needs to be text, vision, or audio.",
            default=None,
        ),
        text_input: str = Input(
            description="text that you want to embed. Provide a string here instead of a text file if you'd like.",
            default=None,
        ),
        modality: str = Input(
            description="modality of the input you'd like to embed",
            choices=list(MODALITY_TO_PREPROCESSING.keys()),
            default=ModalityType.VISION,
        ),
    ) -> List[float]:
        """Infer a single embedding with the model"""

        if not input and not text_input:
            raise ValueError(
                "Neither input nor text_input was provided! Provide one in order to generate an embedding."
            )

        modality_function = MODALITY_TO_PREPROCESSING[modality]

        if modality == "text":
            if input and text_input:
                raise ValueError(
                    f"Input and text_input were both provided! Only provide one to generate an embedding.\nInput provided: {input}\nText input provided: {text_input}"
                )
            if text_input:
                input = text_input
            else:
                # Read the text file into a single string for the tokenizer.
                with open(input, "r") as f:
                    input = f.read().strip()

        device = "cuda"
        model_input = {modality: modality_function([input], device)}

        with torch.no_grad():
            embeddings = self.model(model_input)
        # The model returns a dict keyed by modality; pull out the one requested.
        emb = embeddings[modality]
        return emb.cpu().squeeze().tolist()
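
Because predict() returns the embedding as a flat list of floats, comparing inputs across modalities comes down to cosine similarity between the returned vectors. A minimal sketch of that workflow, driving the Predictor directly from Python; the caption and the dog.wav path are illustrative stand-ins, not part of this commit:

# Usage sketch (not part of this commit): embed a caption and an audio clip
# with the Predictor above, then compare them with cosine similarity.
# "a dog barking" and "dog.wav" are placeholder inputs.
import torch

from predict import Predictor

predictor = Predictor()
predictor.setup()  # loads imagebind_huge and moves it to the GPU

text_emb = torch.tensor(
    predictor.predict(input=None, text_input="a dog barking", modality="text")
)
audio_emb = torch.tensor(
    predictor.predict(input="dog.wav", text_input=None, modality="audio")
)

# Both vectors live in ImageBind's shared embedding space, so a higher
# cosine similarity suggests the caption better matches the audio.
similarity = torch.nn.functional.cosine_similarity(text_emb, audio_emb, dim=0)
print(f"text/audio similarity: {similarity.item():.3f}")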
