Skip to content

Commit

Permalink
make the image openshift-friendly
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored and vishnoianil committed Jan 29, 2025
1 parent ee7a237 commit ddf3144
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 17 deletions.
63 changes: 46 additions & 17 deletions Containerfile
Original file line number Diff line number Diff line change
@@ -1,32 +1,61 @@
FROM python:3.11-slim-bookworm
ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s

FROM ${BASE_IMAGE}

ARG CPU_ONLY=false
WORKDIR /docling-serve

RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get clean
USER 0

RUN pip install --no-cache-dir poetry
###################################################################################################
# OS Layer #
###################################################################################################

COPY pyproject.toml poetry.lock README.md /docling-serve/
# Install the OS-level packages listed in os-packages.txt.
# - The list is bind-mounted from the build context at /tmp/os-packages.txt for
#   this instruction only, so no COPY of the file is needed.
# - dnf-plugins-core is installed first because it provides the
#   `dnf config-manager` subcommand used in the next two steps.
# - `config-manager ... --save` persists the best/nodocs/no-weak-deps options so
#   the later `dnf update` / `dnf install` calls inherit them.
# - The `crb` repository is enabled before installing; presumably some packages
#   in os-packages.txt (e.g. the -devel ones) come from it -- TODO confirm.
# - Package metadata and caches are removed in the same layer that created them.
RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
dnf config-manager --enable crb && \
dnf -y update && \
dnf install -y $(cat /tmp/os-packages.txt) && \
dnf -y clean all && \
rm -rf /var/cache/dnf

RUN if [ "$CPU_ONLY" = "true" ]; then \
poetry install --no-root --with cpu; \
else \
poetry install --no-root; \
fi
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/

ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/
###################################################################################################
# Docling layer #
###################################################################################################

RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
USER 1001

WORKDIR /opt/app-root/src

# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4

COPY ./docling_serve /docling-serve/docling_serve
ENV LANG=en_US.UTF-8
ENV LC_ALL=en_US.UTF-8
ENV PYTHONIOENCODING=utf-8

ENV WITH_UI=True

COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./

# Install the Python dependencies with Poetry and pre-download the models.
# - Poetry is told not to create its own virtualenv; the existing environment
#   under /opt/app-root is activated instead, so packages land there.
# - CPU_ONLY=true additionally selects the `cpu` dependency group; the `dev`
#   group is excluded in both cases.
# - models_download.py runs at build time, so its downloads end up in this layer.
# - The final chown/chmod make /opt/app-root/src owned by 1001:0 and give the
#   group the same permissions as the owner (g=u); presumably this is what lets
#   the image run under an arbitrary UID in group 0 on OpenShift -- TODO confirm.
RUN pip install --no-cache-dir poetry && \
# We already are in a virtual environment, so we don't need to create a new one, only activate it.
poetry config virtualenvs.create false && \
source /opt/app-root/bin/activate && \
if [ "$CPU_ONLY" = "true" ]; then \
poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \
else \
poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \
fi && \
echo "Downloading models..." && \
python models_download.py && \
chown -R 1001:0 /opt/app-root/src && \
chmod -R g=u /opt/app-root/src

COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve

EXPOSE 5001

CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
CMD ["uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
36 changes: 36 additions & 0 deletions models_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import zipfile

import requests
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Download Docling models
StandardPdfPipeline.download_models_hf(force=True)
load_pretrained_nlp_models(verbose=True)

# Download EasyOCR models
urls = [
    "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
    "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip",
]

# Temporary on-disk location of each archive, index-aligned with `urls`.
local_zip_paths = [
    "/opt/app-root/src/latin_g2.zip",
    "/opt/app-root/src/craft_mlt_25k.zip",
]

# Directory the archives are unpacked into.
extract_path = "/opt/app-root/src/.EasyOCR/model/"

# Seconds to wait on the connection / between received bytes before giving up,
# so that a stalled download fails instead of hanging forever.
DOWNLOAD_TIMEOUT = 60

for url, local_zip_path in zip(urls, local_zip_paths):
    # Download the file. Abort on any HTTP error status so that an error page
    # is never written to disk and mistaken for a zip archive.
    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    response.raise_for_status()
    with open(local_zip_path, "wb") as file:
        file.write(response.content)

    try:
        # Unzip the file
        with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)
    finally:
        # Clean up the zip file, even when extraction fails.
        os.remove(local_zip_path)
8 changes: 8 additions & 0 deletions os-packages.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
tesseract
tesseract-devel
tesseract-langpack-eng
leptonica-devel
libglvnd-glx
glib2
wget
git

0 comments on commit ddf3144

Please sign in to comment.