-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Michele Dolfi <[email protected]>
- Loading branch information
1 parent
ee7a237
commit ddf3144
Showing
3 changed files
with
90 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,61 @@ | ||
FROM python:3.11-slim-bookworm | ||
ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s | ||
|
||
FROM ${BASE_IMAGE} | ||
|
||
ARG CPU_ONLY=false | ||
WORKDIR /docling-serve | ||
|
||
RUN apt-get update \ | ||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \ | ||
&& apt-get clean | ||
USER 0 | ||
|
||
RUN pip install --no-cache-dir poetry | ||
################################################################################################### | ||
# OS Layer # | ||
################################################################################################### | ||
|
||
COPY pyproject.toml poetry.lock README.md /docling-serve/ | ||
# Install the OS packages listed in os-packages.txt. The file is bind-mounted at | ||
# /tmp/os-packages.txt for this RUN only and expanded with `cat`. | ||
# dnf-plugins-core is installed before the `dnf config-manager` calls, which save | ||
# the --best / --nodocs / install_weak_deps=False options and enable the `crb` | ||
# repository. The dnf caches are cleaned and removed within the same RUN. | ||
RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \ | ||
dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \ | ||
dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \ | ||
dnf config-manager --enable crb && \ | ||
dnf -y update && \ | ||
dnf install -y $(cat /tmp/os-packages.txt) && \ | ||
dnf -y clean all && \ | ||
rm -rf /var/cache/dnf | ||
|
||
RUN if [ "$CPU_ONLY" = "true" ]; then \ | ||
poetry install --no-root --with cpu; \ | ||
else \ | ||
poetry install --no-root; \ | ||
fi | ||
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/ | ||
|
||
ENV HF_HOME=/tmp/ | ||
ENV TORCH_HOME=/tmp/ | ||
################################################################################################### | ||
# Docling layer # | ||
################################################################################################### | ||
|
||
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);' | ||
USER 1001 | ||
|
||
WORKDIR /opt/app-root/src | ||
|
||
# On container environments, always set a thread budget to avoid undesired thread congestion. | ||
ENV OMP_NUM_THREADS=4 | ||
|
||
COPY ./docling_serve /docling-serve/docling_serve | ||
ENV LANG=en_US.UTF-8 | ||
ENV LC_ALL=en_US.UTF-8 | ||
ENV PYTHONIOENCODING=utf-8 | ||
|
||
ENV WITH_UI=True | ||
|
||
COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./ | ||
|
||
# Install Poetry and the project dependencies, then run models_download.py so the | ||
# models are fetched at build time. | ||
# CPU_ONLY=true adds the `cpu` dependency group; the `dev` group is always left out | ||
# and the project itself is not installed (--no-root). | ||
# The closing chown/chmod set owner 1001, group 0 on /opt/app-root/src and give the | ||
# group the same permissions as the owner (g=u). | ||
RUN pip install --no-cache-dir poetry && \ | ||
# We already are in a virtual environment, so we don't need to create a new one, only activate it. | ||
poetry config virtualenvs.create false && \ | ||
source /opt/app-root/bin/activate && \ | ||
if [ "$CPU_ONLY" = "true" ]; then \ | ||
poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \ | ||
else \ | ||
poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \ | ||
fi && \ | ||
echo "Downloading models..." && \ | ||
python models_download.py && \ | ||
chown -R 1001:0 /opt/app-root/src && \ | ||
chmod -R g=u /opt/app-root/src | ||
|
||
COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve | ||
|
||
EXPOSE 5001 | ||
|
||
CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"] | ||
CMD ["uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import os | ||
import zipfile | ||
|
||
import requests | ||
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models | ||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline | ||
|
||
# Download Docling models | ||
StandardPdfPipeline.download_models_hf(force=True) | ||
load_pretrained_nlp_models(verbose=True) | ||
|
||
# Download EasyOCR models | ||
urls = [ | ||
"https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip", | ||
"https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip" | ||
] | ||
|
||
local_zip_paths = [ | ||
"/opt/app-root/src/latin_g2.zip", | ||
"/opt/app-root/src/craft_mlt_25k.zip" | ||
] | ||
|
||
extract_path = "/opt/app-root/src/.EasyOCR/model/" | ||
|
||
for url, local_zip_path in zip(urls, local_zip_paths): | ||
# Download the file | ||
response = requests.get(url) | ||
with open(local_zip_path, "wb") as file: | ||
file.write(response.content) | ||
|
||
# Unzip the file | ||
with zipfile.ZipFile(local_zip_path, "r") as zip_ref: | ||
zip_ref.extractall(extract_path) | ||
|
||
# Clean up the zip file | ||
os.remove(local_zip_path) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
tesseract | ||
tesseract-devel | ||
tesseract-langpack-eng | ||
leptonica-devel | ||
libglvnd-glx | ||
glib2 | ||
wget | ||
git |