Skip to content

Commit 18f0d90

Browse files
authored
Merge convert-document into ingest-file (#395)
* Merge convert-document into ingest-file
  Fixes #122
* Re-raise exception during conversion
* Add retry logic
* Document the unit for timeout
1 parent e040f1b commit 18f0d90

29 files changed

+76 -543 lines changed

.github/workflows/build.yml

-9
Original file line number | Diff line number | Diff line change
@@ -32,11 +32,6 @@ jobs:
3232
docker push ghcr.io/alephdata/ingest-file:${GITHUB_SHA}
3333
docker tag ghcr.io/alephdata/ingest-file ghcr.io/alephdata/ingest-file:cache
3434
docker push ghcr.io/alephdata/ingest-file:cache
35-
36-
docker tag ghcr.io/alephdata/convert-document ghcr.io/alephdata/convert-document:${GITHUB_SHA}
37-
docker push ghcr.io/alephdata/convert-document:${GITHUB_SHA}
38-
docker tag ghcr.io/alephdata/convert-document ghcr.io/alephdata/convert-document:cache
39-
docker push ghcr.io/alephdata/convert-document:cache
4035
- name: Push docker images for tags
4136
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
4237
run: |
@@ -45,7 +40,3 @@ jobs:
4540
docker tag ghcr.io/alephdata/ingest-file ghcr.io/alephdata/ingest-file:${TAG};
4641
docker push ghcr.io/alephdata/ingest-file:${TAG};
4742
docker push ghcr.io/alephdata/ingest-file;
48-
49-
docker tag ghcr.io/alephdata/convert-document ghcr.io/alephdata/convert-document:${TAG};
50-
docker push ghcr.io/alephdata/convert-document:${TAG};
51-
docker push ghcr.io/alephdata/convert-document;

.github/workflows/daily.yml

-1
Original file line number | Diff line number | Diff line change
@@ -20,4 +20,3 @@ jobs:
2020
run: |
2121
echo ${{ secrets.CR_PAT }} | docker login ghcr.io -u $GITHUB_ACTOR --password-stdin
2222
docker push ghcr.io/alephdata/ingest-file:cache
23-
docker push ghcr.io/alephdata/convert-document:cache

Dockerfile

+7-2
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,12 @@ RUN apt-get -qq -y update \
9696
tesseract-ocr-aze \
9797
tesseract-ocr-bel \
9898
tesseract-ocr-uzb \
99+
### pdf convert: libreoffice + a bunch of fonts
100+
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
101+
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
102+
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
103+
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
104+
fonts-tlwg-purisa \
99105
###
100106
&& apt-get -qq -y autoremove \
101107
&& apt-get clean \
@@ -146,8 +152,7 @@ RUN chown -R app:app /ingestors
146152
ENV ARCHIVE_TYPE=file \
147153
ARCHIVE_PATH=/data \
148154
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
149-
REDIS_URL=redis://redis:6379/0 \
150-
INGESTORS_CONVERT_DOCUMENT_URL=http://convert-document:3000/convert
155+
REDIS_URL=redis://redis:6379/0
151156

152157
# USER app
153158
CMD ingestors process

Makefile

+1-6
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
INGEST=ghcr.io/alephdata/ingest-file
2-
CONVERT=ghcr.io/alephdata/convert-document
32
COMPOSE=docker-compose
43
DOCKER=$(COMPOSE) run --rm ingest-file
54

@@ -12,30 +11,26 @@ build:
1211

1312
pull-cache:
1413
-docker pull -q $(INGEST):cache
15-
-docker pull -q $(CONVERT):cache
1614

1715
cached-build: pull-cache
1816
docker build --cache-from $(INGEST):cache -t $(INGEST) .
19-
docker build --cache-from $(CONVERT):cache -t $(CONVERT) convert
2017

2118
fresh-cache:
2219
# re-generate cache images on a daily basis to avoid using
2320
# stale docker containers from upstream.
2421
docker build --pull --no-cache -t $(INGEST):cache .
25-
docker build --pull --no-cache -t $(CONVERT):cache convert
2622

2723
services:
2824
$(COMPOSE) up -d --remove-orphans postgres redis
2925

3026
shell: services
31-
$(COMPOSE) up -d --remove-orphans convert-document
3227
$(DOCKER) /bin/bash
3328

3429
test: services
3530
$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
3631

3732
restart: build
38-
$(COMPOSE) up --force-recreate --no-deps --detach convert-document ingest-file
33+
$(COMPOSE) up --force-recreate --no-deps --detach ingest-file
3934

4035
tail:
4136
$(COMPOSE) logs -f

convert/.dockerignore

-7
This file was deleted.

convert/.gitignore

-3
This file was deleted.

convert/Dockerfile

-45
This file was deleted.

convert/Makefile

-17
This file was deleted.

convert/README.md

-53
This file was deleted.

convert/convert/__init__.py

Whitespace-only changes.

convert/convert/app.py

-77
This file was deleted.

convert/convert/formats.py

-36
This file was deleted.

convert/convert/lock.py

-37
This file was deleted.

0 commit comments

Comments
 (0)