diff --git a/examples/bring_your_own_pdf/upload_explore_download_pdfs.ipynb b/examples/bring_your_own_pdf/upload_explore_download_pdfs.ipynb
index bef9197..4e7772f 100644
--- a/examples/bring_your_own_pdf/upload_explore_download_pdfs.ipynb
+++ b/examples/bring_your_own_pdf/upload_explore_download_pdfs.ipynb
@@ -42,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 1,
    "id": "13560ae0",
    "metadata": {
     "execution": {
@@ -58,7 +58,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The example will be executed on the Deep Search instance None\n"
+      "The example will be executed on the Deep Search instance foc\n"
      ]
     }
    ],
@@ -94,7 +94,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 2,
    "id": "1ea3cd26",
    "metadata": {
     "execution": {
@@ -134,7 +134,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 3,
    "id": "fae4dee3",
    "metadata": {
     "execution": {
@@ -168,7 +186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 5,
    "id": "f5496626",
    "metadata": {
     "execution": {
@@ -189,7 +207,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 6,
    "id": "530fd67a",
    "metadata": {
     "execution": {
@@ -207,7 +225,7 @@
       "['SUCCESS']"
      ]
     },
-     "execution_count": 26,
+     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -221,7 +239,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 7,
    "id": "bfe83b90",
    "metadata": {
     "execution": {
@@ -236,7 +254,7 @@
    {
     "data": {
      "text/markdown": [
-       "The data is now available. You can query it programmatically (see next section) or access it via the Deep Search UI at <br />
https://pr-516-cps-dev.deepsearch-dev.zurich.ibm.com/projects/468b81ffe515a99172f93b07dd20cd50d4c19a3b/library/private/3ae2e330eb133a1bf4657d256290eece5dfc537f"
+       "The data is now available. You can query it programmatically (see next section) or access it via the Deep Search UI at <br />
https://cps.foc-deepsearch.zurich.ibm.com/projects/360a3b44d2ab0918fe10bab5b16a8c06860d2617/library/private/53095678b18cff71db15c6940479dad697e3c7fc"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
@@ -272,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 8,
    "id": "6a9a31ab",
    "metadata": {
     "execution": {
@@ -302,7 +320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 11,
    "id": "a669e5a9",
    "metadata": {
     "execution": {
@@ -317,7 +335,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "ee6a0e72d9b044cd87ae98180be10ad6",
+      "model_id": "df5b8a5191f14fee8e6c32612bc261ee",
       "version_major": 2,
       "version_minor": 0
      },
@@ -362,8 +381,8 @@
    "                \"Filename\": row[\"_source\"][\"file-info\"][\"filename\"],\n",
    "                \"Title\": metadata.get(\"title\", \"\"),\n",
    "                \"Authors\": \", \".join(\n",
-    "                    [author[\"name\"] for author in metadata.get(\"authors\", [])]\n",
-    "                ),\n",
+    "                    [author[\"name\"] for author in metadata.get(\"authors\") or []]\n",
+    "                ),\n",
    "            }\n",
    "        )\n",
    "\n",
@@ -373,7 +392,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": null,
    "id": "e997bbae",
    "metadata": {
     "execution": {
@@ -384,50 +403,7 @@
      "shell.execute_reply.started": "2024-05-27T10:47:50.724431Z"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "text/markdown": [
-       "#### Results\n",
-       "Documents matching the search query 'speedup':"
-      ],
-      "text/plain": [
-       "<IPython.core.display.Markdown object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Filename</th>\n",
-       "      <th>Title</th>\n",
-       "      <th>Authors</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>2206.00785.pdf</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Visualize the table with all results\n", "df = pd.json_normalize(all_results)\n", @@ -455,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "b7addff1-b857-463a-b82c-02ed9dd5ef5f", "metadata": { "execution": { @@ -466,81 +442,7 @@ "shell.execute_reply.started": "2024-05-27T10:47:50.739834Z" } }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ed559131f4cd45509352161c238fc0df", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "0it [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_index': 'cps-dev-deepsearch-dev-projdata3ae2e3', '_id': '6627d1b67955c51ff1aa8858de671bb5a62ad70c77e62e0ac57c153d0078b7ea', '_score': None, '_source': {'_name': '', 'type': 'pdf-document', 'description': {'logs': [{'agent': 'CPS Docling', 'type': 'parsing', 'comment': 'Docling 2.7.1 parsing of documents', 'date': '2025-01-16T13:53:04.836922+00:00'}], 'collection': {'type': 'Document'}}, 'file-info': {'filename': '2206.00785.pdf', 'document-hash': '6627d1b67955c51ff1aa8858de671bb5a62ad70c77e62e0ac57c153d0078b7ea', '#-pages': 11, 'page-hashes': [{'hash': 'dc8ab77215bdf5d1c50e4635bc886078ceb862131df9de406624cb73be373fc1', 'model': 'default', 'page': 1}, {'hash': '58474aac22030f7608c307aeb1aee7b69b8b4ca533c80ae41ab68153790d6ab0', 'model': 'default', 'page': 2}, {'hash': '1d96fb50e9a4d970cab5199fa3b76469b804c2457390c4bdda04890bc651b813', 'model': 'default', 'page': 3}, {'hash': '4dc4d130627b1f9841208469936e65cf4667f47730044ef31ecbf9664a41d706', 'model': 'default', 'page': 4}, {'hash': '4ba309b0bcae706030066723f66989d89818f00305702bbd5935d6eca9d4db6d', 'model': 'default', 'page': 5}, {'hash': '09357309f7ee2ad6efe2951fcc314bcdac7f6296b244724a78d9e9037bd071ff', 'model': 'default', 'page': 6}, {'hash': '1b8ad8508fe45a56af6aafcaeefcefa168b0b0e8530b9b0c60be1542ce001c14', 'model': 'default', 'page': 7}, {'hash': '8289d80b04962b29ea5f2967b69ca0ae43fb56535cef4fe36630c58a9edde998', 'model': 'default', 'page': 8}, {'hash': 'ba08e96ab334e61dfcde2a71203a96856d5261280e445df5f7d84d7e5974170e', 'model': 'default', 'page': 9}, {'hash': 'f90176f27b3737c29840cf46f7c23ef86f607a2edd94f4c4201e931817f7d2ee', 'model': 'default', 'page': 10}, {'hash': '5c71fa7ee304c4d871411bbb71734c39660173335fc28f955dc3ee8e044fd748', 'model': 'default', 'page': 11}]}, 'main-text': [{'prov': [{'bbox': [17.247167587280273, 236.501953125, 36.33979415893555, 572.52001953125], 'page': 1, 'span': [0, 37], '__ref_s3_data': None}], 'text': 'arXiv:2206.00785v1 [cs.DL] 1 Jun 2022', 'type': 'page-header', 'name': 'Page-header', 'font': None}, {'prov': [{'bbox': [55.040950775146484, 685.9405517578125, 556.8915405273438, 738.414794921875], 'page': 1, 'span': [0, 89], '__ref_s3_data': None}], 'text': 'Delivering Document Conversion as a Cloud Service with High Throughput and Responsiveness', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [104.35169982910156, 622.2041015625, 203.9119873046875, 670.6500244140625], 'page': 1, 'span': [0, 75], '__ref_s3_data': None}], 'text': '1 st Christoph Auer IBM Research Ruschlikon, Switzerland cau@zurich.ibm.com', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [98.21205139160156, 542.8441162109375, 213.3932647705078, 591.4330444335938], 
'page': 1, 'span': [0, 81], '__ref_s3_data': None}], 'text': '4 th Cesar Berrospi Ramis IBM Research Ruschlikon, Switzerland ceb@zurich.ibm.com', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [262.605712890625, 622.2041625976562, 362.1552734375, 671.0858154296875], 'page': 1, 'span': [0, 74], '__ref_s3_data': None}], 'text': '2 nd Michele Dolfi IBM Research Ruschlikon, Switzerland dol@zurich.ibm.com', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [263.0400390625, 542.8441772460938, 362.1445007324219, 591.435302734375], 'page': 1, 'span': [0, 77], '__ref_s3_data': None}], 'text': '5 th Peter W.J. Staar IBM Research Ruschlikon, Switzerland taa@zurich.ibm.com', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [47.97799301147461, 252.72711181640625, 300.4272766113281, 501.2793884277344], 'page': 1, 'span': [0, 1518], '__ref_s3_data': None}], 'text': 'Abstract -Document understanding is a key business process in the data-driven economy since documents are central to knowledge discovery and business insights. Converting documents into a machine-processable format is a particular challenge here due to their huge variability in formats and complex structure. Accordingly, many algorithms and machine-learning methods emerged to solve particular tasks such as Optical Character Recognition (OCR), layout analysis, table-structure recovery, figure understanding, etc. We observe the adoption of such methods in document understanding solutions offered by all major cloud providers. Yet, publications outlining how such services are designed and optimized to scale in the cloud are scarce. In this paper, we focus on the case of document conversion to illustrate the particular challenges of scaling a complex data processing pipeline with a strong reliance on machine-learning methods on cloud infrastructure. Our key objective is to achieve high scalability and responsiveness for different workload profiles in a well-defined resource budget. We outline the requirements, design, and implementation choices of our document conversion service and reflect on the challenges we faced. Evidence for the scaling behavior and resource efficiency is provided for two alternative workload distribution strategies and deployment configurations. Our best-performing method achieves sustained throughput of over one million PDF pages per hour on 3072 CPU cores across 192 nodes.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.34965896606445, 232.261962890625, 300.6181335449219, 251.952392578125], 'page': 1, 'span': [0, 103], '__ref_s3_data': None}], 'text': 'Index Terms -cloud applications, document understanding, distributed computing, artificial intelligence', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [135.26025390625, 209.32899475097656, 213.54257202148438, 218.677490234375], 'page': 1, 'span': [0, 15], '__ref_s3_data': None}], 'text': 'I. INTRODUCTION', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [48.13774871826172, 72.90673828125, 300.5203857421875, 203.037841796875], 'page': 1, 'span': [0, 668], '__ref_s3_data': None}], 'text': 'Over the past decade, many organizations have accelerated their transformation into data-driven businesses, as studies have shown its positive impact in efficiency, decision making, or financial performance [1], [2]. 
Leading companies are increasingly deploying workloads on public and private cloud infrastructure, including business intelligence processing and machine learning models in data analytics platforms [3]. This is owed to several factors such as high availability, lower cost for compute, and storage [4], as well as the flexibility to scale up or down a cloud-based business process to fit the operational needs. Workloads and services can be container-', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.1437683105469, 479.3659973144531, 563.2503662109375, 501.63055419921875], 'page': 1, 'span': [0, 108], '__ref_s3_data': None}], 'text': 'ized, deployed, and orchestrated through widely adopted and standardized platforms like Kubernetes [5], [6].', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.15472412109375, 347.8852844238281, 563.2810668945312, 477.949462890625], 'page': 1, 'span': [0, 673], '__ref_s3_data': None}], 'text': 'A key business process relevant to many companies is document understanding. Documents may constitute contracts, guidelines, manuals, presentations, papers, etc., which contain valuable knowledge for their operations. We observe that several specialized companies and all major cloud providers offer dedicated services (SaaS) for various aspects of document understanding such as Optical Character Recognition (OCR) (e.g., Amazon Textract$^{1}$), forms, and invoice parsing (Docparser$^{2}$, Nanonets$^{3}$, Google Document AI$^{4}$, Microsoft SharePoint Syntex$^{5}$), or conversion of unstructured formats such as PDF into structured content (IBM Watson Discovery$^{6}$).', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.9252014160156, 143.3106689453125, 563.5765380859375, 346.3759460449219], 'page': 1, 'span': [0, 1015], '__ref_s3_data': None}], 'text': 'Conversion of PDF documents into a structured, machineprocessable format is a particularly challenging business process due to the high variability and weak normalization of its input. To name a few dimensions of variability, PDF documents can be short or long, encode programmatic or scanned content, have simple or complex page layouts, may contain tables or figures, etc. Thus, the process of recovering their structure and extracting content in high detail entails several dynamic steps (see Fig. 1). On the computational side, this relies on multiple algorithms and machine-learning (ML) models specialized for particular tasks. Examples for such models include OCR [7], document layout analysis [8]-[10], table structure recovery [11], [12], figure understanding [13], reference and citation resolution [14], etc. 
Furthermore, the ML landscape is evolving rapidly, with new models frequently exposing significantly different characteristics in terms of computational expenses, memory usage, or accelerator re-', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [319.5043029785156, 123.1485366821289, 426.5840148925781, 131.6661376953125], 'page': 1, 'span': [0, 37], '__ref_s3_data': None}], 'text': '$^{1}$https://aws.amazon.com/textract', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [319.4458312988281, 113.28553009033203, 392.4120788574219, 121.45208740234375], 'page': 1, 'span': [0, 27], '__ref_s3_data': None}], 'text': '$^{2}$https://docparser.com', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [319.6492919921875, 103.42252349853516, 427.7674255371094, 111.71142578125], 'page': 1, 'span': [0, 38], '__ref_s3_data': None}], 'text': '$^{3}$https://nanonets.com/invoice-ocr', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [319.6199645996094, 93.4466552734375, 444.7857360839844, 101.82421875], 'page': 1, 'span': [0, 42], '__ref_s3_data': None}], 'text': '$^{4}$https://cloud.google.com/document-ai', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [319.3656311035156, 83.270263671875, 546.199951171875, 91.654296875], 'page': 1, 'span': [0, 73], '__ref_s3_data': None}], 'text': '$^{5}$https://docs.microsoft.com/en-us/microsoft-365/contentunderstanding', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [318.8615417480469, 73.23809814453125, 470.4822998046875, 81.661865234375], 'page': 1, 'span': [0, 48], '__ref_s3_data': None}], 'text': '$^{6}$https://www.ibm.com/cloud/watson-discovery', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [415.03265380859375, 622.1795654296875, 513.1004638671875, 670.93896484375], 'page': 1, 'span': [0, 72], '__ref_s3_data': None}], 'text': \"3 rd Andr'e Carvalho SoftINSA Lda. Tomar, Portugal afecarvalho@gmail.com\", 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [169.74295043945312, 539.94189453125, 441.62701416015625, 548.11376953125], 'page': 2, 'span': [0, 78], '__ref_s3_data': None}], 'text': 'Fig. 1. Sketch of operation dependency graph in a document conversion pipeline', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/0'}, {'prov': [{'bbox': [48.25772476196289, 400.16717529296875, 300.41497802734375, 517.3280029296875], 'page': 2, 'span': [0, 574], '__ref_s3_data': None}], 'text': 'quirements. Additionally, many bottlenecks hide in logical operations such as parsing the PDF code, rasterizing a PDF page to a bitmap, or serialization and database transactions. On top of that, a cloud service to solve this business task needs to be flexible enough to support different consumption modes. Some users may need to convert large document repositories in bulk, expecting high throughput, while others may want to convert a single document ad-hoc, expecting short response time. 
Satisfying both expectations in a common application architecture is non-trivial.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.10683059692383, 236.3572998046875, 300.54351806640625, 389.4085998535156], 'page': 2, 'span': [0, 760], '__ref_s3_data': None}], 'text': 'In short, a cloud service for document conversion needs to be easily adaptable, must scale with model resource demand and workload sizes, and ideally avoids strong assumptions about the consumption mode. As such, it shares many traits with cloud services for other business processes but also holds unique challenges, which make it an interesting systems engineering target. In this paper, we take the subject of document conversion as a case to illustrate the particular challenges of scaling a complex data processing pipeline with strong reliance on ML methods on cloud infrastructure. We outline the practical problems our team faced while developing a cloud service of such nature [15] and present solutions in several aspects. Our main contributions are:', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [58.926998138427734, 169.04527282714844, 300.02191162109375, 214.8209228515625], 'page': 2, 'span': [0, 177], '__ref_s3_data': None}], 'text': '- 1) We propose a validated novel approach to efficiently scale an end-to-end processing pipeline leveraging ML in a cloud environment for the use-case of PDF document conversion.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [58.82088088989258, 144.35357666015625, 300.02191162109375, 167.091552734375], 'page': 2, 'span': [0, 113], '__ref_s3_data': None}], 'text': '- 2) We reflect on the design considerations, challenges, and implementation choices of our PDF conversion service.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [58.47614288330078, 120.86260986328125, 300.0218811035156, 142.80609130859375], 'page': 2, 'span': [0, 89], '__ref_s3_data': None}], 'text': '- 3) We introduce a set of comparative metrics to evaluate the performance characteristics.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [58.079864501953125, 72.9669189453125, 300.2283630371094, 118.8519287109375], 'page': 2, 'span': [0, 205], '__ref_s3_data': None}], 'text': '- 4) We present and analyze the resulting scaling and response behavior in a benchmark setup on common cloud infrastructure for two alternative workload distribution strategies and deployment configurations.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [320.0240478515625, 495.8082580566406, 554.480712890625, 517.8453979492188], 'page': 2, 'span': [0, 68], '__ref_s3_data': None}], 'text': 'II. REQUIREMENTS AND OBJECTIVES FOR DOCUMENT CONVERSION IN THE CLOUD', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [311.1315002441406, 324.1973876953125, 563.376708984375, 489.4632873535156], 'page': 2, 'span': [0, 854], '__ref_s3_data': None}], 'text': 'As teased in the introduction, a cloud service for document conversion provides a particularly broad and challenging set of requirements, which are not trivial to satisfy all at once. Precisely this combination of real-world requirements make it an interesting target for a case-study on scaling a business process in the cloud. Broadly speaking, we can split the requirements into three categories, namely 1. the ability to process documents with a wide variety in characteristics at scale, 2. 
the ability to incorporate and consume several ML models for different tasks and 3. the ability to perform well in different user consumption modes ( bulk or ad-hoc ) which sit on opposite ends. It should be noted that only the first category is linked to the domain problem of document conversion. The latter two categories are shared by many cloud services.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.1156311035156, 73.162353515625, 563.4229736328125, 322.1588134765625], 'page': 2, 'span': [0, 1284], '__ref_s3_data': None}], 'text': 'PDF document processing: We work under the premise that a document conversion service must treat a single document as an atomic data-unit. For instance, a single output file in JavaScript Object Notation (JSON) needs to be produced for each PDF file that is submitted. Contrary to other data formats such as images or plain text, documents have an internal granularity in the shape of pages. This trait can be exploited, since many operations can be executed independently on each page. Examples of such operations are text-cell identification (either through parsing programmatic PDF code or through OCR), layout segmentation, table-structure recovery, figure classification, etc. Two critical constraints need to be recognized here. Firstly, the intra-page operations may have dependencies. For instance, table-structure recovery relies on text-cell identification and on layout segmentation to locate a table. Secondly, the results of each operation on each page needs to be ultimately merged into a single output document (see Figure 1, phase D). This merge operation ensures that the content of the document has the correct section-structure and ordering. It is therefore an invalid assumption that the conversion of a document can be simply treated as an embarrassingly parallel', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [47.93190002441406, 539.6549072265625, 300.59576416015625, 740.781494140625], 'page': 3, 'span': [0, 1025], '__ref_s3_data': None}], 'text': 'set of operations over its pages. Rather, to obtain the converted document, one has to execute a complex operation dependency graph which dynamically forks and merges on two levels (see Figure 1 phase B and C). While this provides opportunity for highly concurrent execution, it can create significant imbalance in the number and runtime of operations for a given page, due to their extremely varying complexity. Adding to this, the PDF format has inherent properties that make it very opaque . Therefore, there is no practical way to estimate the complexity and resource demand required for processing ahead of execution time. From an architecture perspective, the above is suggesting an approach which decomposes the full chain of operations and data into tasks . Such tasks can then be created and executed concurrently through an orchestration mechanism that is aware of their dependencies. We outline implementation options and examine the advantages and tradeoffs with different task granularity in Sections III and IV.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.09360885620117, 336.267822265625, 300.3430480957031, 537.7481689453125], 'page': 3, 'span': [0, 990], '__ref_s3_data': None}], 'text': 'Machine-learning models: ML models play a powerful role in the quality of document conversion. 
To date, they outcompete any rule-based algorithm in document conversion tasks and even approach human-like accuracy in particular tasks such as layout analysis [16]. From a systems perspective, ML models bring an own set of challenges. First, solving the conversion task requires many different types of ML models as outlined earlier. Each of these models has different runtime characteristics and resource requirements in terms of CPU cores, memory demand, and accelerators. Furthermore, ML models evolve at a rapid pace. Consequently, to quickly adopt state-of-the-art ML models in production, our cloud service needs to be flexible with regard to integration of new models. Encapsulation of ML models into separate microservices, each serving on an own endpoint, can provide for both resource isolation and easy integration. We will discuss choices for ML serving and scaling in Section III.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.03481674194336, 72.84271240234375, 300.5635070800781, 334.2709045410156], 'page': 3, 'span': [0, 1320], '__ref_s3_data': None}], 'text': 'User requirements: The requirements outlined above are all system-internal requirements. From a user experience perspective, there are strong requirements to be met regarding the conversion speed and the response time. Our goal is to provide a cloud service which supports both bulk-conversion of huge document repositories, as well as ad-hoc conversion of individual documents. In the former case, a key concern is to achieve the highest (sustained) document throughput and ensure it scales proportionally to the compute resources. Importantly, this should allow to set a time-budget for a large conversion workload, e.g., a full online library, and infer the compute resource cost, or conversely, estimate the conversion time with a given resource budget. In the latter case, the primary objective is to achieve the shortest time-tosolution (TTS), allowing for an interactive user experience. Here it is important to note that the smaller a workload is (e.g., one document), the more challenging it becomes to achieve proper workload balancing in a distributed processing scheme. Both goals are reflecting in the scaling behavior of the system, which we will examine in Section IV-E. Additionally, we want to enable multi-tenant operation for many users on one application instance. Therefore, the service must provide', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.06005859375, 683.1170654296875, 563.045166015625, 740.9441528320312], 'page': 3, 'span': [0, 259], '__ref_s3_data': None}], 'text': 'reasonable fairness when working at full capacity and multiple users each submit a conversion job. That means, submitted conversion jobs cannot be simply processed serially, but the service should allow to schedule multiple jobs concurrently with short delay.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.9884948730469, 563.0549926757812, 563.4898071289062, 681.1704711914062], 'page': 3, 'span': [0, 601], '__ref_s3_data': None}], 'text': 'Evaluation dimensions: In addition to functional, technical, and experience requirements, we also need to consider performance evaluation metrics, which typically fall into three dimensions: speed , resource cost , and quality . In this paper, we will not focus on conversion quality , since it is directly determined by the choice of models and their accuracy. 
Model performance for different tasks is backed by many peerreviewed results [16]. Instead, this paper will focus on speed and resource cost , and how they correlate given a fixed system configuration with selected state-of-the-art models.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [391.8093566894531, 543.4899291992188, 482.7892150878906, 552.8233642578125], 'page': 3, 'span': [0, 18], '__ref_s3_data': None}], 'text': 'III. SYSTEM DESIGN', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [311.1693420410156, 479.4771423339844, 563.0579833984375, 537.5883178710938], 'page': 3, 'span': [0, 294], '__ref_s3_data': None}], 'text': 'Having established the problem domain, requirements, and objectives, this section will outline and reflect on the fundamental application architecture of our document conversion service. Further, we establish a distributed task orchestration mechanism and discuss aspects of its implementation.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.40386962890625, 459.4205322265625, 423.4620361328125, 469.8033447265625], 'page': 3, 'span': [0, 27], '__ref_s3_data': None}], 'text': 'A. Application architecture', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [310.8529052734375, 324.1124267578125, 563.23583984375, 454.426513671875], 'page': 3, 'span': [0, 665], '__ref_s3_data': None}], 'text': 'Two fundamental decisions to be made prior to any implementation are the technology stack and architecture patterns to use. Given its clear dominance in virtually every contribution by the ML community, its proven utility in data-science tasks and the broad support in webservice frameworks [17]-[19], we chose python as the primary implementation language. Another priority we set is to work close to the bare kubernetes stack, in order to exercise lower-level control over the workload distribution, scaling, and resource creation. For this reason, we ruled out full-fledged data engines (e.g., Apache Spark [20]) and avoided proprietary or vendor-specific tools.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.9093017578125, 217.00413513183594, 563.3410034179688, 322.2803649902344], 'page': 3, 'span': [0, 478], '__ref_s3_data': None}], 'text': 'The other critical decision regards the application architecture. Here, one can find fundamentally different options, depending on the type and volume of data, their processing complexity, and the consumption modes. As outlined in section II, we expect high-volume workloads which can be partitioned horizontally and processed in parallel to a high degree. On the other hand, we want to satisfy expectations of short response times for ad-hoc conversion with small data volumes.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.9143981933594, 73.40312194824219, 563.4306640625, 214.8663330078125], 'page': 3, 'span': [0, 723], '__ref_s3_data': None}], 'text': 'For the latter case, an architecture optimized for real-time processing would appear to be a sensible choice. An example for this is a microservice-based application with a user-facing, synchronous REST API and a backend which implements the document processing through a chain of remote-procedurecalls (RPC) between these microservices. With microservice replication, backed by sufficient compute resources or elastic cloud infrastructure, responsive operation could be upheld even for several concurrent users. 
Still, this approach would certainly fail to scale to high-volume workloads such as document repositories or libraries, especially in the case where the service is used for ad-hoc conversion and bulk conversion', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.29470443725586, 658.69287109375, 300.327880859375, 740.9005126953125], 'page': 4, 'span': [0, 394], '__ref_s3_data': None}], 'text': 'at the same time. Given a bulk conversion workload in the order of hundreds of documents, the client requests would soon stack up and cause congestion, forcing the system to deny more requests. Dynamically, let alone spontaneously, scaling the microservice replication to a level which can absorb this would be extremely challenging in terms of resource management and also be cost-prohibitive.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.177391052246094, 417.8052978515625, 300.45684814453125, 489.2463684082031], 'page': 4, 'span': [0, 510], '__ref_s3_data': None}], 'text': 'Fig. 2. Architecture diagram of our conversion service. User requests for document conversion are handled by a REST API service, which can dispatch workloads asynchronously to the compute infrastructure through a queueing mechanism. Workers pick up queued tasks from a message broker and store the results for later retrieval on a document database and cloud object storage. The ML models are served through separate microservices, which are consumed by workers when executing the document conversion pipeline.', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/1'}, {'prov': [{'bbox': [47.96165084838867, 72.57586669921875, 300.37664794921875, 406.59539794921875], 'page': 4, 'span': [0, 1699], '__ref_s3_data': None}], 'text': \"For these reasons, we favored an asynchronous task-based queue-worker architecture suitable for high-volume batchprocessing, involving a task queue and containerized worker processes in the backend. Here, the user-facing API only accepts a document conversion request and enqueues a corresponding task on a message broker. Clients receive an immediate response with the task's identifier, which can be used to retrieve the conversion result once the processing is completed. The worker processes attach to a task queue, consume tasks competitively, and write status information and task output to a (transient) results-backend and a persistent database (see Figure 2). Since large conversion workloads can be partitioned horizontally, workers are not required to communicate to each other or share resources. Hence, worker processes can be run isolated in containers and trivially replicated both inside and across cluster nodes through kubernetes. Tasks produced in a conversion job can reside on the task queue or on a worker process and eventually persist their state on the results-backend. Together, this ensures that a task remains traceable over its whole lifetime. As such, a queue-worker architecture protects the system from compute congestion through request overload, promises reliability, and enables simple scaling, at the cost of potentially delayed processing. However, queue mechanisms may also add I/O overhead and latency to the system, even in idle state. Consequently, TTS in ad-hoc conversion may be handicapped. To minimize such adverse effects, we chose to use online workers, which boot at service deployment time. 
This makes their lifetime independent of any conversion job\", 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.16973876953125, 670.67041015625, 563.1245727539062, 740.8549194335938], 'page': 4, 'span': [0, 349], '__ref_s3_data': None}], 'text': 'and allows to pre-load the codebase, initialize resources, and keep internal state across tasks (such as cached connections and database drivers). Further, we chose to implement a lightweight library which enables low-latency task orchestration as well as fairness between multiple conversion jobs. Details are provided in the following subsections.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.9786071777344, 396.32208251953125, 563.35595703125, 669.229736328125], 'page': 4, 'span': [0, 1328], '__ref_s3_data': None}], 'text': 'We introduce a notable exception from the queue-worker pattern for serving ML models. Instead of executing ML inference codes inside the workers, we externalize ML models as microservices with a simple REST API. The slightly less efficient interface and added I/O compared to direct codebinding and shared memory inside a worker process has proven to be a reasonable sacrifice to the significantly lower integration effort for new models. Ease of integration is a key requirement in the fast-evolving ML model landscape. Additionally, model serving can be resource-budgeted and scaled out independently from workers. The latter is particularly relevant for resource efficiency since our ML inference is computationally far more expensive and at the same time far less dynamic in memory use than other operations (e.g., PDF splitting, parsing, and merging). Leveraging this benefit would otherwise demand several different worker types and queues (one for each model), therefore increasing the complexity in the core system. In our service implementation, we wrap ML model codes in a python aiohttp webserver runtime and rely on knative serving for scaling and load-balancing [21]. This compares well to the technology used in popular ML serving frameworks such as ray.io $^{7}$, Seldon Core $^{8}$, or TensorFlow Serving $^{9}$.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.2190856933594, 378.6492004394531, 451.3511657714844, 388.47711181640625], 'page': 4, 'span': [0, 34], '__ref_s3_data': None}], 'text': 'B. Task orchestration architecture', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [311.0847473144531, 255.86019897460938, 563.4871215820312, 373.316162109375], 'page': 4, 'span': [0, 572], '__ref_s3_data': None}], 'text': \"We define one conversion job as one (asynchronous) call to our service's API endpoint, which may request conversion of any number and length of PDF files. To organize the operations within a conversion job, we define tasks . A task is defined by a code function which performs an operation, by the data it processes, and an identifier which makes it traceable across the backend architecture. All tasks belong to a conversion job, which itself acts as the root-task. 
Tasks will be executed on the workers and cover all operations, with the exception of ML model inference.\", 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.08941650390625, 112.442138671875, 563.5247192382812, 253.7100830078125], 'page': 4, 'span': [0, 729], '__ref_s3_data': None}], 'text': 'Obviously, implementing the pipeline outlined in Figure 1 in a purely serial fashion, meaning as a single task for each conversion job, would not promise any reasonable efficiency, even if multiple jobs were processed in parallel on different workers. To use available cluster resources efficiently, we focus on two strategies in particular. First of all, we try to balance the workload distribution evenly. Therefore, we split the conversion jobs into smaller tasks, which can be processed independently on different workers. Secondly, we strive to avoid that workers are underutilizing CPU resources while waiting for I/O-bound operations, such as reading and writing files, database transactions, or network requests to the ML', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [319.5000915527344, 93.22607421875, 382.4632263183594, 102.552001953125], 'page': 4, 'span': [0, 24], '__ref_s3_data': None}], 'text': '$^{7}$https://www.ray.io', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [319.1501159667969, 83.304443359375, 509.3135986328125, 91.548828125], 'page': 4, 'span': [0, 63], '__ref_s3_data': None}], 'text': '$^{8}$https://www.seldon.io/solutions/open-source-projects/core', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [319.06390380859375, 73.2734375, 466.4934387207031, 81.6090087890625], 'page': 4, 'span': [0, 50], '__ref_s3_data': None}], 'text': '$^{9}$https://www.tensorflow.org/tfx/guide/serving', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [48.68336868286133, 718.9821166992188, 300.251708984375, 740.7255859375], 'page': 5, 'span': [0, 83], '__ref_s3_data': None}], 'text': 'models. Therefore, we enable concurrency between multiple tasks on the same worker.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.233646392822266, 346.260009765625, 300.12921142578125, 390.9906005859375], 'page': 5, 'span': [0, 395], '__ref_s3_data': None}], 'text': 'Fig. 3. Sketch of workload distribution schemes on the document-level (top) and page-level (bottom). A conversion job is submitted at time t$_{0}$. Tasks are produced and consumed by the workers through the task queue. The queue drains at time t$_{d}$. Sustained throughput is upheld until the first worker finds no more work (t$_{w}$). The job only finishes after completion of all tasks (TTS).', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/2'}, {'prov': [{'bbox': [48.275936126708984, 168.78912353515625, 300.3795166015625, 334.2211608886719], 'page': 5, 'span': [0, 800], '__ref_s3_data': None}], 'text': 'Data partitioning and workload distribution: We explore two schemes for partitioning data and forming tasks in a conversion job: a) per-document and b) in equal-sized batches of pages, across document boundaries. In the former case, each document contained in a conversion job is fully processed into the corresponding JSON output on its own subtask, which we further refer to as the document-level distribution scheme . 
In the latter case, we further increase granularity by producing tasks for equal-sized batches of pages, further referred to as page-level distribution scheme (see Fig. 3). As sketched out, we expect to see stronger workload imbalance in the document-level distribution scheme, leading to underutilization of workers in the tail part of a conversion job (from t$_{w}$ until TTS).', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.253902435302734, 72.8404541015625, 300.1933288574219, 166.87823486328125], 'page': 5, 'span': [0, 459], '__ref_s3_data': None}], 'text': 'A conversion job using the document-level distribution scheme consists of N + 1 tasks for N PDF documents, where the root task has the responsibility of iterating over the data source and submitting one task for every document to the queue. While the document-level scheme can be implemented rather straightforward without any dependencies between tasks, the page-level scheme introduces significantly more complexity. It becomes evident from Fig. 1 that some', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.8545837402344, 539.6549072265625, 563.4945068359375, 740.6552734375], 'page': 5, 'span': [0, 976], '__ref_s3_data': None}], 'text': 'stages in the pipeline require to work on the full document (e.g., merging pages into the full document JSON output and exporting to the target), while others can be run pageby page (e.g., PDF parsing and applying ML models). We therefore chose to define specialized tasks responsible for each processing stage, which depend on the results of tasks from the preceding stage. Hence, many tasks need to be produced on the fly and awaited at multiple points from other tasks. Their count depends on factors determined by the PDF content. Another consequence is that partial results from each task need to be stored intermediately for downstream tasks to be picked up. Since downstream tasks may schedule on any worker in any node, this incurs additional data transmission to and from a database or object storage. This particular requirement is not present in the document-level pipeline, because all intermediate data can remain worker-local until a document is fully converted.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.9823913574219, 323.7270812988281, 563.3364868164062, 536.9470825195312], 'page': 5, 'span': [0, 1032], '__ref_s3_data': None}], 'text': 'Task concurrency: Keeping allocated cluster resources well saturated, while also not overbooking them, is key to efficient resource usage and high throughput in document conversion. The operations in the conversion pipeline can be roughly divided into those primarily compute-bound and primarily I/Obound. Examples for the former are parsing PDF contents, rendering an image of a PDF page, applying ML models or OCR. Examples for the latter are retrieving a PDF file over the network, unpacking an archive, exporting to a target database, or storing and fetching intermediate task results. A feasible approach we found is to interleave I/O-bound and computebound operations on the same worker, while allowing only one compute-bound operation to execute at the same time. Thus, precisely saturating resources on a cluster node turns into a matter of monitoring resource usage of workers under load and adjusting the amount of worker processes deployed on each node accordingly. 
Concrete evidence of this is presented in section IV-D.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.9559020996094, 155.51666259765625, 563.2556762695312, 320.91015625], 'page': 5, 'span': [0, 825], '__ref_s3_data': None}], 'text': 'We evaluated and discarded multiple implementation options to achieve interleaving, such as threading and explicit callback-passing code. Eventually, we settled on implementing all task code in python asyncio 10 co-routines, which run in an asynchronous event loop. This cooperative concurrency scheme is more efficient compared to launching and managing threads, but it requires a different coding paradigm. Any active task arriving at code points which require to wait for I/O can choose to suspend and return control to the asyncio event loop, and resume at a later point after its waiting condition has been fulfilled. Compute-intense tasks may fill the waiting time and remain in control of execution until they are completed. As such, no second compute-intense task can compete for resources in the same worker process.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.0685729980469, 95.35310363769531, 563.1466674804688, 152.83831787109375], 'page': 5, 'span': [0, 308], '__ref_s3_data': None}], 'text': 'Dynamic task orchestration: Realising the dynamic traits of the proposed task distribution schemes prompted us to find an efficient mechanism to produce (sub-)tasks, submit these to the task queue, and wait for their completion during the pipeline execution. In the context outlined above, this demands for a', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [319.4994201660156, 73.4012451171875, 472.5478515625, 82.676025390625], 'page': 5, 'span': [0, 53], '__ref_s3_data': None}], 'text': '$^{10}$https://docs.python.org/3/library/asyncio.html', 'type': 'page-footer', 'name': 'Page-footer', 'font': None}, {'prov': [{'bbox': [48.14511489868164, 706.8296508789062, 300.26812744140625, 740.992919921875], 'page': 6, 'span': [0, 137], '__ref_s3_data': None}], 'text': 'task distribution code which can itself run inside an asyncio event-loop, such that in-worker concurrency between tasks can be leveraged.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.04045867919922, 371.0640869140625, 300.5950622558594, 704.4154052734375], 'page': 6, 'span': [0, 1685], '__ref_s3_data': None}], 'text': \"Several task distribution libraries have emerged in the community for different requirements and use-cases, for instance, ray.io, Celery$^{11}$, Dramatiq$^{12}$, or Apache Beam$^{13}$. To our surprise, none of the available options could meet our needs, since distribution and execution of asynchronous co-routines as tasks is widely unsupported. Few attempts to provide this capability have remained experimental and are poorly maintained or abandoned [22], [23]. Therefore, we decided to implement a custom library to orchestrate asynchronous tasks, which we make available as open source under the name Mognet $^{14}$. Fundamentally, it allows to create worker processes that initialize an asyncio event loop and attach to an amqp 15 message broker and a redis 16 instance for storage of every task's status. A worker process subscribes to a queue and fetches up to X tasks at once, which are then executed on the worker's event loop concurrently. To support the dynamic nature of our pipeline, we explicitly allow any task to produce (many) subtasks and await their completion. 
Subtasks are submitted to the task queue, like any task. Since all tasks are asynchronous co-routines, waiting for subtasks is a nonblocking and cheap operation. It is worth mentioning that waiting for subtasks creates a risk for deadlocks. A task may wait for subtasks forever while they cannot be scheduled on any worker, because all workers are occupied up to their limit X . To circumvent this situation, the local limit X of concurrently scheduling tasks on a worker is incremented by 1 each time a task produces any number of subtasks, and decremented by 1 again when the producing task completes.\", 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [47.953495025634766, 144.31390380859375, 300.4158630371094, 369.63275146484375], 'page': 6, 'span': [0, 1143], '__ref_s3_data': None}], 'text': 'Additionally to the orchestration capability outlined above, we exploit the property that such tasks which produce subtasks are very cheap to suspend and reactivate for another significant benefit. Instead of producing and enqueueing every possible subtask at once, the producing task may instead iteratively enqueue new subtasks only as previous subtasks finish. This allows precise control over how many subtasks are submitted from a producer task at once and therefore enables to keep the task queue short. A concrete example for this case is the root task of a conversion job, which produces one subtask for each document. Given a number of N workers, enqueueing much more than N tasks at once is to no benefit for distributed processing. Conversely, it can delay the scheduling of competing conversion jobs. Enqueueing the N +1 st document subtask only when the first document subtask finishes instead gives opportunity for better interleaving of tasks belonging to competing conversion jobs on the queue. The same principle translates to the level of pages inside one document. 
We demonstrate the effect of this strategy in section IV-F.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [56.923988342285156, 122.95361328125, 138.96153259277344, 132.4644775390625], 'page': 6, 'span': [0, 31], '__ref_s3_data': None}], 'text': '$^{11}$https://docs.celeryq.dev', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [56.82627868652344, 113.28555297851562, 122.49656677246094, 121.32855224609375], 'page': 6, 'span': [0, 26], '__ref_s3_data': None}], 'text': '$^{12}$https://dramatiq.io', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [56.90766906738281, 103.01483154296875, 139.6422119140625, 111.3055419921875], 'page': 6, 'span': [0, 30], '__ref_s3_data': None}], 'text': '$^{13}$https://beam.apache.org', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [56.93403625488281, 93.08990478515625, 198.8325958251953, 101.25537109375], 'page': 6, 'span': [0, 46], '__ref_s3_data': None}], 'text': '$^{14}$https://github.com/DS4SD/project-mognet', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [56.8766975402832, 82.97198486328125, 134.6250457763672, 91.07672119140625], 'page': 6, 'span': [0, 27], '__ref_s3_data': None}], 'text': '$^{15}$https://www.amqp.org', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [56.70504379272461, 73.50115966796875, 109.47432708740234, 81.83197021484375], 'page': 6, 'span': [0, 23], '__ref_s3_data': None}], 'text': '$^{16}$https://redis.io', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [400.010986328125, 730.9371337890625, 474.3590087890625, 740.70361328125], 'page': 6, 'span': [0, 14], '__ref_s3_data': None}], 'text': 'IV. EVALUATION', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [311.1755065917969, 667.5081787109375, 563.4439086914062, 724.8348388671875], 'page': 6, 'span': [0, 299], '__ref_s3_data': None}], 'text': 'To evaluate the real-world performance and behavior of our service, we outline our benchmark environment and characterize a representative test dataset used for conversion. In the following, measurements and observations regarding the scalability and multi-user fairness are presented and discussed.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.6365661621094, 648.5142211914062, 355.97283935546875, 658.2706909179688], 'page': 6, 'span': [0, 10], '__ref_s3_data': None}], 'text': 'A. Metrics', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [310.79632568359375, 345.7513122558594, 563.5660400390625, 643.2157592773438], 'page': 6, 'span': [0, 1525], '__ref_s3_data': None}], 'text': 'Following from the design goals, we establish several evaluation metrics to characterize the scaling behavior. The timeto-solution (TTS) for a particular conversion job is defined as the time difference between the submission of the first batch of documents and the retrieval of the last converted document. The throughput is defined as the number of processed pages per unit of time. Notice that the number of processed documents per unit of time is not a good metric as documents have a variable number of pages. The effective throughput is computed by determining the number of processed pages from the input dataset and dividing it by the TTS. 
The sustained throughput is defined as the number of processed pages from the input dataset divided by the time in which all compute resources are busy processing (from t$_{0}$ until t$_{w}$, see Fig. 3). Importantly, the effective throughput will be dependent on the data volume of the conversion job, while the sustained throughput is independent of the data volume of the conversion job. In the limit of a very large conversion job, the effective throughput will asymptotically reach the sustained throughput , as the time proportion of partially idle worker state at the end of a job will be negligible compared to the time in saturated worker state. Additionally, we observe the serial time of the benchmark conversion job. It is defined as the sum of all task runtimes in each conversion job and, in the ideal case, it is constant unless a resource bottleneck is present.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.34002685546875, 326.9412536621094, 381.1842346191406, 336.3915100097656], 'page': 6, 'span': [0, 17], '__ref_s3_data': None}], 'text': 'B. Infrastructure', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [310.9816589355469, 215.5965576171875, 563.470947265625, 320.7731018066406], 'page': 6, 'span': [0, 491], '__ref_s3_data': None}], 'text': 'For our benchmark environment, we created a managed OpenShift cluster (version 4.8, using kubernetes 1.21) hosted on IBM Cloud with default settings and we added a pool of virtualized dedicated cluster nodes. Each cluster node provides 16 CPU cores and 16 GB RAM. This choice of nodes falls well within the band of optimal resources-to-cost ratio on most cloud infrastructure providers. To enable ML model serving, we installed knative serving through the OpenShift serverless operator [21].', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.03338623046875, 155.8448486328125, 563.0993041992188, 213.60015869140625], 'page': 6, 'span': [0, 284], '__ref_s3_data': None}], 'text': 'Data blobs (i.e., source documents and conversion output) and intermediate results (e.g., model predictions) are stored on managed instances of IBM Cloud Object Storage (COS) and IBM Cloud MongoDB, respectively. All resources are hosted in an IBM Cloud location in Frankfurt, Germany.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.5181884765625, 136.80130004882812, 376.6218566894531, 146.42706298828125], 'page': 6, 'span': [0, 15], '__ref_s3_data': None}], 'text': 'C. Test Dataset', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [310.8953857421875, 72.79766845703125, 563.3804321289062, 130.986572265625], 'page': 6, 'span': [0, 291], '__ref_s3_data': None}], 'text': 'To obtain a realistic test dataset, we compiled a subset of 8053 PDF documents from arXiv (consecutive submissions 2106.00001 to 2106.08072, omitting those that do not provide a PDF). It contains 156 529 pages and measures 20.7 GB in binary size. We picked this dataset as it is available at', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.505126953125, 695.0720825195312, 300.1548767089844, 741.4332275390625], 'page': 7, 'span': [0, 252], '__ref_s3_data': None}], 'text': 'minimal cost 17 and fully reproducible. Furthermore, the arXiv is a representative dataset, as it is one of the main open-source libraries for disseminating scientific preprint articles. Detailed characteristics of the test dataset are shown in Fig. 
4.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [59.584068298339844, 552.5806884765625, 60.722347259521484, 577.9385375976562], 'page': 7, 'span': [0, 1], '__ref_s3_data': None}], 'text': '#', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [102.52969360351562, 410.676513671875, 243.02830505371094, 418.9044494628906], 'page': 7, 'span': [0, 44], '__ref_s3_data': None}], 'text': 'Fig. 4. Characteristics of the test dataset.', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/3'}, {'prov': [{'bbox': [48.54422378540039, 291.52410888671875, 300.34613037109375, 396.6930847167969], 'page': 7, 'span': [0, 519], '__ref_s3_data': None}], 'text': 'It is worth noting that we did not cherry-pick the documents in the dataset in any way to best reflect a real-world situation. We have observed that a fraction of documents reproducibly fail to convert or cause outliers in task runtimes. Three documents consistently fail to convert because of PDF interpretation issues. In another 160 documents, a total of 504 pages could not be converted due to memory constraints or rendering problems, which is a typical fraction that we also observed with other types of datasets.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [47.948448181152344, 267.3865966796875, 134.76904296875, 277.25396728515625], 'page': 7, 'span': [0, 20], '__ref_s3_data': None}], 'text': 'D. Application setup', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [48.122955322265625, 94.83416748046875, 300.5075988769531, 260.14739990234375], 'page': 7, 'span': [0, 779], '__ref_s3_data': None}], 'text': 'We evaluate two different profiles for the deployment and configuration of our system components to understand the impact of concurrent task interleaving on resource efficiency. For both profiles, we take the resource budget of a cluster node as the basis. To decide how many pod replicas of workers and ML models one can safely schedule on a cluster node without overbooking resources, we elected their average memory footprint under load as the primary criterion. Our target was to occupy 75% of the node memory, leaving 25% as overhead. This choice was made since memory, in contrast to CPU cores, cannot be oversubscribed and therefore strongly determines the required cost. Average memory footprint and CPU usage of the components, observed under load, are shown in table I.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [56.61177062988281, 73.37420654296875, 174.19342041015625, 83.37066650390625], 'page': 7, 'span': [0, 42], '__ref_s3_data': None}], 'text': '$^{17}$https://arxiv.org/help/bulk data s3', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [330.09259033203125, 713.3770751953125, 544.55517578125, 739.2872314453125], 'page': 7, 'span': [0, 98], '__ref_s3_data': None}], 'text': 'TABLE I Typical resource usage of workers and ML models under load in deployment profiles A and B.', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Table', 'type': 'table', '$ref': '#/tables/0'}, {'prov': [{'bbox': [311.1868896484375, 518.7265014648438, 563.3357543945312, 600.181884765625], 'page': 7, 'span': [0, 404], '__ref_s3_data': None}], 'text': 'In profile A, we simulate a service which is not capable of interleaving concurrent tasks in the same worker process. 
Thus, we create equal counts of worker replicas and ML model replicas, and effectively disable worker-internal task concurrency by fetching only one task at a time from the queue. We schedule 17-18 pods (8-9 each for worker and ML model) per cluster node within the given memory budget.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.173828125, 422.8310852050781, 563.3305053710938, 516.508056640625], 'page': 7, 'span': [0, 471], '__ref_s3_data': None}], 'text': 'In profile B, we intend to optimize the conversion throughput. With the prior knowledge that ML model inference is the computationally most expensive operation in the pipeline, we create worker replicas and ML model replicas at a ratio of 1:4, and allow four concurrent tasks per worker to interleave. This allows us to fit 15 pods (3 workers, 12 ML models) per cluster node, since workers now require more memory on average to interleave four tasks at a time (see table I).', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.1134948730469, 361.9398193359375, 563.2164306640625, 420.5632019042969], 'page': 7, 'span': [0, 291], '__ref_s3_data': None}], 'text': "Further, we do not enforce limits on CPU usage per worker or ML model replica through kubernetes. Worker processes are naturally bound to a single core due to Python's global interpreter lock [24]. Each replica of the ML models is configured to use only one thread to avoid CPU congestion.", 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.0853576660156, 266.850341796875, 563.2699584960938, 360.21392822265625], 'page': 7, 'span': [0, 465], '__ref_s3_data': None}], 'text': 'As for the ML models, we deliberately set up a rather expensive object-detection network (Faster R-CNN with ResNet50 backbone, ∼63M weights) [25] for layout segmentation, which we feed PDF pages rasterized to 1025 × 1025 pixels. Bare inference time using one CPU core is on the order of 3 seconds. The structure of identified tables is reconstructed by a secondary model, here using a very cheap topological algorithm with negligible resource demand.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.39276123046875, 245.30401611328125, 368.1570739746094, 254.8785400390625], 'page': 7, 'span': [0, 14], '__ref_s3_data': None}], 'text': 'E. Scalability', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [311.0267333984375, 181.4732666015625, 563.1416625976562, 238.85894775390625], 'page': 7, 'span': [0, 255], '__ref_s3_data': None}], 'text': 'To evaluate the scaling behavior of the service, we measure the runtime and throughput of exactly one conversion job at a time, repeated with increasing node pool sizes and matching application deployment scale on the benchmark environment outlined above.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [310.9446716308594, 73.07958984375, 563.3133544921875, 178.65673828125], 'page': 7, 'span': [0, 486], '__ref_s3_data': None}], 'text': 'Benchmark series and protocol: All evaluation metrics defined above are measured repeatedly in both deployment profiles after scaling the pool size to 12, 48, 96, and 192 cluster nodes (using 192, 768, 1536, and 3072 CPU cores). The scaling level of 12 cluster nodes acts as the baseline measurement to determine the speedup for the higher scales. 
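The pod packing quoted above can be sanity-checked with simple arithmetic. A minimal Python sketch, assuming the per-replica footprints from table I (450/500 MB in profile A, 700/500 MB in profile B) and the stated 75% memory target on a 16 GB node; the helper name is illustrative, not part of any API:

NODE_MEM_GB = 16
BUDGET_GB = 0.75 * NODE_MEM_GB  # 12 GB usable, 25% kept as overhead

def node_fill(workers, models, worker_gb, model_gb):
    # memory claimed by one node's worth of pod replicas
    used = workers * worker_gb + models * model_gb
    return used, used <= BUDGET_GB

# profile A: 1:1 ratio, one task per worker (450 MB worker, 500 MB model)
print(node_fill(9, 9, 0.45, 0.50))   # (8.55, True) -> 17-18 pods per node
# profile B: 1:4 ratio, four interleaved tasks (700 MB worker, 500 MB model)
print(node_fill(3, 12, 0.70, 0.50))  # (8.1, True)  -> 15 pods per node

Both configurations stay well under the 12 GB target; the remaining headroom presumably absorbs per-task memory peaks beyond the averages in table I.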
On a second dimension, the whole test series is carried out separately for both task distribution schemes (document-level and page-level).', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/4'}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/5'}, {'prov': [{'bbox': [48.3709602355957, 575.2971801757812, 563.189453125, 593.0181274414062], 'page': 8, 'span': [0, 261], '__ref_s3_data': None}], 'text': 'Fig. 5. Scaling of time-to-solution (A), effective throughput and sustained throughput on document-level distribution scheme (B), page-level distribution scheme (C) in both deployment profiles with node count. Gray diagonals indicate theoretical linear scaling.', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/6'}, {'prov': [{'bbox': [48.08693313598633, 256.1873474121094, 300.58367919921875, 552.7544555664062], 'page': 8, 'span': [0, 1457], '__ref_s3_data': None}], 'text': 'Observations in scaling behavior: Several observations can be made in the context of our scaling benchmark series, as shown in Fig. 5. First of all, we find that the TTS at the lowest scale (12 nodes) is very similar across both deployment profiles and task distribution schemes. As we scale up the number of cluster nodes, we first recognize that both distribution schemes achieve reasonable speedup in TTS. However, the TTS in the document-level distribution scheme beats the TTS in the page-level distribution scheme by a growing margin at increasing scales. This observation is also mirrored in the effective throughput. At the largest scale (192 nodes), the achieved speedup in TTS starts to tail off to different degrees in both schemes, irrespective of the deployment profile. Our test dataset converts within 17 minutes in the best case (document-level scheme, profile B) and 25 minutes in the worst case (page-level scheme, profile A). Contrary to our theoretical considerations, the loss in scaling efficiency affects the page-level scheme much more strongly than the document-level scheme. Deeper investigation revealed that two different causes are responsible for the observed loss in scaling efficiency. In the document-level distribution scheme, long documents increasingly dominate the TTS of the full conversion job, some of which take over 14 minutes to convert alone (see Fig. 6). Thus, TTS would be lower-bounded at 14 minutes even with infinite resources.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.17839431762695, 72.85797119140625, 300.47991943359375, 250.44354248046875], 'page': 8, 'span': [0, 907], '__ref_s3_data': None}], 'text': "In the page-level distribution scheme, despite the fine-grained workload balancing achieved through equal-sized batches of pages, we find that the amount of I/O created by storing and loading intermediate task results creates a scaling bottleneck. Our MongoDB instance cannot keep up with the concurrent transaction load. This bottleneck is completely absent in the document-level distribution scheme since its implementation creates no database transactions at all. Further, interleaving tasks on the same worker (profile B) does not demonstrate significant benefit to TTS or effective throughput over the non-interleaving setup (profile A) in either of the distribution schemes. A different picture emerges when looking at the sustained throughput. 
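The metrics behind these observations (TTS, effective and sustained throughput, serial time, speedup) can be made concrete with a small sketch based on the definitions quoted earlier; the record layout and the approximation of the saturated window are illustrative assumptions, not the paper's code:

def job_metrics(tasks, t0, t_w):
    """tasks: (start, end, pages) per task; t0: job submission time;
    t_w: first moment a worker finds no more work. Illustrative only."""
    tts = max(end for _, end, _ in tasks) - t0            # time-to-solution
    pages = sum(p for _, _, p in tasks)
    effective = pages / tts                               # volume-dependent
    # approximate sustained throughput by the pages finished while all
    # workers were still busy (t0 .. t_w)
    saturated_pages = sum(p for _, end, p in tasks if end <= t_w)
    sustained = saturated_pages / (t_w - t0)              # volume-independent
    serial = sum(end - start for start, end, _ in tasks)  # sum of task runtimes
    return tts, effective, sustained, serial

# speedup at a given scale is measured against the 12-node baseline:
def speedup(tts_baseline, tts_scaled):
    return tts_baseline / tts_scaled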
In the document-level distribution scheme, we even observe slightly better-than-linear scaling, clearly distinct from the effective throughput.", 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.075439453125, 387.6934509277344, 563.6383666992188, 552.6748657226562], 'page': 8, 'span': [0, 853], '__ref_s3_data': None}], 'text': 'At first glance, this artifact may seem questionable. However, deeper verification reveals two contributing factors. The first is due to small imperfections in the load-balancing of requests to our ML model replicas. The fewer ML model replicas exist, the more pronounced is the impact of an unbalanced distribution of requests to them. Relatively speaking, a larger fraction of requests will be subject to additional wait times for the response. The second factor is a peculiarity of our worker implementation. To regularly clean up the excess memory allocated (but not used) in the Python runtime, worker pods reboot after every 64 completed (document-level) tasks. This causes a delay of approximately 10 seconds each time. In each conversion job, such reboots occur more frequently when fewer workers need to consume more tasks. Therefore, we', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.0782470703125, 73.94544982910156, 563.5233154296875, 126.7979736328125], 'page': 8, 'span': [0, 381], '__ref_s3_data': None}], 'text': 'Fig. 6. Active tasks over job runtime (top) and time schedule in 70 of 834 workers (bottom), using document-level scheme, profile A on 96 nodes. Each red bar marks the runtime of one task, equal to one document. The job completes only 14 minutes after the first worker finds no more work due to few, long documents in the tail. An anomaly around 7 min is caused by worker restarts.', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/7'}, {'prov': [{'bbox': [48.542991638183594, 718.9821166992188, 300.4215087890625, 740.6807250976562], 'page': 9, 'span': [0, 69], '__ref_s3_data': None}], 'text': 'see sustained throughput improve over-proportionally at higher scales.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.19097900390625, 551.491943359375, 300.4952697753906, 717.0626831054688], 'page': 9, 'span': [0, 812], '__ref_s3_data': None}], 'text': 'The significant difference between sustained and effective throughput in the document-level distribution scheme is clearly explained by the workload imbalance between workers, as shown in Fig. 6. Moreover, we find that deployment profile B, which exploits in-worker task concurrency, achieves consistently higher sustained throughput than profile A with the same total amount of resources. At the highest scale of 192 nodes, sustained throughput of 296 pages/second is achieved in profile B, compared to 220 pages/second in profile A. In the page-level distribution scheme, effective and sustained throughput remain much closer together and follow similar curves across scale levels. While this indicates better workload balancing, both metrics are equally impacted by the I/O bottleneck described before.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.219970703125, 360.1738586425781, 300.2220458984375, 376.9077453613281], 'page': 9, 'span': [0, 107], '__ref_s3_data': None}], 'text': 'Fig. 7. 
Measured serial times of operations in the document-level pipeline, profile A, across all scales.', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/8'}, {'prov': [{'bbox': [48.07781219482422, 204.2110595703125, 300.3043212890625, 345.63861083984375], 'page': 9, 'span': [0, 706], '__ref_s3_data': None}], 'text': 'An important context for the interpretation of the observations above is provided by the serial time, differentiated by processing stage (see Fig. 7). We see consistent serial times on every tested scaling level in the document pipeline, which confirms that no resource congestion or bandwidth limitation is present. In the page-level pipeline (not shown), serial times, in particular for upload and merge operations, increase noticeably at the largest scale. Finally, we clearly recognize that applying ML models is the most time-consuming operation, responsible for approximately 76% (page-level scheme) to 84% (document-level scheme) of the task runtimes in our test conversion job.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.22081756591797, 184.77928161621094, 95.30005645751953, 193.93328857421875], 'page': 9, 'span': [0, 11], '__ref_s3_data': None}], 'text': 'F. Fairness', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [47.95774459838867, 73.27685546875, 300.39031982421875, 178.79351806640625], 'page': 9, 'span': [0, 536], '__ref_s3_data': None}], 'text': 'To understand both how fast and how fair our service handles multiple concurrent conversion jobs, we analyze an extreme case. While saturating our service with the conversion of our benchmark dataset, we submit a second conversion job with a single average-length document (15 pages) one, two, and three minutes later. We measure the TTS of the single-document job repeatedly and compare it to the TTS of the same job submitted on an idle system. This experiment is carried out at a scale of 48 nodes and repeated for both', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [325.976318359375, 713.4354858398438, 548.9337768554688, 730.4822387695312], 'page': 9, 'span': [0, 113], '__ref_s3_data': None}], 'text': 'TTS for a 15-page document in idle and busy system states. Values represent mean ± standard deviation in seconds.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [420.9889831542969, 731.3684692382812, 454.0250244140625, 738.4937744140625], 'page': 9, 'span': [0, 8], '__ref_s3_data': None}], 'text': 'TABLE II', 'type': 'caption', 'name': 'Caption', 'font': None}, {'name': 'Table', 'type': 'table', '$ref': '#/tables/1'}, {'prov': [{'bbox': [311.104248046875, 573.2300415039062, 563.035400390625, 594.8715209960938], 'page': 9, 'span': [0, 81], '__ref_s3_data': None}], 'text': 'deployment profiles and task distribution schemes. Results are shown in table II.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.1533508300781, 496.09405517578125, 563.3656616210938, 565.8314208964062], 'page': 9, 'span': [0, 295], '__ref_s3_data': None}], 'text': 'The single document converts with a reproducible TTS of 33s (document-level scheme) or 26s (page-level scheme). 
On a busy system, this increases by a factor of 3.5 (document-level scheme) or 7.5 (page-level scheme) on average, with a substantial standard deviation between 30% and 45% of the TTS.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.07275390625, 371.1371765136719, 563.5538330078125, 488.30364990234375], 'page': 9, 'span': [0, 609], '__ref_s3_data': None}], 'text': 'One can draw multiple conclusions from these results. First, we see good evidence that our strategy to keep the length of the task queue short (see section III-B) pays off. Concurrent conversion workloads of very different volume are processed in a reasonably fair manner, all while avoiding the complexity of job priorities or separate queues. Still, the TTS on a busy system is significantly higher in the page-level distribution scheme, since the single-document conversion job produces a total of 12 tasks, which pass through the task queue, compared to only one in the document-level distribution scheme.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.04754638671875, 138.58425903320312, 563.3732299804688, 363.76458740234375], 'page': 9, 'span': [0, 1077], '__ref_s3_data': None}], 'text': 'Second, we find that the page-level distribution scheme on this 15-page document yields 20% lower TTS (1.25x speedup) than the document-level scheme in an idle system. Disregarding all non-parallelizable operations (upload, merge, and export), the theoretical speedup would be bounded at 4x for 15 pages, because we chose to configure our system to create tasks for batches of up to four pages. Dynamically deciding the optimum page-batch size based on the page count could improve this aspect. In reality, we must consider two factors: On one hand, profiling the page-level distribution scheme reveals that approximately one quarter of the total conversion job runtime is spent in the non-parallelizable operations for this document. On the other hand, the page-level distribution pays a high price in additional I/O, as shown earlier. In the observed case, it requires 12 distinct store and retrieve operations, while the document-level scheme requires only two. As such, TTS in ad-hoc conversion can profit from the page-level distribution scheme only with longer documents.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [311.07318115234375, 73.40324401855469, 563.1641845703125, 130.89666748046875], 'page': 9, 'span': [0, 244], '__ref_s3_data': None}], 'text': 'Third, we find that the deployment profile (A or B) has no relevant impact on the measurements above. Processing 15 pages only utilizes a fraction of the available worker and ML model replicas here, which explains why no difference is observed.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [135.48716735839844, 730.9381103515625, 212.7260284423828, 740.87451171875], 'page': 10, 'span': [0, 14], '__ref_s3_data': None}], 'text': 'V. CONCLUSIONS', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [48.13857650756836, 618.2142333984375, 300.3543701171875, 723.6151733398438], 'page': 10, 'span': [0, 497], '__ref_s3_data': None}], 'text': 'We have carefully established a scalable cloud service design for the task of document conversion and presented the impact of different workload distribution schemes and implementation options on both absolute speed and scaling behavior. 
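The speedup bounds in the 15-page discussion above follow from simple arithmetic; a hedged Amdahl-style reading, using the stated batch size of four pages and the profiled ~25% non-parallelizable share (both numbers taken from the text above, the calculation itself being our illustration):

import math

pages, batch_size = 15, 4
parallel_tasks = math.ceil(pages / batch_size)  # 4 page-batch tasks
serial_share = 0.25  # profiled share of upload/merge/export for this document

# bound when serial operations are disregarded, as stated in the text:
print(parallel_tasks)  # 4

# Amdahl-style bound once the serial share is accounted for:
print(1 / (serial_share + (1 - serial_share) / parallel_tasks))  # ~2.29

The observed 1.25x speedup sits below both bounds, consistent with the additional I/O cost the page-level scheme pays.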
With the best-performing setup, we achieve a sustained conversion throughput scaling linearly with the resource budget up to 192 nodes (equal to 3072 CPU cores and 3072 GB of memory). In practice, this allows us to convert over one million PDF pages per hour.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.06001281738281, 498.42889404296875, 300.3832702636719, 615.6994018554688], 'page': 10, 'span': [0, 564], '__ref_s3_data': None}], 'text': 'An important lesson we learned is that distributing the workload more evenly at the page level cannot compensate for the excess I/O expense and synchronization need it creates over document-level distribution. In terms of sustained throughput, it ultimately runs into a scaling bottleneck. More efficient intermediate data storage and locality may push this bottleneck further up the scale, but it will eventually prevail. We find good potential in using page-level distribution, however, to achieve short response time in ad-hoc conversion of a few documents.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.03425979614258, 330.3664855957031, 300.15289306640625, 495.8798522949219], 'page': 10, 'span': [0, 817], '__ref_s3_data': None}], 'text': 'To drive a production-grade deployment of our conversion service, we need to consider some important differences from the benchmark setup presented here. For cost reasons, it is highly advisable to rely on automatic scaling mechanisms. The ML model microservices can be easily transferred to a managed platform-as-a-service where the replicas can be dynamically scaled according to the request load without preallocating cluster infrastructure (e.g., Google App Engine$^{18}$, IBM Cloud Code Engine$^{19}$, or Amazon ECS$^{20}$). Additionally, worker pods and even cluster nodes may be automatically added and removed based on the length of the task queue. In deployments for open audiences, we enforce strict timeouts on tasks to ensure that very heavy or even poisonous PDF files do not lock up resources forever.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [47.99211883544922, 139.13909912109375, 300.3812255859375, 328.5460510253906], 'page': 10, 'span': [0, 923], '__ref_s3_data': None}], 'text': 'The investment in implementing all pipeline logic in asynchronous co-routines and developing a task distribution library which works natively within an event loop has proven worthwhile. Distributing and executing code with dynamic subtasks and dependencies becomes very cheap, and it helps to achieve fairness in multi-tenant usage with little effort and complexity. Additionally, it allows us to write idiomatic code. It should be noted, however, that the efficiency advantage of interleaving tasks as proposed here depends significantly both on the amount of opportunity to avoid blocking I/O and on the proportion of compute-bound code executing in the workers. Externalizing the ML models as web services has proven to be a sensible choice in this regard. Considering the large fraction of total compute-time spent in ML inference, investing effort into cheaper ML models will remain a top priority in our ongoing work.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [48.339088439941406, 114.763427734375, 300.40673828125, 136.94744873046875], 'page': 10, 'span': [0, 119], '__ref_s3_data': None}], 'text': 'In the landscape of data-driven business, several applications share important properties with document conversion. 
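The co-routine-based interleaving credited above can be illustrated with a short, self-contained asyncio sketch; the queue handling, the four-task limit from profile B, and the stubbed model call are illustrative assumptions, not the paper's task distribution library:

import asyncio, random

MAX_INTERLEAVED = 4  # profile B: up to four concurrent tasks per worker

async def call_ml_model(page):
    # stub for the externalized ML model web service; awaiting it is what
    # lets a worker interleave other tasks instead of blocking on I/O
    await asyncio.sleep(random.uniform(0.05, 0.15))
    return f"layout({page})"

async def convert(doc):
    # compute-bound parsing would run in-line here; model calls dominate
    return [await call_ml_model(p) for p in doc["pages"]]

async def worker(queue):
    sem = asyncio.Semaphore(MAX_INTERLEAVED)
    async def run(doc):
        async with sem:
            await convert(doc)
        queue.task_done()
    while True:
        asyncio.create_task(run(await queue.get()))

async def main():
    queue = asyncio.Queue()
    for i in range(8):
        queue.put_nowait({"id": i, "pages": [f"{i}.{n}" for n in range(3)]})
    wtask = asyncio.create_task(worker(queue))
    await queue.join()  # returns once all eight documents are converted
    wtask.cancel()

asyncio.run(main())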
One', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [56.891944885253906, 92.89056396484375, 177.87632751464844, 102.00689697265625], 'page': 10, 'span': [0, 41], '__ref_s3_data': None}], 'text': '$^{18}$https://cloud.google.com/appengine', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [56.821067810058594, 83.01531982421875, 193.50592041015625, 91.6480712890625], 'page': 10, 'span': [0, 44], '__ref_s3_data': None}], 'text': '$^{19}$https://www.ibm.com/cloud/code-engine', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [56.264488220214844, 73.61920166015625, 152.8022003173828, 81.8521728515625], 'page': 10, 'span': [0, 33], '__ref_s3_data': None}], 'text': '$^{20}$https://aws.amazon.com/ecs', 'type': 'footnote', 'name': 'Footnote', 'font': None}, {'prov': [{'bbox': [310.8520812988281, 599.430908203125, 563.4625854492188, 740.8935546875], 'page': 10, 'span': [0, 671], '__ref_s3_data': None}], 'text': 'such case is automated knowledge-base construction, where several data processing pipelines have been developed for extraction, normalization, and analysis of knowledge [26], often targeting specific domains like material sciences, geology, or biomedicine [27]-[31]. They commonly process complex or unstructured input data in bulk (e.g., literature, experiment reports, and instrument data), employ machine learning (e.g., language understanding), and often require complex dynamic pipelines. As such, the insights presented and conclusions drawn in this paper would be of value for the design and validation of future work on scalable processing pipelines in the cloud.', 'type': 'paragraph', 'name': 'Text', 'font': None}, {'prov': [{'bbox': [409.0566101074219, 577.389892578125, 465.4123840332031, 586.37548828125], 'page': 10, 'span': [0, 10], '__ref_s3_data': None}], 'text': 'REFERENCES', 'type': 'subtitle-level-1', 'name': 'Section-header', 'font': None}, {'prov': [{'bbox': [315.4861145019531, 515.8212890625, 563.2164306640625, 569.1727905273438], 'page': 10, 'span': [0, 364], '__ref_s3_data': None}], 'text': '- [1] C. Gopal, C. L. Marshall, D. Vesset, N. Ward-Dutton, J. Hamel, R. Jyoti, P. Rutten, C. W. Olofson, J. Rydning, S. Rau, and J. Duke, "IDC FutureScape: Worldwide future of intelligence 2022 predictions," International Data Group, Inc., Needham, MA, Research Report US47913321, Oct. 2021. [Online]. Available: https://www.idc.com/getdoc.jsp?containerId=US47913321', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [315.63177490234375, 470.57928466796875, 563.3629150390625, 515.1875610351562], 'page': 10, 'span': [0, 285], '__ref_s3_data': None}], 'text': '- [2] D. Vile, "The road to becoming a data driven business," Freeform Dynamics Ltd., New Milton, United Kingdom, Research Report US47913321, Nov. 2020. [Online]. Available: https://www.freeformdynamics.com/wp-content/uploads/2020/11/2020-The_road_to_becoming_a_data_driven_business.pdf', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [315.6318664550781, 452.4910888671875, 563.1107177734375, 470.129638671875], 'page': 10, 'span': [0, 119], '__ref_s3_data': None}], 'text': '- [3] M. Aslett and N. Patience, "Data platforms market map 2021," S&P Global Market Intelligence, Tech. Rep., Sep. 
2021.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [315.5053405761719, 416.533447265625, 563.2227172851562, 451.75634765625], 'page': 10, 'span': [0, 231], '__ref_s3_data': None}], 'text': '- [4] G. Aggarwal. (2021, Jan.) How the pandemic has accelerated cloud adoption. Forbes. Jersey City, NJ. [Online]. Available: https://www.forbes.com/sites/forbestechcouncil/2021/01/15/how-the-pandemic-has-accelerated-cloud-adoption', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [315.3817443847656, 389.4144287109375, 563.0360717773438, 415.70733642578125], 'page': 10, 'span': [0, 200], '__ref_s3_data': None}], 'text': '- [5] "Enterprise survey series: DevOps and the cloud," Evans Data Corporation, Santa Cruz, CA, Research Report, Aug. 2021. [Online]. Available: https://evansdata.com/reports/viewRelease.php?reportID=45', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [315.73175048828125, 361.80242919921875, 563.03662109375, 388.3616027832031], 'page': 10, 'span': [0, 179], '__ref_s3_data': None}], 'text': "- [6] J. Arundel and J. Domingus, Cloud Native DevOps with Kubernetes: Building, Deploying, and Scaling Modern Applications in the Cloud. Sebastopol, CA: O'Reilly Media, Apr. 2019.", 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [315.62811279296875, 335.11053466796875, 563.2806396484375, 360.9382629394531], 'page': 10, 'span': [0, 175], '__ref_s3_data': None}], 'text': '- [7] H. Lin, P. Yang, and F. Zhang, "Review of scene text detection and recognition," Archives of computational methods in engineering, vol. 27, no. 2, pp. 433-454, Apr. 2020.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [315.67388916015625, 289.9249267578125, 563.1982421875, 334.34613037109375], 'page': 10, 'span': [0, 330], '__ref_s3_data': None}], 'text': '- [8] Y. Xu, M. Li, L. Cui, S. Huang, F. Wei, and M. Zhou, "Layoutlm: Pre-training of text and layout for document image understanding," in Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, Aug. 2020, pp. 1192-1200. [Online]. Available: https://dl.acm.org/doi/10.1145/3394486.3403172', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [315.701904296875, 244.821533203125, 563.0390014648438, 289.1856994628906], 'page': 10, 'span': [0, 311], '__ref_s3_data': None}], 'text': '- [9] B. Pfitzmann, C. Auer, M. Dolfi, A. Nassar, and P. W. Staar, "Doclaynet: A large human-annotated dataset for document-layout analysis," in Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery & Data Mining, 2022, to be published. [Online]. Available: https://doi.org/10.1145/3534678.3539043', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [311.4954833984375, 190.67242431640625, 563.2008666992188, 243.8887939453125], 'page': 10, 'span': [0, 378], '__ref_s3_data': None}], 'text': '- [10] N. Livathinos, C. Berrospi, M. Lysak, V. Kuropiatnyk, A. Nassar, A. Carvalho, M. Dolfi, C. Auer, K. Dinkla, and P. Staar, "Robust pdf document conversion using recurrent neural networks," Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, no. 17, pp. 15 137-15 145, May 2021. [Online]. 
Available: https://ojs.aaai.org/index.php/AAAI/article/view/17777', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [311.4048767089844, 146.0043182373047, 563.1834106445312, 189.85186767578125], 'page': 10, 'span': [0, 297], '__ref_s3_data': None}], 'text': '- [11] X. Zheng, D. Burdick, L. Popa, X. Zhong, and N. X. R. Wang, "Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context," in Proceedings of the IEEE/CVF winter conference on applications of computer vision, 2021, pp. 697-706.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [311.75091552734375, 109.6461181640625, 563.2018432617188, 144.9952392578125], 'page': 10, 'span': [0, 255], '__ref_s3_data': None}], 'text': '- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar, "Tableformer: Table structure understanding with transformers," IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Jun. 2022. [Online]. Available: https://arxiv.org/abs/2203.01017', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [311.4164733886719, 73.83430480957031, 564.18408203125, 108.98297119140625], 'page': 10, 'span': [0, 223], '__ref_s3_data': None}], 'text': '- [13] N. Siegel, N. Lourie, R. Power, and W. Ammar, "Extracting scientific figures with distantly supervised neural networks," in Proceedings of the 18th ACM/IEEE on joint conference on digital libraries, 2018, pp. 223-232.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.361778259277344, 713.4354858398438, 300.5360412597656, 739.5120849609375], 'page': 11, 'span': [0, 192], '__ref_s3_data': None}], 'text': '- [14] D. Duma, M. Liakata, A. Clare, J. Ravenscroft, and E. Klein, "Rhetorical classification of anchor text for citation recommendation," D-Lib Magazine, vol. 22, no. 9/10, 2016.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.49642562866211, 659.49267578125, 300.3289794921875, 712.3699340820312], 'page': 11, 'span': [0, 396], '__ref_s3_data': None}], 'text': '- [15] P. W. J. Staar, M. Dolfi, C. Auer, and C. Bekas, "Corpus conversion service: A machine learning platform to ingest documents at scale," in Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, ser. KDD \\'18. New York, NY, USA: Association for Computing Machinery, Jul. 2018, pp. 774-782. [Online]. Available: https://doi.org/10.1145/3219819.3219834', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.4224853515625, 632.6595458984375, 300.2565612792969, 658.837890625], 'page': 11, 'span': [0, 157], '__ref_s3_data': None}], 'text': '- [16] G. M. Binmakhashen and S. A. Mahmoud, "Document layout analysis: a comprehensive survey," ACM Computing Surveys (CSUR), vol. 52, no. 6, pp. 1-36, 2019.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.28340530395508, 614.7494506835938, 300.21734619140625, 631.591064453125], 'page': 11, 'span': [0, 143], '__ref_s3_data': None}], 'text': '- [17] (2021, Dec.) Top computer languages. StatisticsTimes.com. [Online]. Available: https://statisticstimes.com/tech/top-computer-languages.php', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.29648208618164, 577.9435424804688, 300.60833740234375, 613.808349609375], 'page': 11, 'span': [0, 227], '__ref_s3_data': None}], 'text': "- [18] V. Kumar. (2019, Sep.) 
Python vs R: What's best for machine learning. Towards Data Science Inc. Toronto, Canada. [Online]. Available: https://towardsdatascience.com/python-vs-r-whats-best-for-machine-learning-93432084b480", 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.440486907958984, 551.7584228515625, 300.3661193847656, 577.9423217773438], 'page': 11, 'span': [0, 175], '__ref_s3_data': None}], 'text': '- [19] G. Aggarwal. (2021) The state of developer ecosystem 2021. JetBrains. Prague, Czech Republic. [Online]. Available: https://www.jetbrains.com/lp/devecosystem-2021/python/', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.526912689208984, 525.1416015625, 300.47991943359375, 550.9553833007812], 'page': 11, 'span': [0, 197], '__ref_s3_data': None}], 'text': '- [20] M. Zaharia, M. Chowdhury, M. J. Franklin, S. Shenker, and I. Stoica, "Spark: Cluster computing with working sets," in 2nd USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 10), 2010.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.384796142578125, 498.01690673828125, 300.4748840332031, 524.2509765625], 'page': 11, 'span': [0, 180], '__ref_s3_data': None}], 'text': '- [21] (2022) About openshift serverless. Red Hat, Inc. Raleigh, NC. [Online]. Available: https://docs.openshift.com/container-platform/4.8/serverless/discover/about-serverless.html', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.64189147949219, 480.0333557128906, 300.0221862792969, 496.8435363769531], 'page': 11, 'span': [0, 140], '__ref_s3_data': None}], 'text': '- [22] D. Garcia. (2017, Mar.) aiotasks. Next Technology Professionals. Warsaw, Poland. [Online]. Available: https://github.com/cr0hn/aiotasks', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.587677001953125, 462.358642578125, 300.02215576171875, 479.4700012207031], 'page': 11, 'span': [0, 120], '__ref_s3_data': None}], 'text': '- [23] A. Kovalevich. (2020, May) Celery pool asyncio. [Online]. Available: https://github.com/kai3341/celery-pool-asyncio', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.62297058105469, 444.0994567871094, 300.0257263183594, 461.73748779296875], 'page': 11, 'span': [0, 101], '__ref_s3_data': None}], 'text': '- [24] D. Beazley, "Understanding the python gil," in PyCON Python Conference. Atlanta, Georgia, 2010.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.359004974365234, 417.5455627441406, 300.7257385253906, 443.3598937988281], 'page': 11, 'span': [0, 158], '__ref_s3_data': None}], 'text': '- [25] G. Aggarwal. (2021, Jul.) Detectron2 model zoo and baselines. [Online]. Available: https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.410465240478516, 390.0207824707031, 300.456298828125, 416.18792724609375], 'page': 11, 'span': [0, 196], '__ref_s3_data': None}], 'text': '- [26] P. W. Staar, M. Dolfi, and C. Auer, "Corpus processing service: A knowledge graph platform to perform deep data exploration on corpora," Applied AI Letters, vol. 1, no. 2, p. e20, Dec. 2020.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.504295349121094, 345.8144226074219, 300.5445556640625, 389.45355224609375], 'page': 11, 'span': [0, 314], '__ref_s3_data': None}], 'text': '- [27] T. J. Jacobsson, A. Hultqvist, A. Garc\\'ıa-Fern\\'andez, A. 
Anand, A. AlAshouri, A. Hagfeldt, A. Crovetto, A. Abate, A. G. Ricciardulli, A. Vijayan et al., "An open-access database and analysis tool for perovskite solar cells based on the FAIR data principles," Nature Energy, vol. 7, no. 1, pp. 107-115, 2022.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.52201461791992, 309.53143310546875, 300.3795166015625, 344.7828063964844], 'page': 11, 'span': [0, 298], '__ref_s3_data': None}], 'text': '- [28] E. O. Pyzer-Knapp, J. W. Pitera, P. W. Staar, S. Takeda, T. Laino, D. P. Sanders, J. Sexton, J. R. Smith, and A. Curioni, "Accelerating materials discovery using artificial intelligence, high performance computing and robotics," npj Computational Materials, vol. 8, no. 1, pp. 1-9, Apr. 2022.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.547607421875, 274.017822265625, 300.5572814941406, 308.9217224121094], 'page': 11, 'span': [0, 261], '__ref_s3_data': None}], 'text': '- [29] M. Piantanida, E. Bonamini, C. Caborni, F. Bergero, P. Staar, M. Dolfi, C. Auer, and V. Saturnino, "Using knowledge graphs to navigate through geological concepts extracted from documents," in OMC Med Energy Conference and Exhibition. OnePetro, Sep. 2021.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.598052978515625, 238.21734619140625, 300.70361328125, 273.0455322265625], 'page': 11, 'span': [0, 231], '__ref_s3_data': None}], 'text': '- [30] V. D. Badal, D. Wright, Y. Katsis, H.-C. Kim, A. D. Swafford, R. Knight, and C.-N. Hsu, "Challenges in the construction of knowledge bases for human microbiome-disease associations," Microbiome, vol. 7, no. 1, pp. 1-15, 2019.', 'type': 'paragraph', 'name': 'List-item', 'font': None}, {'prov': [{'bbox': [48.57107162475586, 193.02166748046875, 300.49127197265625, 237.1055908203125], 'page': 11, 'span': [0, 321], '__ref_s3_data': None}], 'text': '- [31] M. Manica, C. Auer, V. Weber, F. Zipoli, M. Dolfi, P. W. J. Staar, T. Laino, C. Bekas, A. Fujita, H. Toda, S. Hirose, and Y. Orii, "An information extraction and knowledge graph platform for accelerating biochemical discoveries," CoRR, vol. abs/1907.08400, 2019. [Online]. Available: http://arxiv.org/abs/1907.08400', 'type': 'paragraph', 'name': 'List-item', 'font': None}], 'figures': [{'prov': [{'bbox': [50.88604736328125, 558.77099609375, 561.7911376953125, 742.2131958007812], 'page': 2, 'span': [0, 78], '__ref_s3_data': None}], 'text': 'Fig. 1. Sketch of operation dependency graph in a document conversion pipeline', 'type': 'figure', 'bounding-box': None}, {'prov': [{'bbox': [48.31047821044922, 502.1199951171875, 299.5975341796875, 643.0848388671875], 'page': 4, 'span': [0, 510], '__ref_s3_data': None}], 'text': 'Fig. 2. Architecture diagram of our conversion service. User requests for document conversion are handled by a REST API service, which can dispatch workloads asynchronously to the compute infrastructure through a queueing mechanism. Workers pick up queued tasks from a message broker and store the results for later retrieval on a document database and cloud object storage. The ML models are served through separate microservices, which are consumed by workers when executing the document conversion pipeline.', 'type': 'figure', 'bounding-box': None}, {'prov': [{'bbox': [47.52158737182617, 404.80792236328125, 300.072998046875, 710.0579223632812], 'page': 5, 'span': [0, 395], '__ref_s3_data': None}], 'text': 'Fig. 3. 
Sketch of workload distribution schemes on the document-level (top) and page-level (bottom). A conversion job is submitted at time t$_{0}$. Tasks are produced and consumed by the workers through the task queue. The queue drains at time t$_{d}$. Sustained throughput is upheld until the first worker finds no more work (t$_{w}$). The job only finishes after completion of all tasks (TTS).', 'type': 'figure', 'bounding-box': None}, {'prov': [{'bbox': [61.860626220703125, 431.9028015136719, 288.02362060546875, 679.3151245117188], 'page': 7, 'span': [0, 44], '__ref_s3_data': None}], 'text': 'Fig. 4. Characteristics of the test dataset.', 'type': 'figure', 'bounding-box': None}, {'prov': [{'bbox': [48.7349739074707, 604.4910278320312, 214.92897033691406, 741.4573974609375], 'page': 8, 'span': [0, 0], '__ref_s3_data': None}], 'text': '', 'type': 'figure', 'bounding-box': None}, {'prov': [{'bbox': [220.35333251953125, 604.5574951171875, 384.72698974609375, 741.0494384765625], 'page': 8, 'span': [0, 0], '__ref_s3_data': None}], 'text': '', 'type': 'figure', 'bounding-box': None}, {'prov': [{'bbox': [391.38787841796875, 604.5574951171875, 556.16259765625, 740.4262084960938], 'page': 8, 'span': [0, 261], '__ref_s3_data': None}], 'text': 'Fig. 5. Scaling of time-to-solution (A), effective throughput and sustained throughput on document-level distribution scheme (B), page-level distribution scheme (C) in both deployment profiles with node count. Gray diagonals indicate theoretical linear scaling.', 'type': 'figure', 'bounding-box': None}, {'prov': [{'bbox': [338.3061218261719, 138.53965759277344, 535.2787475585938, 364.2861328125], 'page': 8, 'span': [0, 381], '__ref_s3_data': None}], 'text': 'Fig. 6. Active tasks over job runtime (top) and time schedule in 70 of 834 workers (bottom), using document-level scheme, profile A on 96 nodes. Each red bar marks the runtime of one task, equal to one document. The job completes only 14 minutes after the first worker finds no more work due to few, long documents in the tail. An anomaly around 7 min is caused by worker restarts.', 'type': 'figure', 'bounding-box': None}, {'prov': [{'bbox': [62.99921798706055, 389.0814514160156, 281.9828796386719, 538.5746459960938], 'page': 9, 'span': [0, 107], '__ref_s3_data': None}], 'text': 'Fig. 7. 
Measured serial times of operations in the document-level pipeline, profile A, across all scales.', 'type': 'figure', 'bounding-box': None}], 'tables': [{'prov': [{'bbox': [339.1385803222656, 624.9138793945312, 523.4959716796875, 703.262939453125], 'page': 7, 'span': [0, 0], '__ref_s3_data': None}], 'text': 'TABLE I Typical resource usage of workers and ML models under load in deployment profiles A and B.', 'type': 'table', '#-cols': 3, '#-rows': 6, 'data': [[{'bbox': None, 'spans': [[0, 0]], 'text': '', 'type': 'body', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 0, 'row-header': False, 'row-span': [0, 1]}, {'bbox': [461.5480041503906, 694.345458984375, 496.9671325683594, 701.4707641601562], 'spans': [[0, 1], [0, 2]], 'text': 'component', 'type': 'col_header', 'col': 1, 'col-header': True, 'col-span': [1, 3], 'row': 0, 'row-header': False, 'row-span': [0, 1]}, {'bbox': [461.5480041503906, 694.345458984375, 496.9671325683594, 701.4707641601562], 'spans': [[0, 1], [0, 2]], 'text': 'component', 'type': 'col_header', 'col': 2, 'col-header': True, 'col-span': [1, 3], 'row': 0, 'row-header': False, 'row-span': [0, 1]}], [{'bbox': None, 'spans': [[1, 0]], 'text': '', 'type': 'body', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 1, 'row-header': False, 'row-span': [1, 2]}, {'bbox': [441.1610107421875, 684.482421875, 463.57293701171875, 691.6077270507812], 'spans': [[1, 1]], 'text': 'worker', 'type': 'col_header', 'col': 1, 'col-header': True, 'col-span': [1, 2], 'row': 1, 'row-header': False, 'row-span': [1, 2]}, {'bbox': [482.68524169921875, 684.482421875, 517.3551635742188, 691.6077270507812], 'spans': [[1, 2]], 'text': 'ML model', 'type': 'col_header', 'col': 2, 'col-header': True, 'col-span': [2, 3], 'row': 1, 'row-header': False, 'row-span': [1, 2]}], [{'bbox': [387.4590148925781, 664.7554931640625, 414.9080505371094, 681.7437133789062], 'spans': [[2, 0]], 'text': 'Memory CPU', 'type': 'row_header', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 2, 'row-header': True, 'row-span': [2, 3]}, {'bbox': [441.1610107421875, 664.7554321289062, 470.7300720214844, 681.7437744140625], 'spans': [[2, 1]], 'text': '450 MB 0.4 cores', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 2, 'row-header': False, 'row-span': [2, 3]}, {'bbox': [482.6852111816406, 664.7554321289062, 512.2542724609375, 681.7437744140625], 'spans': [[2, 2]], 'text': '500 MB 0.7 cores', 'type': 'body', 'col': 2, 'col-header': False, 'col-span': [2, 3], 'row': 2, 'row-header': False, 'row-span': [2, 3]}], [{'bbox': [387.4590148925781, 654.8924560546875, 429.2063903808594, 662.0177612304688], 'spans': [[3, 0]], 'text': 'Replica ratio', 'type': 'row_header', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 3, 'row-header': True, 'row-span': [3, 4]}, {'bbox': [441.1610107421875, 654.8924560546875, 445.14605712890625, 662.0177612304688], 'spans': [[3, 1]], 'text': '1', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 3, 'row-header': False, 'row-span': [3, 4]}, {'bbox': [482.68524169921875, 654.8924560546875, 486.6702880859375, 662.0177612304688], 'spans': [[3, 2]], 'text': '1', 'type': 'body', 'col': 2, 'col-header': False, 'col-span': [2, 3], 'row': 3, 'row-header': False, 'row-span': [3, 4]}], [{'bbox': [387.4590148925781, 635.1664428710938, 414.9080505371094, 652.1547241210938], 'spans': [[4, 0]], 'text': 'Memory CPU', 'type': 'row_header', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 4, 'row-header': True, 'row-span': [4, 5]}, 
{'bbox': [441.1610107421875, 635.1664428710938, 470.7300720214844, 652.1547241210938], 'spans': [[4, 1]], 'text': '700 MB 1.2 cores', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 4, 'row-header': False, 'row-span': [4, 5]}, {'bbox': [482.6852111816406, 635.1664428710938, 512.2542724609375, 652.1547241210938], 'spans': [[4, 2]], 'text': '500 MB 0.7 cores', 'type': 'body', 'col': 2, 'col-header': False, 'col-span': [2, 3], 'row': 4, 'row-header': False, 'row-span': [4, 5]}], [{'bbox': [387.4590148925781, 625.303466796875, 429.2063903808594, 632.4287719726562], 'spans': [[5, 0]], 'text': 'Replica ratio', 'type': 'row_header', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 5, 'row-header': True, 'row-span': [5, 6]}, {'bbox': [441.1610107421875, 625.303466796875, 445.14605712890625, 632.4287719726562], 'spans': [[5, 1]], 'text': '1', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 5, 'row-header': False, 'row-span': [5, 6]}, {'bbox': [482.68524169921875, 625.303466796875, 486.6702880859375, 632.4287719726562], 'spans': [[5, 2]], 'text': '4', 'type': 'body', 'col': 2, 'col-header': False, 'col-span': [2, 3], 'row': 5, 'row-header': False, 'row-span': [5, 6]}]], 'model': None, 'bounding-box': None}, {'prov': [{'bbox': [337.73248291015625, 632.7647094726562, 537.3443603515625, 700.9937744140625], 'page': 9, 'span': [0, 0], '__ref_s3_data': None}], 'text': 'TABLE II', 'type': 'table', '#-cols': 4, '#-rows': 5, 'data': [[{'bbox': None, 'spans': [[0, 0]], 'text': '', 'type': 'body', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 0, 'row-header': False, 'row-span': [0, 1]}, {'bbox': None, 'spans': [[0, 1]], 'text': '', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 0, 'row-header': False, 'row-span': [0, 1]}, {'bbox': [428.04998779296875, 681.4334716796875, 511.2825622558594, 700.2147827148438], 'spans': [[0, 2]], 'text': 'distribution scheme document-level', 'type': 'col_header', 'col': 2, 'col-header': True, 'col-span': [2, 3], 'row': 0, 'row-header': False, 'row-span': [0, 1]}, {'bbox': [489.26800537109375, 681.4334716796875, 522.1445922851562, 688.5587768554688], 'spans': [[0, 3]], 'text': 'page-level', 'type': 'col_header', 'col': 3, 'col-header': True, 'col-span': [3, 4], 'row': 0, 'row-header': False, 'row-span': [0, 1]}], [{'bbox': [343.7909851074219, 681.4334716796875, 364.6009216308594, 688.5587768554688], 'spans': [[1, 0]], 'text': 'profile', 'type': 'body', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 1, 'row-header': False, 'row-span': [1, 2]}, {'bbox': [376.5559997558594, 669.37841796875, 416.09564208984375, 688.5587768554688], 'spans': [[1, 1]], 'text': 'system state idle', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 1, 'row-header': False, 'row-span': [1, 2]}, {'bbox': [428.04998779296875, 669.37841796875, 462.0344543457031, 676.5037231445312], 'spans': [[1, 2]], 'text': '32.6 ± 1.1', 'type': 'body', 'col': 2, 'col-header': False, 'col-span': [2, 3], 'row': 1, 'row-header': False, 'row-span': [1, 2]}, {'bbox': [489.26800537109375, 669.37841796875, 523.2525024414062, 676.5037231445312], 'spans': [[1, 3]], 'text': '26.3 ± 0.9', 'type': 'body', 'col': 3, 'col-header': False, 'col-span': [3, 4], 'row': 1, 'row-header': False, 'row-span': [1, 2]}], [{'bbox': [351.3190002441406, 664.0884399414062, 357.07342529296875, 671.2137451171875], 'spans': [[2, 0]], 'text': 'A', 'type': 'row_header', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 
2, 'row-header': True, 'row-span': [2, 3]}, {'bbox': [376.5559997558594, 657.722412109375, 391.4521179199219, 664.8477172851562], 'spans': [[2, 1]], 'text': 'busy', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 2, 'row-header': False, 'row-span': [2, 3]}, {'bbox': [428.04998779296875, 657.722412109375, 470.00457763671875, 664.8477172851562], 'spans': [[2, 2]], 'text': '114.9 ± 17.5', 'type': 'body', 'col': 2, 'col-header': False, 'col-span': [2, 3], 'row': 2, 'row-header': False, 'row-span': [2, 3]}, {'bbox': [489.26800537109375, 657.722412109375, 531.22265625, 664.8477172851562], 'spans': [[2, 3]], 'text': '248.1 ± 17.3', 'type': 'body', 'col': 3, 'col-header': False, 'col-span': [3, 4], 'row': 2, 'row-header': False, 'row-span': [2, 3]}], [{'bbox': [351.5379943847656, 640.37744140625, 356.85406494140625, 647.5027465820312], 'spans': [[3, 0], [4, 0]], 'text': 'B', 'type': 'row_header', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 3, 'row-header': True, 'row-span': [3, 5]}, {'bbox': [376.5559997558594, 645.6674194335938, 388.5111389160156, 652.792724609375], 'spans': [[3, 1]], 'text': 'idle', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 3, 'row-header': False, 'row-span': [3, 4]}, {'bbox': [428.04998779296875, 645.6674194335938, 462.0344543457031, 652.792724609375], 'spans': [[3, 2]], 'text': '32.6 ± 0.2', 'type': 'body', 'col': 2, 'col-header': False, 'col-span': [2, 3], 'row': 3, 'row-header': False, 'row-span': [3, 4]}, {'bbox': [489.26800537109375, 645.6674194335938, 523.2525024414062, 652.792724609375], 'spans': [[3, 3]], 'text': '25.7 ± 0.7', 'type': 'body', 'col': 3, 'col-header': False, 'col-span': [3, 4], 'row': 3, 'row-header': False, 'row-span': [3, 4]}], [{'bbox': [351.5379943847656, 640.37744140625, 356.85406494140625, 647.5027465820312], 'spans': [[3, 0], [4, 0]], 'text': 'B', 'type': 'row_header', 'col': 0, 'col-header': False, 'col-span': [0, 1], 'row': 4, 'row-header': True, 'row-span': [3, 5]}, {'bbox': [376.5559997558594, 634.0114135742188, 391.4521179199219, 641.13671875], 'spans': [[4, 1]], 'text': 'busy', 'type': 'body', 'col': 1, 'col-header': False, 'col-span': [1, 2], 'row': 4, 'row-header': False, 'row-span': [4, 5]}, {'bbox': [428.04998779296875, 634.0114135742188, 470.00457763671875, 641.13671875], 'spans': [[4, 2]], 'text': '109.6 ± 47.7', 'type': 'body', 'col': 2, 'col-header': False, 'col-span': [2, 3], 'row': 4, 'row-header': False, 'row-span': [4, 5]}, {'bbox': [489.26800537109375, 634.0114135742188, 531.22265625, 641.13671875], 'spans': [[4, 3]], 'text': '220.4 ± 34.2', 'type': 'body', 'col': 3, 'col-header': False, 'col-span': [3, 4], 'row': 4, 'row-header': False, 'row-span': [4, 5]}]], 'model': None, 'bounding-box': None}], 'bitmaps': None, 'equations': [], 'footnotes': [], 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}, {'height': 792.0, 'page': 2, 'width': 612.0}, {'height': 792.0, 'page': 3, 'width': 612.0}, {'height': 792.0, 'page': 4, 'width': 612.0}, {'height': 792.0, 'page': 5, 'width': 612.0}, {'height': 792.0, 'page': 6, 'width': 612.0}, {'height': 792.0, 'page': 7, 'width': 612.0}, {'height': 792.0, 'page': 8, 'width': 612.0}, {'height': 792.0, 'page': 9, 'width': 612.0}, {'height': 792.0, 'page': 10, 'width': 612.0}, {'height': 792.0, 'page': 11, 'width': 612.0}], 'page-footers': [], 'page-headers': [], '_s3_data': {'pdf-pages': [], 'pdf-images': [{'mime': 'application/png', 'path': 
[… presigned S3 download URLs elided: one per page image (pages 1-11) and one each for the PDF, Markdown, JSON, JSON-meta, and legacy-JSON renditions. Each entry has the form {'mime': …, 'path': …, 'url': …}, and the signed links expire after 3600 s …]'}}, 'identifiers': None}, 'sort': ['6627d1b67955c51ff1aa8858de671bb5a62ad70c77e62e0ac57c153d0078b7ea']}\n",
- "Finished fetching all data. Total is 1 records.\n",
- "Data downloaded in /tmp/tmphf8euhqf\n"
- ]
- },
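The deleted output above shows that each query hit carries short-lived presigned S3 links for every rendition of the converted document (page images, PDF, Markdown, JSON). As a minimal sketch of how such links could be consumed outside the notebook, the snippet below fetches a few renditions with `requests`. The `artifacts` argument and the key names mirror the dict printed above; which field of a query row holds that dict can vary across Deep Search versions, so binding it is left to the caller, and this is an illustration rather than toolkit API.

```python
import requests


def download_renditions(artifacts: dict, out_dir: str = ".") -> None:
    """Sketch: save selected renditions from presigned URLs.

    `artifacts` is assumed to be the dict shown in the deleted output
    (keys like "json-document", "markdown-document", "pdf-document",
    each carrying {"mime": ..., "path": ..., "url": ...}).
    """
    for key in ("json-document", "markdown-document", "pdf-document"):
        entry = artifacts.get(key)
        if entry is None:
            continue
        if isinstance(entry, list):  # "pdf-document" is a list in the output above
            entry = entry[0]
        # The links are presigned (X-Amz-Expires=3600), so they must be
        # fetched within an hour of running the query.
        resp = requests.get(entry["url"], timeout=60)
        resp.raise_for_status()
        filename = entry["path"].rsplit("/", 1)[-1]
        with open(f"{out_dir}/{filename}", "wb") as fh:
            fh.write(resp.content)
```

This only makes the structure of the deleted output concrete; the notebook itself retrieves the same files through the Deep Search toolkit in the download step below.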
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FilenameTitleJSON PathMarkdown Path
02206.00785.pdf/tmp/tmphf8euhqf/6627d1b67955c51ff1aa8858de671.../tmp/tmphf8euhqf/6627d1b67955c51ff1aa8858de671...
\n", - "
" - ], - "text/plain": [ - " Filename Title JSON Path \\\n", - "0 2206.00785.pdf /tmp/tmphf8euhqf/6627d1b67955c51ff1aa8858de671... \n", - "\n", - " Markdown Path \n", - "0 /tmp/tmphf8euhqf/6627d1b67955c51ff1aa8858de671... " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Run query\n", "query = DataQuery(search_query=\"*\", source=[\"*\"], coordinates=data_index.source)\n", @@ -588,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "fee869dd-6f23-4999-b262-b034d2161deb", "metadata": { "execution": { @@ -599,95 +501,7 @@ "shell.execute_reply.started": "2024-05-27T10:47:52.449214Z" } }, - "outputs": [ - { - "data": { - "text/markdown": [ - "## Markdown content" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "## Delivering Document Conversion as a Cloud Service with High Throughput and Responsiveness\n", - "\n", - "1 st Christoph Auer IBM Research Ruschlikon, Switzerland cau@zurich.ibm.com\n", - "\n", - "4 th Cesar Berrospi Ramis IBM Research Ruschlikon, Switzerland ceb@zurich.ibm.com\n", - "\n", - "2 nd Michele Dolfi IBM Research Ruschlikon, Switzerland dol@zurich.ibm.com\n", - "\n", - "5 th Peter W.J. Staar IBM Research Ruschlikon, Switzerland taa@zurich.ibm.com\n", - "\n", - "Abstract -Document understanding is a key business process in the data-driven economy since documents are central to knowledge discovery and business insights. Converting documents into a machine-processable format is a particular challenge here due to their huge variability in formats and complex structure. Accordingly, many algorithms and machine-learning methods emerged to solve particular tasks such as Optical Character Recognition (OCR), layout analysis, table-structure recovery, figure understanding, etc. We observe the adoption of such methods in document understanding solutions offered by all major cloud providers. Yet, publications outlining how such services are designed and optimized to scale in the cloud are scarce. In this paper, we focus on the case of document conversion to illustrate the particular challenges of scaling a complex data processing pipeline with a strong reliance on machine-learning methods on cloud infrastructure. Our key objective is to achieve high scalability and responsiveness for different workload profiles in a well-defined resource budget. We outline the requirements, design, and implementation choices of our document conversion service and reflect on the challenges we faced. Evidence for the scaling behavior and resource efficiency is provided for two alternative workload distribution strategies and deployment configurations. Our best-performing method achieves sustained throughput of over one million PDF pages per hour on 3072 CPU cores across 192 nodes.\n", - "\n", - "Index Terms -cloud applications, document understanding, distributed computing, artificial intelligence\n", - "\n", - "## I. INTRODUCTION\n", - "\n", - "Over the past decade, many organizations have accelerated their transformation into data-driven businesses, as studies have shown its positive impact in efficiency, decision making, or financial performance [1], [2]. Leading companies are increasingly deploying workloads on public and private cloud infrastructure, including business intelligence processing and machine learning models in data analytics platforms [3]. 
This is owed to several factors such as high availability, lower cost for compute, and storage [4], as well as the flexibility to scale up or down a cloud-based business process to fit the operational needs. Workloads and services can be container-\n", - "\n", - "ized, deployed, and orchestrated through widely adopted and standardized platforms like Kubernetes [5], [6].\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "## JSON content" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "{\n", - " \"_name\": \"\",\n", - " \"type\": \"pdf-document\",\n", - " \"description\": {\n", - " \"logs\": [\n", - " {\n", - " \"agent\": \"CPS Docling\",\n", - " \"type\": \"parsing\",\n", - " \"comment\": \"Docling 2.7.1 parsing of documents\",\n", - " \"date\": \"2025-01-16T13:53:04.836922+00:00\"\n", - " }\n", - " ],\n", - " \"collection\": {\n", - " \"type\": \"Document\"\n", - " }\n", - " },\n", - " \"file-info\": {\n", - " \"filename\": \"2206.00785.pdf\",\n", - " \"document-hash\": \"6627d1b67955c51ff1aa8858de671bb5a62ad70c77e62e0ac57c153d0078b7ea\",\n", - " \"#-pages\": 11,\n", - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Peek first lines of a downloaded file\n", "with open(df.iloc[0][\"Markdown Path\"]) as demo_file:\n", @@ -730,7 +544,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 13, "id": "58c15840-b342-44e8-b654-32b3e35f3991", "metadata": { "execution": { @@ -748,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 14, "id": "d477507f-bcbe-45b5-9557-dd0e3109c81c", "metadata": { "execution": { @@ -768,7 +582,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "57ac3ecb-f411-4dd2-936e-4e73825da323", "metadata": { "execution": { @@ -779,21 +593,7 @@ "shell.execute_reply.started": "2024-05-27T10:47:53.906917Z" } }, - "outputs": [ - { - "data": { - "text/markdown": [ - "#### Results\n", - "The data is now available. This file will now display the text from the scanned pages. Access it via the Deep Search UI at
https://pr-516-cps-dev.deepsearch-dev.zurich.ibm.com/projects/468b81ffe515a99172f93b07dd20cd50d4c19a3b/library/private/ad7097d15b2793b975c4caf0a2cbd7472336344f" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Load conversion settings and enable OCR\n", "cs = ConversionSettings()\n", @@ -836,7 +636,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 16, "id": "16655fea-becc-4236-9ad3-ac1e7ba13c46", "metadata": { "execution": { @@ -854,7 +654,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "4c3c8d9a-b06c-4966-98ee-335be9b659aa", "metadata": { "execution": { @@ -874,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "d5060e1e-f8da-4948-8d34-56ed86f5ddb3", "metadata": { "execution": { @@ -885,18 +685,7 @@ "shell.execute_reply.started": "2024-05-27T10:48:43.224475Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "['SUCCESS']" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Set custom target settings with raw pdf cells enabled\n", "tsettings = TargetSettings(add_raw_pages=True, add_annotations=False)\n", @@ -912,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "8d6dc87a-1501-4564-9914-f2787a85508b", "metadata": { "execution": { @@ -923,49 +712,7 @@ "shell.execute_reply.started": "2024-05-27T10:49:37.946098Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Finished fetching all data. Total is 0 records.\n", - "Data downloaded in /tmp/tmphf8euhqf\n" - ] - }, - { - "data": { - "text/markdown": [ - "#### Results\n", - "Here is the list of the files uploaded and the urls where to download the raw pdf cells details." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Run query\n", "query = DataQuery(\n", @@ -1014,7 +761,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "9a0b82b8", "metadata": { "execution": { @@ -1025,17 +772,7 @@ "shell.execute_reply.started": "2024-05-27T10:49:40.028110Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data index tmp_20250115104829 deleted\n", - "Data index tmp_20250115104829-ocr deleted\n", - "Data index tmp_20250115104829-raw deleted\n" - ] - } - ], + "outputs": [], "source": [ "# Delete data index\n", "if CLEANUP:\n",