Added PDF Deid Pipeline #272

Merged
merged 1 commit into from
Jul 11, 2024
151 changes: 151 additions & 0 deletions examples/colab/ocr/ocr_visual_document_deid.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/ocr/ocr_visual_document_deid.ipynb)\n",
"\n",
"\n",
"## De-Identification\n",
"\n",
"Introducing our advanced healthcare de-identification model, deployable with a single line of code. It combines state-of-the-art components such as ner_deid_subentity_augmented, ContextualParser, RegexMatcher, and TextMatcher with a streamlined Deidentification stage to mask sensitive entities such as names, locations, and medical record numbers, helping ensure compliance and data security in medical texts. Using OCR, it also redacts the detected information in the document image before saving the processed file to the specified location."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Installing the libraries"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"📋 Loading license number 0 from C:\\Users\\gadde/.johnsnowlabs\\licenses/license_number_0_for_Spark-Healthcare_Spark-OCR.json\n",
"👌 Launched \u001B[92mcpu optimized\u001B[39m session with with: 🚀Spark-NLP==5.3.2, 💊Spark-Healthcare==5.3.2, 🕶Spark-OCR==5.3.2, running on ⚡ PySpark==3.1.2\n"
]
}
],
"source": [
"!pip install johnsnowlabs\n",
"from johnsnowlabs import nlp\n",
"nlp.install(visual=True,force_browser=True)\n",
"nlp.start(visual=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-24T10:27:47.436477Z",
"start_time": "2024-06-24T10:27:21.668104700Z"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
"Warning::Spark Session already created, some configs may not take.\n",
"pdf_deid_pdf_output download started this may take some time.\n",
"Approx size to download 1.6 GB\n",
"[OK!]\n"
]
}
],
"source": [
"# Load the model; `nlu` is imported explicitly here since the install cell above only imports `nlp`\n",
"import nlu\n",
"\n",
"model = nlu.load(\"en.image_deid\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-24T10:28:08.210754700Z",
"start_time": "2024-06-24T10:27:47.452292500Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"## PDF De-Identification\n",
"\n",
"With the specified input and output paths provided as arguments, the model efficiently processes PDF files, performing de-identification as needed, and seamlessly stores the processed documents at the designated location."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n"
]
}
],
"source": [
"# Provide the input and output paths (the two lists must be the same length and aligned by index)\n",
"input_path,output_path = ['F:\\\\Work\\\\repos\\\\nlu\\\\tests\\\\datasets\\\\ocr\\\\deid\\\\download.pdf','F:\\\\Work\\\\repos\\\\nlu\\\\tests\\\\datasets\\\\ocr\\\\deid\\\\deid2.pdf'], ['F:\\\\Work\\\\repos\\\\nlu\\\\tests\\\\datasets\\\\ocr\\\\deid\\\\download_deidentified.pdf','F:\\\\Work\\\\repos\\\\nlu\\\\tests\\\\datasets\\\\ocr\\\\deid\\\\deid2_deidentified.pdf']\n",
"\n",
"# Predict and save the de-identified PDFs\n",
"dfs = model.predict(input_path, output_path=output_path)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-24T10:33:43.625036300Z",
"start_time": "2024-06-24T10:33:40.477056300Z"
}
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
2 changes: 2 additions & 0 deletions nlu/__init__.py
@@ -180,6 +180,7 @@ def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool =
streamlit_caching: bool = False,
apple_silicon: bool = False
) -> NLUPipeline:

'''
Load either a prebuilt pipeline or a set of components identified by a whitespace-separated list of components.
You must call nlu.auth() BEFORE calling nlu.load() to access licensed models.
@@ -192,6 +193,7 @@ def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool =
:param request: An NLU model_anno_obj/pipeline/component_to_resolve reference. You can request multiple components by separating them with whitespace. I.e. nlu.load('elmo bert albert')
:return: returns a non-fitted NLU pipeline object
'''

if streamlit_caching and not nlu.st_cache_enabled:
enable_streamlit_caching()
return nlu.load(request, path, verbose, gpu, streamlit_caching)
Empty file.
7 changes: 7 additions & 0 deletions nlu/ocr_components/table_extractors/imag2pdf/image2pdf.py
@@ -0,0 +1,7 @@
class Image2PDF:
@staticmethod
def get_default_model():
from sparkocr.transformers import ImageToPdf
return ImageToPdf() \
.setInputCol("image_with_regions") \
.setOutputCol("content")
Empty file.
9 changes: 9 additions & 0 deletions nlu/ocr_components/table_extractors/pdf2image/pdf2image.py
@@ -0,0 +1,9 @@
class PDF2Image:
@staticmethod
def get_default_model():
from sparkocr.transformers import PdfToImage
return PdfToImage() \
.setPartitionNum(12) \
.setInputCol("content") \
.setOutputCol("ocr_image") \
.setKeepInput(False)
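The `PDF2Image` and `Image2PDF` wrappers above both follow the same pattern: a static factory returning a spark-ocr transformer configured with a fluent setter chain. A pure-Python stand-in shows how that chaining works (the `StubTransformer` class and its setters are hypothetical, for illustration only — the real transformers live in the licensed `sparkocr` package):

```python
class StubTransformer:
    """Hypothetical stand-in for a spark-ocr transformer with fluent setters."""

    def __init__(self):
        self.params = {}

    def _set(self, key, value):
        self.params[key] = value
        return self  # returning self is what enables the setter chain

    def setInputCol(self, col):
        return self._set("inputCol", col)

    def setOutputCol(self, col):
        return self._set("outputCol", col)

    def setPartitionNum(self, n):
        return self._set("partitionNum", n)


def get_default_pdf_to_image():
    # Mirrors the shape of PDF2Image.get_default_model() above, with the stub
    return StubTransformer() \
        .setPartitionNum(12) \
        .setInputCol("content") \
        .setOutputCol("ocr_image")


stage = get_default_pdf_to_image()
print(stage.params)
# {'partitionNum': 12, 'inputCol': 'content', 'outputCol': 'ocr_image'}
```

The column names are the contract between stages: `PDF2Image` writes `ocr_image`, which the updated `Img2Text` and `ImageDrawRegions` below read as their input.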
7 changes: 5 additions & 2 deletions nlu/ocr_components/text_recognizers/img2text/img2text.py
@@ -3,7 +3,10 @@ class Img2Text:
def get_default_model():
from sparkocr.transformers import ImageToText
from sparkocr.enums import PageIteratorLevel, PageSegmentationMode
return ImageToText() \
.setInputCol("image") \
.setInputCol("ocr_image") \
.setOutputCol("text") \
.setOcrParams(["preserve_interword_spaces=1", ])
.setIgnoreResolution(False) \
.setPageIteratorLevel(PageIteratorLevel.SYMBOL) \
.setPageSegMode(PageSegmentationMode.SPARSE_TEXT) \
.setConfidenceThreshold(70)

Empty file.
15 changes: 15 additions & 0 deletions nlu/ocr_components/utils/image_draw_regions/image_draw_regions.py
@@ -0,0 +1,15 @@
class ImageDrawRegions:
@staticmethod
def get_default_model():
from sparkocr.transformers import ImageDrawRegions
return ImageDrawRegions() \
.setInputCol("ocr_image") \
.setInputRegionsCol("ocr_positions") \
.setOutputCol("image_with_regions") \
.setFilledRect(True)

Empty file.
8 changes: 8 additions & 0 deletions nlu/ocr_components/utils/position_finder/position_finder.py
@@ -0,0 +1,8 @@
class PositionFinder:
@staticmethod
def get_default_model():
from sparkocr.transformers import PositionFinder
return PositionFinder() \
.setInputCols("ner_chunk_subentity") \
.setOutputCol("ocr_positions") \
.setPageMatrixCol("positions")
6 changes: 5 additions & 1 deletion nlu/pipe/component_resolution.py
@@ -199,10 +199,14 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
# special edge case for lang detectors
language = 'xx'
if path is None:
if license_type != Licenses.open_source:
if license_type != Licenses.open_source and license_type != Licenses.ocr:
pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models')
uid = pipe.model.uid

elif license_type == Licenses.ocr:
pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/ocr')
uid = pipe.model.uid

else:
pipe = PretrainedPipeline(nlp_ref, lang=language)
uid = pipe.model.uid
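The branching above selects the remote repository from the license type: OCR pipelines resolve against `clinical/ocr`, other licensed pipelines against `clinical/models`, and open-source ones against the default public hub. A minimal sketch of that dispatch (the `Licenses` stand-in and the `resolve_remote_loc` helper are hypothetical, for illustration only — nlu's real `Licenses` enum lives elsewhere in the codebase):

```python
class Licenses:
    # Hypothetical stand-in for nlu's license enum
    open_source = "open_source"
    hc = "healthcare"
    ocr = "ocr"


def resolve_remote_loc(license_type):
    """Pick the pretrained-pipeline repository for a given license type."""
    if license_type == Licenses.ocr:
        return "clinical/ocr"
    if license_type != Licenses.open_source:
        return "clinical/models"
    return None  # default public hub


print(resolve_remote_loc(Licenses.ocr))          # clinical/ocr
print(resolve_remote_loc(Licenses.hc))           # clinical/models
print(resolve_remote_loc(Licenses.open_source))  # None
```

Checking the OCR case first (or, as in the diff, excluding it from the healthcare branch) matters: an OCR license would otherwise fall into the `clinical/models` branch and resolve against the wrong repository.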
30 changes: 30 additions & 0 deletions nlu/pipe/extractors/extractor_configs_OCR.py
@@ -68,3 +68,33 @@ def default_binary_to_image_config(output_col_prefix='binary_image'):
name='FULL binary to image extractor ',
description='Gets all fields generated by binary to image Transformer ',
)

def default_pdf_to_image_config(output_col_prefix='pdf_image'):
return SparkOCRExtractorConfig(
output_col_prefix=output_col_prefix,
get_image_origin=True,
get_image_height=True,
get_image_width=True,
get_image_n_channels=True,
get_image_mode=True,
get_image_resolution=True,
get_image_data=True,
get_img_positions=True,
name='FULL pdf to image extractor ',
description='Gets all fields generated by pdf to image Transformer ',
)

def default_position_finder_config(output_col_prefix='positions'):
return SparkOCRExtractorConfig(
output_col_prefix=output_col_prefix,
get_image_origin=True,
get_image_height=True,
get_image_width=True,
get_image_n_channels=True,
get_image_mode=True,
get_image_resolution=True,
get_image_data=True,
get_img_positions=True,
name='FULL position finder extractor',
description='Gets all fields generated by position finder Transformer',
)
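The two extractor configs above pass an identical set of image flags and differ only in prefix, name, and description; a small factory can keep that duplication in one place. A sketch using a plain dict in place of `SparkOCRExtractorConfig` (the dict stand-in and the `_full_image_extractor_config` helper are illustrative only — the real class takes the same keyword arguments):

```python
def _full_image_extractor_config(output_col_prefix, name, description):
    """Build the shared 'extract everything' OCR config once."""
    return dict(
        output_col_prefix=output_col_prefix,
        get_image_origin=True,
        get_image_height=True,
        get_image_width=True,
        get_image_n_channels=True,
        get_image_mode=True,
        get_image_resolution=True,
        get_image_data=True,
        get_img_positions=True,
        name=name,
        description=description,
    )


pdf_cfg = _full_image_extractor_config(
    'pdf_image', 'FULL pdf to image extractor',
    'Gets all fields generated by pdf to image Transformer')
pos_cfg = _full_image_extractor_config(
    'positions', 'FULL position finder extractor',
    'Gets all fields generated by position finder Transformer')
```

Centralizing the shared flags also prevents the copy-paste drift visible in the diff, where the position-finder config inherited the binary-to-image name and description.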
30 changes: 29 additions & 1 deletion nlu/pipe/pipeline.py
@@ -296,6 +296,33 @@ def unpack_and_apply_extractors(self, pdf: Union[pyspark.sql.DataFrame, pd.DataF
# Vanilla Spark Pipe
return apply_extractors_and_merge(pdf.toPandas().applymap(extract_pyspark_rows), anno_2_ex_config,
keep_stranger_features, stranger_features)
def pythonify_spark_ocr_dataframe(self, processed, output_path=None, file_paths=None):
# Collect the rendered PDFs and write each one to the output path whose
# index matches its input file name (mutable defaults avoided on purpose).
output_path = output_path or []
file_paths = file_paths or []
result = processed.select("pdf", "path").collect()

for row in result:
file_name = row.path.split('/')[-1]
for index, path in enumerate(file_paths):
if path == file_name:
with open(output_path[index], "wb") as f:
f.write(row.pdf)
break
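The matching logic above pairs each collected row with an output path by comparing the input file's base name against `file_paths`. The same idea can be exercised without Spark using stand-in row objects (the `Row` namedtuple and `write_pdfs` helper here are hypothetical substitutes for pyspark's collected rows and the method above):

```python
import os
import tempfile
from collections import namedtuple

# Stand-in for a collected Spark row with `pdf` (bytes) and `path` columns
Row = namedtuple("Row", ["pdf", "path"])


def write_pdfs(rows, file_paths, output_paths):
    """Write each row's bytes to the output path matching its file name."""
    for row in rows:
        file_name = row.path.split('/')[-1]
        for index, name in enumerate(file_paths):
            if name == file_name:
                with open(output_paths[index], "wb") as f:
                    f.write(row.pdf)
                break


tmp = tempfile.mkdtemp()
out = [os.path.join(tmp, "a_deid.pdf"), os.path.join(tmp, "b_deid.pdf")]
# Rows arrive in arbitrary order after collect(); matching is by file name
rows = [Row(b"%PDF-b", "/data/b.pdf"), Row(b"%PDF-a", "/data/a.pdf")]
write_pdfs(rows, ["a.pdf", "b.pdf"], out)
```

The name-based matching is what makes the result order-independent: `collect()` gives no guarantee that rows come back in input order, so pairing by index alone would mix up the outputs.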

def pythonify_spark_dataframe(self, processed,
keep_stranger_features=True,
@@ -466,6 +493,7 @@ def save(self, path, component='entire_pipeline', overwrite=True):
def predict(self,
data,
output_level='',
output_path='',
positions=False,
keep_stranger_features=True,
metadata=False,
@@ -494,7 +522,7 @@ def predict(self,
:return:
'''
from nlu.pipe.utils.predict_helper import __predict__
return __predict__(self, data, output_level, positions, keep_stranger_features, metadata, multithread,
return __predict__(self, data, output_level, output_path, positions, keep_stranger_features, metadata, multithread,
drop_irrelevant_cols, return_spark_df, get_embeddings)
def predict_embeds(self,
data,
29 changes: 19 additions & 10 deletions nlu/pipe/utils/predict_helper.py
@@ -118,7 +118,7 @@ def predict_multi_threaded_light_pipe(pipe, data, output_level, positions, keep_
)


def __predict_ocr_spark(pipe, data, output_level, positions, keep_stranger_features, metadata,
def __predict_ocr_spark(pipe, data, output_level, output_path, positions, keep_stranger_features, metadata,
drop_irrelevant_cols, get_embeddings):
"""
Check if there are any OCR components in the Pipe.
@@ -160,14 +160,21 @@ def __predict_ocr_spark(pipe, data, output_level, positions, keep_stranger_featu
data = data.withColumn('origin_index', monotonically_increasing_id().alias('origin_index'))

data = pipe.vanilla_transformer_pipe.transform(data)
return pipe.pythonify_spark_dataframe(data,
keep_stranger_features=keep_stranger_features,
output_metadata=metadata,
drop_irrelevant_cols=drop_irrelevant_cols,
positions=positions,
output_level=output_level,
get_embeddings=get_embeddings

# If the final stage renders a PDF (ImageToPdf), write the de-identified
# PDFs to disk instead of building a pandas result
if 'ImageToPdf' in str(list(pipe.values())[-1]):
return pipe.pythonify_spark_ocr_dataframe(data,
output_path=output_path,
file_paths=file_paths
)
else:
return pipe.pythonify_spark_dataframe(data,
keep_stranger_features=keep_stranger_features,
output_metadata=metadata,
drop_irrelevant_cols=drop_irrelevant_cols,
positions=positions,
output_level=output_level,
get_embeddings=get_embeddings
)
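The branch above inspects the last pipeline stage to decide between writing PDF files and building a pandas result. A minimal sketch of that dispatch with stub stage names (the `choose_output_handler` helper and its return values are hypothetical, for illustration — the real check runs against nlu's component dict):

```python
def choose_output_handler(stages):
    """Return which post-processing path the predict helper should take."""
    last_stage = str(stages[-1])
    if 'ImageToPdf' in last_stage:
        return 'write_pdf_files'   # i.e. pipe.pythonify_spark_ocr_dataframe(...)
    return 'build_dataframe'       # i.e. pipe.pythonify_spark_dataframe(...)


print(choose_output_handler(['PdfToImage', 'ImageToText', 'ImageToPdf']))
# write_pdf_files
print(choose_output_handler(['PdfToImage', 'ImageToText']))
# build_dataframe
```

Note the check is a substring match on the stage's string form, so it is sensitive to class naming; matching on the stage's type would be more robust, but the substring form mirrors what the diff does.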


def __predict_audio_spark(pipe, data, output_level, positions, keep_stranger_features, metadata,
@@ -267,7 +274,7 @@ def try_update_session():
except Exception as e:
print(f"Error updating session: {e}")

def __predict__(pipe, data, output_level, positions, keep_stranger_features, metadata, multithread,
def __predict__(pipe, data, output_level, output_path, positions, keep_stranger_features, metadata, multithread,
drop_irrelevant_cols, return_spark_df, get_embeddings, embed_only=False,normal_pred_on_db=False):
'''
Annotates a Pandas Dataframe/Pandas Series/Numpy Array/Spark DataFrame/Python List strings /Python String
@@ -281,6 +288,8 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
:param return_spark_df: Prediction results will be returned right after transforming with the Spark NLP pipeline
:return:
'''
if output_path is None:
output_path = []

if embed_only:
pipe.fit()
@@ -341,7 +350,7 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
if pipe.contains_ocr_components:
# Ocr processing
try:
return __predict_ocr_spark(pipe, data, output_level, positions, keep_stranger_features,
return __predict_ocr_spark(pipe, data, output_level, output_path, positions, keep_stranger_features,
metadata, drop_irrelevant_cols, get_embeddings=get_embeddings)
except Exception as err:
logger.warning(f"Predictions Failed={err}")