DS4SD · PeterStaar-IBM · Dec 3, 2023 · Nov 10, 2023 · Nov 12, 2023 · Nov 14, 2023
diff --git a/deepsearch_glm/glm_utils.py b/deepsearch_glm/glm_utils.py
@@ -43,8 +43,7 @@ def load_glm_config(idir:str):
 def load_glm(idir:str):
 
     config = load_glm_config(idir)
-
-    #glm = andromeda_glm.glm_model()
+
     glm = glm_model()
     glm.load(config)
 
@@ -60,7 +59,7 @@ def create_glm_config_from_docs(odir:str, json_files:list[str],
             },
             "save": {
                 "root": odir,
-                "write-CSV": True,
+                "write-CSV": False,
                 "write-JSON": False,
                 "write-path-text": False
             }
@@ -129,7 +128,6 @@ def create_glm_from_docs(odir:str, json_files:list[str],
 
     config = create_glm_config_from_docs(odir, json_files, nlp_models)
 
-    #glm = andromeda_glm.glm_model()
     glm = glm_model()
     glm.create(config)
 

diff --git a/deepsearch_glm/nlp_apply_on_docs.py b/deepsearch_glm/nlp_apply_on_docs.py
@@ -7,8 +7,9 @@
 
 import pandas as pd
 
-from utils.ds_utils import convert_pdffiles, to_legacy_document_format
+from tabulate import tabulate
 
+from utils.ds_utils import convert_pdffiles, to_legacy_document_format
 from deepsearch_glm.andromeda_nlp import nlp_model
 
 def parse_arguments():
@@ -100,6 +101,14 @@ def init_nlp_model(models:str, filters:list[str]=[]):
 
     return model
 
+def show_texts(doc_j):
+    """Print a table of hash, text-hash and the first 48 characters of each item in doc_j["texts"]."""
+    rows = [
+        [entry["hash"], entry["text-hash"], entry["text"][0:48]]
+        for entry in doc_j["texts"]
+    ]
+    print(tabulate(rows, headers=["hash", "text-hash", "text"]))
+
 def show_doc(doc_j):
 
     """
@@ -125,6 +134,9 @@ def show_doc(doc_j):
     print(json.dumps(doc_j["tables"][0], indent=2))
     """        
 
+    if "texts" in doc_j:
+        show_texts(doc_j)
+
     if "properties" in doc_j:
         props = pd.DataFrame(doc_j["properties"]["data"],
                              columns=doc_j["properties"]["headers"])