feat!: move all dependencies to extras #93

Merged
merged 8 commits on Dec 6, 2024
13 changes: 12 additions & 1 deletion .github/actions/setup-poetry/action.yml
@@ -4,6 +4,12 @@ inputs:
python-version:
description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
default: '3.11'
run_install:
description: "Install the dependencies."
default: 'true'
install_extras:
description: "When installing depdencies, the extra dependencies are included."
default: 'true'
runs:
using: 'composite'
steps:
@@ -21,6 +27,11 @@ runs:
poetry env use ${{ steps.py.outputs.python-path }}
poetry run python --version
shell: bash
- name: Install only dependencies and not the package itself
- name: Install the dependencies and the extras (but not the package itself)
if: ${{ inputs.run_install == 'true' && inputs.install_extras == 'true' }}
run: poetry install --all-extras --no-root
shell: bash
- name: Install only dependencies and not the package itself
if: ${{ inputs.run_install == 'true' && inputs.install_extras != 'true' }}
run: poetry install --no-root
shell: bash
2 changes: 1 addition & 1 deletion .github/scripts/build_rhel.sh
@@ -49,5 +49,5 @@ docker build --progress=plain \

RUN cd /src \
&& pip3.11 install pytest \
&& pytest ./tests/test_glm.py -v
&& pytest ./tests/test_simple_interface.py -v
EOF
13 changes: 12 additions & 1 deletion .github/workflows/checks.yml
@@ -18,7 +18,18 @@ jobs:
- uses: ./.github/actions/setup-poetry
with:
python-version: ${{ matrix.python-version }}
- name: Install with poetry
run_install: 'false'

- name: Compile and install
run: |
poetry install
ls -l
ls -l deepsearch_glm

- name: Test interface without extras
run: |
poetry run pytest ./tests/test_simple_interface.py -vs
- name: Install extras
run: |
poetry install --all-extras
ls -l
2 changes: 1 addition & 1 deletion .github/workflows/wheels.yml
@@ -304,7 +304,7 @@ jobs:
- name: Quick test wheel
if: ${{ matrix.os.platform_id == 'win_amd64' || (matrix.os.platform_id == 'macosx_arm64' && matrix.os.name == 'macos-14') }}
run: |
poetry install --no-interaction --no-root --only=test
poetry install --no-interaction --no-root --all-extras
poetry run python -c 'from deepsearch_glm import andromeda_glm'
poetry run pytest ./tests/test_glm.py -v

2 changes: 0 additions & 2 deletions deepsearch_glm/glm_query.py
@@ -2,9 +2,7 @@
"""Module to query the GLM"""

import argparse
import json

import pandas as pd
from tabulate import tabulate

from deepsearch_glm.andromeda_glm import glm_query
2 changes: 0 additions & 2 deletions deepsearch_glm/nlp_model_training/name_classifier.py
@@ -7,8 +7,6 @@
import re

import pandas as pd
from tabulate import tabulate
from tqdm import tqdm

from deepsearch_glm.glm_utils import read_edges_in_dataframe, read_nodes_in_dataframe
from deepsearch_glm.nlp_utils import (
2 changes: 0 additions & 2 deletions deepsearch_glm/nlp_model_training/person_name_classifier.py
@@ -6,10 +6,8 @@
import json
import os
import random
import re

import pandas as pd
from tabulate import tabulate
from tqdm import tqdm

from deepsearch_glm.glm_utils import read_edges_in_dataframe, read_nodes_in_dataframe
3 changes: 0 additions & 3 deletions deepsearch_glm/nlp_utils.py
@@ -71,9 +71,6 @@ def init_nlp_model(
model = nlp_model()
model.set_loglevel(loglevel)

configs = model.get_apply_configs()
# print(json.dumps(configs, indent=2))

config = model.get_apply_configs()[0]
config["models"] = model_names
config["subject-filters"] = filters
4 changes: 0 additions & 4 deletions deepsearch_glm/utils/common.py
@@ -1,13 +1,9 @@
import os

from dotenv import load_dotenv


def get_scratch_dir():
"""Get scratch directory from environment variable `DEEPSEARCH_GLM_SCRATCH_DIR` (defined in .env)"""

load_dotenv()

tmpdir = os.path.abspath(os.getenv("DEEPSEARCH_GLM_SCRATCH_DIR"))

if not os.path.exists(tmpdir):
345 changes: 180 additions & 165 deletions poetry.lock

Large diffs are not rendered by default.

32 changes: 15 additions & 17 deletions pyproject.toml
@@ -24,18 +24,16 @@ build = "build.py"

[tool.poetry.dependencies]
python = "^3.9"
docling-core = "^2.0"
pywin32 = { version = "^307", markers = "sys_platform == 'win32'" }
docling-core = { version = "^2.0", optional = true }
deepsearch-toolkit = { version = "^1.1.0", optional = true }
tabulate = ">=0.8.9"
numpy = ">=1.24.4,<3.0.0"
pandas = ">=1.5.1,<3.0.0"
tabulate = { version = ">=0.8.9", optional = true }
pandas = { version = ">=1.5.1,<3.0.0", optional = true }
matplotlib = { version = "^3.7.1", optional = true }
python-dotenv = "^1.0.0"
tqdm = "^4.64.0"
rich = "^13.7.0"
docutils = "!=0.21"
pywin32 = { version = "^307", markers = "sys_platform == 'win32'" }
requests = "^2.32.3"
python-dotenv = { version = "^1.0.0", optional = true }
tqdm = { version = "^4.64.0", optional = true }
rich = { version = "^13.7.0", optional = true }
requests = { version = "^2.32.3", optional = true }

[tool.poetry.group.test.dependencies]
pytest = "^7.4.2"
@@ -46,6 +44,7 @@ isort = "^5.13.2"
mypy = "^1.7.1"
pre-commit = "2.17.0"
pylint = "^3.0.3"
docutils = "!=0.21" # added by python-semantic-release
python-semantic-release = "^7.32.2"

[tool.poetry.group.build.dependencies]
@@ -73,12 +72,13 @@ pandas = [
# support recursive extras: https://github.com/python-poetry/poetry/issues/3369)

pyplot = ["matplotlib"]
toolkit = ["deepsearch-toolkit"]

toolkit = ["deepsearch-toolkit", "python-dotenv"]
docling = ["docling-core", "pandas"]
utils = ["tabulate", "python-dotenv", "pandas", "tqdm", "rich", "requests"]

[tool.black]
line-length = 88
target-version = ["py38"]
target-version = ["py39"]
include = '\.pyi?$'
#extend-exclude = """
## Exclude generated API code
@@ -88,16 +88,14 @@ include = '\.pyi?$'
[tool.isort]
profile = "black"
line_length = 88
#skip_glob = ["docs", "deepsearch/cps/apis"]
py_version=38
#known_first_party = ["cps"]
py_version=39

[tool.mypy]
# plugins = ["pydantic.mypy"]
pretty = true
# strict = true
#no_implicit_optional = true
python_version = 3.8
python_version = "3.9"
disable_error_code = ["import-untyped"]

#[[tool.mypy.overrides]]
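With pandas, tabulate, python-dotenv, tqdm, rich and requests now declared as optional extras rather than hard dependencies, code that uses them can no longer assume they are installed. A minimal sketch of the guarded-import pattern this implies; the render_rows helper is hypothetical and not part of this PR:

# Sketch: degrade gracefully when the optional "utils" extra (tabulate, pandas, ...) is absent.
try:
    from tabulate import tabulate
except ImportError:  # extra not installed
    tabulate = None

def render_rows(rows, headers):
    """Pretty-print rows with tabulate when available, else fall back to plain text."""
    if tabulate is not None:
        return tabulate(rows, headers=headers)
    return "\n".join(", ".join(str(v) for v in row) for row in rows)

print(render_rows([("pandas", "optional"), ("tabulate", "optional")], ["package", "status"]))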
19 changes: 15 additions & 4 deletions src/pybind/base_log.h
@@ -12,6 +12,8 @@ namespace andromeda_py

base_log();

base_log(std::string level);

bool set_loglevel(std::string level);
};

@@ -33,23 +35,32 @@ namespace andromeda_py
*/
}

base_log::base_log(std::string level)
{
set_loglevel(level);
}

bool base_log::set_loglevel(std::string level)
{
if(level=="INFO")
if(level=="INFO" or level=="info")
{
loguru::g_stderr_verbosity = loguru::Verbosity_INFO;
}
else if(level=="WARNING")
else if(level=="WARNING" or level=="warning")
{
loguru::g_stderr_verbosity = loguru::Verbosity_WARNING;
}
else if(level=="ERROR")
else if(level=="ERROR" or level=="error")
{
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
}
else if(level=="FATAL" or level=="fatal")
{
loguru::g_stderr_verbosity = loguru::Verbosity_FATAL;
}
else
{
loguru::g_stderr_verbosity = loguru::Verbosity_WARNING;
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
return false;
}

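On the Python side, the relaxed parsing above means level names are accepted in either case, unknown names fall back to ERROR verbosity, and the method reports success as a boolean. A small sketch of the expected behaviour, assuming the bound set_loglevel returns that boolean unchanged:

from deepsearch_glm.andromeda_nlp import nlp_model

model = nlp_model()

# Both spellings are recognised after this change.
assert model.set_loglevel("WARNING")
assert model.set_loglevel("warning")
assert model.set_loglevel("fatal")

# Unknown levels fall back to ERROR verbosity and return False.
assert not model.set_loglevel("chatty")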
17 changes: 17 additions & 0 deletions src/pybind/nlp_interface.h
@@ -11,6 +11,8 @@ namespace andromeda_py
public:

nlp_model();
nlp_model(std::string loglevel, bool text_ordering, bool normalise_chars, bool normalise_text);

~nlp_model();

bool initialise(const nlohmann::json config_);
@@ -66,6 +68,21 @@ namespace andromeda_py
char_normaliser(andromeda::text_element::create_char_normaliser(false)),
text_normaliser(andromeda::text_element::create_text_normaliser(false))
{}

nlp_model::nlp_model(std::string loglevel, bool text_ordering, bool normalise_chars, bool normalise_text):
base_log::base_log(loglevel),
base_resources::base_resources(),

config(nlohmann::json::object({})),

order_text(text_ordering),
models({}),

char_normaliser(andromeda::text_element::create_char_normaliser(normalise_chars)),
text_normaliser(andromeda::text_element::create_text_normaliser(normalise_text))
{
config["order-text"] = order_text;
}

nlp_model::~nlp_model()
{}
10 changes: 9 additions & 1 deletion src/pybind/nlp_modules.h
@@ -14,7 +14,15 @@ PYBIND11_MODULE(andromeda_nlp, m) {

pybind11::class_<andromeda_py::nlp_model>(m, "nlp_model")
.def(pybind11::init())

.def(pybind11::init<std::string, bool, bool, bool>(),
pybind11::arg("loglevel"),
pybind11::arg("text_ordering") = true,
pybind11::arg("normalise_chars") = true,
pybind11::arg("normalise_text") = true,
R"(
Initialise the NLP models with standard parameters.)"
)

.def("set_loglevel", &andromeda_py::nlp_model::set_loglevel)
.def("get_resources_path", &andromeda_py::nlp_model::get_resources_path)

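The second init overload above lets callers pass the log level and normalisation flags directly instead of constructing first and then calling set_loglevel; only loglevel is required, the three booleans default to True. A short usage sketch, mirroring test_simple_interface_v2 below:

from deepsearch_glm.andromeda_nlp import nlp_model

# Existing two-step form, still available via the default constructor.
model = nlp_model()
model.set_loglevel("WARNING")

# New one-step form exposed by the keyword-argument constructor.
model = nlp_model(loglevel="error", text_ordering=True,
                  normalise_chars=True, normalise_text=True)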
24 changes: 24 additions & 0 deletions tests/test_simple_interface.py
@@ -0,0 +1,24 @@
import json

from deepsearch_glm.andromeda_nlp import nlp_model


def test_simple_interface_v1():
model = nlp_model()
model.set_loglevel("WARNING")

config = model.get_apply_configs()[0]
config["models"] = ""
config["subject-filters"] = []

model.initialise(config)

doc = json.load(open("tests/data/docs/1806.02284.json"))
output = model.apply_on_doc(doc)


def test_simple_interface_v2():
model = nlp_model(loglevel="warning", text_ordering=True)

doc = json.load(open("tests/data/docs/1806.02284.json"))
output = model.apply_on_doc(doc)
1 change: 0 additions & 1 deletion tests/test_structs.py
@@ -4,7 +4,6 @@
import json

import pandas as pd
from tabulate import tabulate

from deepsearch_glm.andromeda_structs import ds_document, ds_table, ds_text
from deepsearch_glm.nlp_utils import init_nlp_model