Skip to content

Commit

Permalink
chore: minor refactor incl. explicit typing (#39)
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas authored Oct 7, 2024
1 parent 246627f commit 39ae6cd
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 20 deletions.
31 changes: 18 additions & 13 deletions docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,17 @@
#

"""Define base classes for chunking."""
import re
from abc import ABC, abstractmethod
from typing import Iterator, Optional
from typing import Final, Iterator, Optional

from pydantic import BaseModel, model_validator
from pydantic import BaseModel, Field, field_validator

from docling_core.types import BoundingBox, Document
from docling_core.types.base import _JSON_POINTER_REGEX

# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes).
# Group 1 captures the name after "$." (word characters and hyphens) and
# group 2 captures the digits between the square brackets.
_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")


def _create_path(pos: int, path_prefix: str = "main-text") -> str:
Expand All @@ -19,21 +24,21 @@ def _create_path(pos: int, path_prefix: str = "main-text") -> str:
class Chunk(BaseModel):
    """Data model for Chunk.

    ``path`` must match ``_JSON_POINTER_REGEX``. For migration purposes, a
    value in the deprecated JSONPath-like format (e.g. ``"$.main-text[42]"``)
    is rewritten via ``_create_path`` before the pattern check is applied.
    """

    path: str = Field(pattern=_JSON_POINTER_REGEX)
    text: str
    heading: Optional[str] = None

    @field_validator("path", mode="before")
    @classmethod
    def _json_pointer_from_json_path(cls, path: object) -> object:
        """Migrate a deprecated JSONPath-style ``path`` to the current format.

        Args:
            path: Raw input supplied for the ``path`` field. Because this is a
                "before" validator, it is not guaranteed to be a string yet.

        Returns:
            The result of ``_create_path`` when ``path`` matches
            ``_DEPRECATED_JSON_PATH_PATTERN``; otherwise ``path`` unchanged, so
            that the regular field validation decides whether it is valid.
        """
        # Non-string input must not reach ``re.match``: that would raise a bare
        # ``TypeError`` instead of letting pydantic report a validation error.
        if not isinstance(path, str):
            return path

        match = _DEPRECATED_JSON_PATH_PATTERN.match(path)
        if match is None:
            return path

        # Both groups are mandatory in the pattern, so a match always has them.
        path_prefix, pos = match.groups()
        return _create_path(pos=int(pos), path_prefix=path_prefix)


class ChunkWithMetadata(Chunk):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
"""Simple metadata extractor module."""


from typing import Any
from typing import Any, Final

from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
from docling_core.types import Document as DLDocument

_DL_DOC_HASH = "dl_doc_hash"
_ORIGIN = "origin"
_DL_DOC_HASH: Final[str] = "dl_doc_hash"
_ORIGIN: Final[str] = "origin"


class SimpleMetadataExtractor(BaseMetadataExtractor):
Expand Down
5 changes: 4 additions & 1 deletion docling_core/types/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""Define common models across types."""
from datetime import datetime, timezone
from enum import Enum
from typing import Generic, Hashable, List, Literal, Optional, TypeVar
from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar

from pydantic import (
AfterValidator,
Expand All @@ -28,6 +28,9 @@
from docling_core.utils.alias import AliasModel
from docling_core.utils.validators import validate_datetime, validate_unique_list

# (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84".
# Also accepts the bare "#" and a name without an index (e.g. "#/main-text").
# Group 1 captures the name (word characters and hyphens); group 2 captures
# the optional numeric index.
_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"

LanguageT = TypeVar("LanguageT", bound=str)
IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
Expand Down
4 changes: 1 addition & 3 deletions docling_core/types/experimental/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from typing_extensions import Annotated

from docling_core.search.package import VERSION_PATTERN
from docling_core.types.base import _JSON_POINTER_REGEX
from docling_core.types.doc.tokens import DocumentToken
from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
Expand All @@ -28,9 +29,6 @@
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
CURRENT_VERSION: Final = "1.0.0"

# (subset of) JSON Pointer URI fragment identifier format:
_JSON_POINTER_REGEX = r"^#(/[\w\-]+(/\d+)?)?$"


class BasePictureData(BaseModel): # TBD
"""BasePictureData."""
Expand Down
16 changes: 16 additions & 0 deletions test/test_chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from docling_core.transforms.chunker.base import Chunk


def test_chunk_migration():
    """Check that a deprecated JSONPath-style path is migrated on construction."""
    legacy_path = "$.main-text[42]"  # deprecated path format
    migrated = Chunk(text="foo", path=legacy_path)
    assert migrated.path == "#/main-text/42"

0 comments on commit 39ae6cd

Please sign in to comment.