diff --git a/biblealignlib/burrito/AlignmentGroup.py b/biblealignlib/burrito/AlignmentGroup.py index e755a37..d48d159 100644 --- a/biblealignlib/burrito/AlignmentGroup.py +++ b/biblealignlib/burrito/AlignmentGroup.py @@ -27,7 +27,7 @@ import biblealignlib as bal from .AlignmentType import TranslationType -from .source import macula_prefixer +from .source import macula_prefixer, macula_unprefixer # hoisting means this can be defined at several different levels, so @@ -292,7 +292,12 @@ def update_target_selectors(self, selectors: list[str]) -> None: self.references["target"].selectors = sorted(selectors) def asdict( - self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False + self, + positional: bool = False, + withmeta: bool = True, + withmaculaprefix: bool = False, + source_tokens: Optional[dict[str, Any]] = None, + target_tokens: Optional[dict[str, Any]] = None, ) -> dict[str, Any]: """Return a dict of values suitable for serialization. @@ -307,6 +312,14 @@ def asdict( With withmaculaprefix=True (the default is False), prefix source references with 'o' or 'n' depending on canon. + With source_tokens provided as a dict mapping bare token IDs to token + objects, source selectors are replaced with tokenstr representations + ("{id}|{text}"). With withmaculaprefix=True, the prefixed ID is used. + + With target_tokens provided as a dict mapping token IDs to token + objects, target selectors are replaced with tokenstr representations + ("{id}|{text}"). 
+ """ recdict: dict[str, Any] = {} if positional: @@ -319,12 +332,28 @@ def asdict( else: # typical case sourcerefs: list[str] = self.references["source"].selectors - if withmaculaprefix: + if source_tokens is not None: + # Build tokenstr: use bare ID by default, prefixed ID if withmaculaprefix + bare_ids = [macula_unprefixer(sel) for sel in sourcerefs] + display_ids = ( + [macula_prefixer(b) for b in bare_ids] if withmaculaprefix else bare_ids + ) + sourcerefs = [ + f"{did}|{source_tokens[bare].text}" if bare in source_tokens else did + for bare, did in zip(bare_ids, display_ids) + ] + elif withmaculaprefix: # default: add back the Macula prefix sourcerefs = [macula_prefixer(srcstr) for srcstr in sourcerefs] # else leave as is (atypical) recdict["source"] = sourcerefs - recdict["target"] = self.references["target"].selectors + targetrefs: list[str] = self.references["target"].selectors + if target_tokens is not None: + targetrefs = [ + f"{sel}|{target_tokens[sel].text}" if sel in target_tokens else sel + for sel in targetrefs + ] + recdict["target"] = targetrefs if withmeta: recdict.update( { @@ -380,12 +409,25 @@ def __repr__(self) -> str: docids: tuple[str, str] = tuple([doc.asdict()["docid"] for doc in self.documents]) return f"" - def asdict(self, hoist: bool = True) -> dict[str, Any]: + def asdict( + self, + hoist: bool = True, + source_tokens: Optional[dict[str, Any]] = None, + target_tokens: Optional[dict[str, Any]] = None, + ) -> dict[str, Any]: """Return a dict of values suitable for serialization. This is opinionated about the preferred serialization: hoists as much as possible to upper levels. + With source_tokens provided as a dict mapping bare token IDs to token + objects, source selectors in each record are replaced with tokenstr + representations ("{id}|{text}"). + + With target_tokens provided as a dict mapping token IDs to token + objects, target selectors in each record are replaced with tokenstr + representations ("{id}|{text}"). 
+ """ # for now positional: bool = False @@ -395,7 +437,13 @@ def asdict(self, hoist: bool = True) -> dict[str, Any]: "meta": self.meta.asdict(), "type": self._type, "records": [ - rec.asdict(positional=positional, withmeta=withmeta) for rec in self.records + rec.asdict( + positional=positional, + withmeta=withmeta, + source_tokens=source_tokens, + target_tokens=target_tokens, + ) + for rec in self.records ], } @@ -446,10 +494,27 @@ def __repr__(self) -> str: """Return a printed representation.""" return f"" - def asdict(self, hoist: bool = True) -> dict[str, Any]: - """Return an opionated dict of values suitable for serialization.""" + def asdict( + self, + hoist: bool = True, + source_tokens: Optional[dict[str, Any]] = None, + target_tokens: Optional[dict[str, Any]] = None, + ) -> dict[str, Any]: + """Return an opinionated dict of values suitable for serialization. + + With source_tokens and target_tokens, passes them to each group's + asdict() so that selectors are replaced with tokenstr representations. 
+ + """ return { "format": self.format, "version": self.version, - "groups": [self.groups[0].asdict(hoist=hoist), self.groups[1].asdict(hoist=hoist)], + "groups": [ + self.groups[0].asdict( + hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens + ), + self.groups[1].asdict( + hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens + ), + ], } diff --git a/biblealignlib/burrito/__init__.py b/biblealignlib/burrito/__init__.py index bb73855..d6e4b2d 100644 --- a/biblealignlib/burrito/__init__.py +++ b/biblealignlib/burrito/__init__.py @@ -20,7 +20,7 @@ from .manager import Manager, VerseData from .BaseToken import BaseToken, asbool, bare_id from .DiffRecord import DiffReason, DiffRecord -from .source import macula_prefixer, macula_unprefixer, Source, SourceReader +from .source import macula_prefixer, macula_unprefixer, strip_tokenstr, Source, SourceReader from .target import Target, TargetReader from .util import groupby_key, groupby_bcid, groupby_bcv, token_groupby_bc, filter_by_bcv @@ -54,6 +54,7 @@ # source "macula_prefixer", "macula_unprefixer", + "strip_tokenstr", "Source", "SourceReader", # target diff --git a/biblealignlib/burrito/alignments.py b/biblealignlib/burrito/alignments.py index 4ac8727..35b5d1c 100644 --- a/biblealignlib/burrito/alignments.py +++ b/biblealignlib/burrito/alignments.py @@ -28,7 +28,7 @@ from .AlignmentSet import AlignmentSet from .AlignmentType import TranslationType from .BadRecord import BadRecord, Reason -from .source import SourceReader, macula_unprefixer +from .source import SourceReader, macula_unprefixer, strip_tokenstr from .target import TargetReader @@ -112,8 +112,10 @@ def __init__( def _targetid(self, targetid: str) -> str: """Return a normalized target ID. + Strips any tokenstr text suffix ("{id}|{text}" → "{id}") first. With self.keeptargetwordpart = False, drop the last digit. 
""" + targetid = strip_tokenstr(targetid) if not self.keeptargetwordpart and len(targetid) == 12: return targetid[:11] else: @@ -297,11 +299,24 @@ def filter_books(self, keep: tuple = ()) -> AlignmentGroup: # copied from gc2sb.manager.write_alignment_group with minor changes -def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True) -> None: +def write_alignment_group( + group: AlignmentGroup, + f: TextIO, + hoist: bool = True, + source_tokens: Optional[dict[str, Any]] = None, + target_tokens: Optional[dict[str, Any]] = None, +) -> None: """Write JSON data for an arbitrary group in Scripture Burrito format. Writes some of the JSON by hand to get records on the same line. Record meta.id values are assigned sequentially per BCV, e.g. "40001001.1". + + With source_tokens provided as a dict mapping bare token IDs to token + objects, source selectors are written as tokenstr representations + ("{id}|{text}") instead of plain IDs. + + With target_tokens provided as a dict mapping token IDs to token objects, + target selectors are written as tokenstr representations ("{id}|{text}"). """ def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None: @@ -324,7 +339,7 @@ def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[st """ bcv = arec.source_bcv bcv_counters[bcv] = bcv_counters.get(bcv, 0) + 1 - recdict = arec.asdict() + recdict = arec.asdict(source_tokens=source_tokens, target_tokens=target_tokens) recdict["meta"]["id"] = f"{bcv}.{bcv_counters[bcv]:02}" return recdict diff --git a/biblealignlib/burrito/source.py b/biblealignlib/burrito/source.py index 5ff8557..c357b32 100644 --- a/biblealignlib/burrito/source.py +++ b/biblealignlib/burrito/source.py @@ -75,8 +75,23 @@ def macula_prefixer(bcvwp: str) -> str: raise ValueError(f"Unable to add macula prefix to {bcvwp}") +def strip_tokenstr(selector: str) -> str: + """Return only the ID portion of a selector, dropping any tokenstr text suffix. 
+ + A tokenstr selector has the form "{id}|{text}" (e.g. "n41004003001|Ἀκούετε"). + Plain IDs without a '|' are returned unchanged. + """ + return selector.split("|", 1)[0] if "|" in selector else selector + + def macula_unprefixer(bcvwp: str) -> str: - """Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged.""" + """Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged. + + Also strips any tokenstr text suffix ("{id}|{text}" → "{id}") before + checking for the prefix, so both plain IDs and tokenstr selectors are + handled correctly. + """ + bcvwp = strip_tokenstr(bcvwp) if PREFIXRE.match(bcvwp): return bcvwp[1:] else: diff --git a/tests/biblealignlib/burrito/test_AlignmentGroup.py b/tests/biblealignlib/burrito/test_AlignmentGroup.py index 6813f4b..902c8a8 100644 --- a/tests/biblealignlib/burrito/test_AlignmentGroup.py +++ b/tests/biblealignlib/burrito/test_AlignmentGroup.py @@ -1,9 +1,9 @@ -"""Test code in burrito.AlignmentRecord - -Does not test writing files. 
-""" +"""Test code in burrito.AlignmentRecord""" import copy +import io +import json +from types import SimpleNamespace import pytest @@ -14,6 +14,7 @@ AlignmentRecord, AlignmentGroup, TopLevelGroups, + write_alignment_group, ) @@ -294,6 +295,55 @@ def test_asdict_withmeta(self, record: AlignmentRecord) -> None: assert role in recdict assert "meta" in recdict + def test_asdict_with_source_tokens(self, record: AlignmentRecord) -> None: + """source_tokens replaces source selectors with tokenstr representations.""" + source_tokens = { + "41004003001": SimpleNamespace(text="Ἀκούετε"), + "41004003002": SimpleNamespace(text="ἰδοὺ"), + } + recdict = record.asdict(source_tokens=source_tokens) + assert recdict["source"] == ["41004003001|Ἀκούετε", "41004003002|ἰδοὺ"] + # target selectors unchanged + assert recdict["target"] == ["410040030021"] + + def test_asdict_with_source_tokens_withmaculaprefix(self, record: AlignmentRecord) -> None: + """source_tokens with withmaculaprefix uses the prefixed ID in tokenstr.""" + source_tokens = { + "41004003001": SimpleNamespace(text="Ἀκούετε"), + "41004003002": SimpleNamespace(text="ἰδοὺ"), + } + recdict = record.asdict(source_tokens=source_tokens, withmaculaprefix=True) + assert recdict["source"] == ["n41004003001|Ἀκούετε", "n41004003002|ἰδοὺ"] + + def test_asdict_with_target_tokens(self, record: AlignmentRecord) -> None: + """target_tokens replaces target selectors with tokenstr representations.""" + target_tokens = { + "410040030021": SimpleNamespace(text="Listen"), + } + recdict = record.asdict(target_tokens=target_tokens) + assert recdict["target"] == ["410040030021|Listen"] + # source selectors unchanged + assert recdict["source"] == ["n41004003001", "n41004003002"] + + def test_asdict_with_both_token_dicts(self, record: AlignmentRecord) -> None: + """Both source_tokens and target_tokens replace selectors with tokenstr.""" + source_tokens = { + "41004003001": SimpleNamespace(text="Ἀκούετε"), + "41004003002": 
SimpleNamespace(text="ἰδοὺ"), + } + target_tokens = { + "410040030021": SimpleNamespace(text="Listen"), + } + recdict = record.asdict(source_tokens=source_tokens, target_tokens=target_tokens) + assert recdict["source"] == ["41004003001|Ἀκούετε", "41004003002|ἰδοὺ"] + assert recdict["target"] == ["410040030021|Listen"] + + def test_asdict_missing_token_leaves_selector(self, record: AlignmentRecord) -> None: + """Selectors not found in token dicts are left as plain IDs (bare, no macula prefix).""" + recdict = record.asdict(source_tokens={}, target_tokens={}) + assert recdict["source"] == ["41004003001", "41004003002"] + assert recdict["target"] == ["410040030021"] + class TestAlignmentGroup: """Test AlignmentGroup().""" @@ -313,6 +363,85 @@ def test_asdict(self, group: AlignmentGroup) -> None: for k in ["meta", "type", "records"]: assert k in recdict + def test_asdict_with_token_dicts(self, group: AlignmentGroup) -> None: + """AlignmentGroup.asdict() passes token dicts through to records.""" + source_tokens = { + "41004003001": SimpleNamespace(text="Ἀκούετε"), + "41004003002": SimpleNamespace(text="ἰδοὺ"), + } + target_tokens = { + "410040030021": SimpleNamespace(text="Listen"), + } + recdict = group.asdict(source_tokens=source_tokens, target_tokens=target_tokens) + rec = recdict["records"][0] + assert rec["source"] == ["41004003001|Ἀκούετε", "41004003002|ἰδοὺ"] + assert rec["target"] == ["410040030021|Listen"] + + +class TestWriteAlignmentGroup: + """Test write_alignment_group() tokenstr output.""" + + def test_write_default(self, group: AlignmentGroup) -> None: + """write_alignment_group without token dicts writes plain IDs.""" + buf = io.StringIO() + write_alignment_group(group, buf) + result = json.loads(buf.getvalue()) + rec = result["records"][0] + # plain IDs, no '|' separator + assert all("|" not in sel for sel in rec["source"]) + assert all("|" not in sel for sel in rec["target"]) + + def test_write_with_source_tokens(self, group: AlignmentGroup) -> None: + 
"""write_alignment_group with source_tokens writes tokenstr for source.""" + source_tokens = { + "41004003001": SimpleNamespace(text="Ἀκούετε"), + "41004003002": SimpleNamespace(text="ἰδοὺ"), + } + buf = io.StringIO() + write_alignment_group(group, buf, source_tokens=source_tokens) + result = json.loads(buf.getvalue()) + rec = result["records"][0] + assert rec["source"] == ["41004003001|Ἀκούετε", "41004003002|ἰδοὺ"] + assert all("|" not in sel for sel in rec["target"]) + + def test_write_with_target_tokens(self, group: AlignmentGroup) -> None: + """write_alignment_group with target_tokens writes tokenstr for target.""" + target_tokens = { + "410040030021": SimpleNamespace(text="Listen"), + } + buf = io.StringIO() + write_alignment_group(group, buf, target_tokens=target_tokens) + result = json.loads(buf.getvalue()) + rec = result["records"][0] + assert rec["target"] == ["410040030021|Listen"] + assert all("|" not in sel for sel in rec["source"]) + + def test_write_with_both_token_dicts(self, group: AlignmentGroup) -> None: + """write_alignment_group with both token dicts writes tokenstr for source and target.""" + source_tokens = { + "41004003001": SimpleNamespace(text="Ἀκούετε"), + "41004003002": SimpleNamespace(text="ἰδοὺ"), + } + target_tokens = { + "410040030021": SimpleNamespace(text="Listen"), + } + buf = io.StringIO() + write_alignment_group(group, buf, source_tokens=source_tokens, target_tokens=target_tokens) + result = json.loads(buf.getvalue()) + rec = result["records"][0] + assert rec["source"] == ["41004003001|Ἀκούετε", "41004003002|ἰδοὺ"] + assert rec["target"] == ["410040030021|Listen"] + + def test_write_produces_valid_json(self, group: AlignmentGroup) -> None: + """write_alignment_group output is always valid JSON.""" + source_tokens = {"41004003001": SimpleNamespace(text="Ἀκούετε")} + buf = io.StringIO() + write_alignment_group(group, buf, source_tokens=source_tokens) + # must not raise + result = json.loads(buf.getvalue()) + assert "records" in 
result + assert "documents" in result + # not needed for AlignmentHub class TestTopLevelGroups: diff --git a/tests/biblealignlib/burrito/test_source.py b/tests/biblealignlib/burrito/test_source.py index 0ef1740..f78c193 100644 --- a/tests/biblealignlib/burrito/test_source.py +++ b/tests/biblealignlib/burrito/test_source.py @@ -12,7 +12,7 @@ import pytest from biblealignlib import SOURCES -from biblealignlib.burrito import Source, SourceReader, macula_prefixer, macula_unprefixer +from biblealignlib.burrito import Source, SourceReader, macula_prefixer, macula_unprefixer, strip_tokenstr class TestMacula_Prefixer: @@ -36,6 +36,24 @@ def test_error(self) -> None: assert macula_prefixer("abc") == "abc" +class TestStripTokenstr: + """Test strip_tokenstr().""" + + def test_plain_id_unchanged(self) -> None: + """Plain IDs without '|' are returned unchanged.""" + assert strip_tokenstr("41001001001") == "41001001001" + assert strip_tokenstr("n41001001001") == "n41001001001" + + def test_strips_text_suffix(self) -> None: + """Tokenstr selectors have the text suffix removed.""" + assert strip_tokenstr("n41004003001|Ἀκούετε") == "n41004003001" + assert strip_tokenstr("41004003001|Ἀκούετε") == "41004003001" + + def test_only_first_pipe_split(self) -> None: + """Only the first '|' is treated as the separator.""" + assert strip_tokenstr("41004003001|foo|bar") == "41004003001" + + class TestMacula_Unprefixer: """Test macula_unprefixer().""" @@ -49,6 +67,14 @@ def test_nt(self) -> None: assert macula_unprefixer("n41001001001") == "41001001001" assert macula_unprefixer("41001001001") == "41001001001" + def test_tokenstr_with_prefix(self) -> None: + """Tokenstr selectors with a macula prefix are unprefixed correctly.""" + assert macula_unprefixer("n41004003001|Ἀκούετε") == "41004003001" + + def test_tokenstr_without_prefix(self) -> None: + """Tokenstr selectors without a macula prefix return the bare ID.""" + assert macula_unprefixer("41004003001|Ἀκούετε") == "41004003001" + 
@pytest.fixture(scope="module") def mrk_4_9_4() -> Source: