Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 74 additions & 9 deletions biblealignlib/burrito/AlignmentGroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import biblealignlib as bal

from .AlignmentType import TranslationType
from .source import macula_prefixer
from .source import macula_prefixer, macula_unprefixer


# hoisting means this can be defined at several different levels, so
Expand Down Expand Up @@ -292,7 +292,12 @@ def update_target_selectors(self, selectors: list[str]) -> None:
self.references["target"].selectors = sorted(selectors)

def asdict(
self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
self,
positional: bool = False,
withmeta: bool = True,
withmaculaprefix: bool = False,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
"""Return a dict of values suitable for serialization.

Expand All @@ -307,6 +312,14 @@ def asdict(
With withmaculaprefix=True (the default is False), prefix
source references with 'o' or 'n' depending on canon.

With source_tokens provided as a dict mapping bare token IDs to token
objects, source selectors are replaced with tokenstr representations
("{id}|{text}"). With withmaculaprefix=True, the prefixed ID is used.

With target_tokens provided as a dict mapping token IDs to token
objects, target selectors are replaced with tokenstr representations
("{id}|{text}").

"""
recdict: dict[str, Any] = {}
if positional:
Expand All @@ -319,12 +332,28 @@ def asdict(
else:
# typical case
sourcerefs: list[str] = self.references["source"].selectors
if withmaculaprefix:
if source_tokens is not None:
# Build tokenstr: use bare ID by default, prefixed ID if withmaculaprefix
bare_ids = [macula_unprefixer(sel) for sel in sourcerefs]
display_ids = (
[macula_prefixer(b) for b in bare_ids] if withmaculaprefix else bare_ids
)
sourcerefs = [
f"{did}|{source_tokens[bare].text}" if bare in source_tokens else did
for bare, did in zip(bare_ids, display_ids)
]
elif withmaculaprefix:
# default: add back the Macula prefix
sourcerefs = [macula_prefixer(srcstr) for srcstr in sourcerefs]
# else leave as is (atypical)
recdict["source"] = sourcerefs
recdict["target"] = self.references["target"].selectors
targetrefs: list[str] = self.references["target"].selectors
if target_tokens is not None:
targetrefs = [
f"{sel}|{target_tokens[sel].text}" if sel in target_tokens else sel
for sel in targetrefs
]
recdict["target"] = targetrefs
if withmeta:
recdict.update(
{
Expand Down Expand Up @@ -380,12 +409,25 @@ def __repr__(self) -> str:
docids: tuple[str, str] = tuple([doc.asdict()["docid"] for doc in self.documents])
return f"<AlignmentGroup{docids}: {len(self.records)} records>"

def asdict(self, hoist: bool = True) -> dict[str, Any]:
def asdict(
self,
hoist: bool = True,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
"""Return a dict of values suitable for serialization.

This is opinionated about the preferred serialization: hoists
as much as possible to upper levels.

With source_tokens provided as a dict mapping bare token IDs to token
objects, source selectors in each record are replaced with tokenstr
representations ("{id}|{text}").

With target_tokens provided as a dict mapping token IDs to token
objects, target selectors in each record are replaced with tokenstr
representations ("{id}|{text}").

"""
# for now
positional: bool = False
Expand All @@ -395,7 +437,13 @@ def asdict(self, hoist: bool = True) -> dict[str, Any]:
"meta": self.meta.asdict(),
"type": self._type,
"records": [
rec.asdict(positional=positional, withmeta=withmeta) for rec in self.records
rec.asdict(
positional=positional,
withmeta=withmeta,
source_tokens=source_tokens,
target_tokens=target_tokens,
)
for rec in self.records
],
}

Expand Down Expand Up @@ -446,10 +494,27 @@ def __repr__(self) -> str:
"""Return a printed representation."""
return f"<TopLevelGroups({self.targetdocid}): {self.sourcedocids}>"

def asdict(self, hoist: bool = True) -> dict[str, Any]:
"""Return an opionated dict of values suitable for serialization."""
def asdict(
self,
hoist: bool = True,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    ) -> dict[str, Any]:
        """Return an opinionated dict of values suitable for serialization.
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in docstring: "opionated" should be "opinionated".

Suggested change
"""Return an opionated dict of values suitable for serialization.
"""Return an opinionated dict of values suitable for serialization.

Copilot uses AI. Check for mistakes.

With source_tokens and target_tokens, passes them to each group's
asdict() so that selectors are replaced with tokenstr representations.

"""
return {
"format": self.format,
"version": self.version,
"groups": [self.groups[0].asdict(hoist=hoist), self.groups[1].asdict(hoist=hoist)],
"groups": [
self.groups[0].asdict(
hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
),
self.groups[1].asdict(
hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
),
],
}
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TopLevelGroups.asdict() now accepts source_tokens/target_tokens and forwards them into each group, but there isn't a test exercising this new behavior. Please add a unit test that calls TopLevelGroups.asdict(source_tokens=..., target_tokens=...) and asserts selectors are tokenstr-formatted in the nested records (and that missing tokens fall back to plain IDs).

Suggested change
}
}
def test_toplevelgroups_asdict_forwards_tokens() -> None:
"""Unit test for TopLevelGroups.asdict token forwarding and selector formatting.
This uses dummy group objects so we do not depend on AlignmentGroup internals.
"""
class _DummyMeta:
def __init__(self, conforms_to: str) -> None:
self.conformsTo = conforms_to
class _DummyDoc:
def __init__(self, docid: str) -> None:
self.docid = docid
class _DummyGroup:
def __init__(self, canon: str, sourcedocid: str, target_docid: str) -> None:
# attributes used in TopLevelGroups.__post_init__
self.roles = ["source", "target"]
self.meta = _DummyMeta(conforms_to="test-conforms-to")
self.documents = (_DummyDoc(sourcedocid), _DummyDoc(target_docid))
self.canon = canon
self.sourcedocid = sourcedocid
def asdict(
self,
hoist: bool = True,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
# Simulate tokenstr formatting with fallback to plain IDs.
source_tokens = source_tokens or {}
target_tokens = target_tokens or {}
def _src(id_: str) -> str:
return source_tokens.get(id_, id_)
def _tgt(id_: str) -> str:
return target_tokens.get(id_, id_)
return {
"alignments": [
{
"source": [
{"selector": _src("s1")}, # has token
{"selector": _src("s_missing")}, # no token
],
"target": [
{"selector": _tgt("t1")}, # has token
{"selector": _tgt("t_missing")}, # no token
],
}
]
}
# Prepare dummy groups: one OT and one NT, sharing target docid.
target_docid = "target-doc"
group_ot = _DummyGroup(canon="ot", sourcedocid="source-ot", target_docid=target_docid)
group_nt = _DummyGroup(canon="nt", sourcedocid="source-nt", target_docid=target_docid)
tgroups = TopLevelGroups(groups=(group_ot, group_nt))
source_tokens = {"s1": "GEN.1.1!1"}
target_tokens = {"t1": "GEN.1.1!1"}
result = tgroups.asdict(source_tokens=source_tokens, target_tokens=target_tokens)
# Ensure we have two groups in the serialized structure.
assert len(result["groups"]) == 2
first_group = result["groups"][0]
align = first_group["alignments"][0]
# Token-present IDs should be replaced with token strings.
assert align["source"][0]["selector"] == "GEN.1.1!1"
assert align["target"][0]["selector"] == "GEN.1.1!1"
# Missing IDs should fall back to plain IDs.
assert align["source"][1]["selector"] == "s_missing"
assert align["target"][1]["selector"] == "t_missing"

Copilot uses AI. Check for mistakes.
3 changes: 2 additions & 1 deletion biblealignlib/burrito/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .manager import Manager, VerseData
from .BaseToken import BaseToken, asbool, bare_id
from .DiffRecord import DiffReason, DiffRecord
from .source import macula_prefixer, macula_unprefixer, Source, SourceReader
from .source import macula_prefixer, macula_unprefixer, strip_tokenstr, Source, SourceReader
from .target import Target, TargetReader
from .util import groupby_key, groupby_bcid, groupby_bcv, token_groupby_bc, filter_by_bcv

Expand Down Expand Up @@ -54,6 +54,7 @@
# source
"macula_prefixer",
"macula_unprefixer",
"strip_tokenstr",
"Source",
"SourceReader",
# target
Expand Down
21 changes: 18 additions & 3 deletions biblealignlib/burrito/alignments.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from .AlignmentSet import AlignmentSet
from .AlignmentType import TranslationType
from .BadRecord import BadRecord, Reason
from .source import SourceReader, macula_unprefixer
from .source import SourceReader, macula_unprefixer, strip_tokenstr
from .target import TargetReader


Expand Down Expand Up @@ -112,8 +112,10 @@ def __init__(
def _targetid(self, targetid: str) -> str:
"""Return a normalized target ID.

Strips any tokenstr text suffix ("{id}|{text}" → "{id}") first.
With self.keeptargetwordpart = False, drop the last digit.
"""
targetid = strip_tokenstr(targetid)
if not self.keeptargetwordpart and len(targetid) == 12:
return targetid[:11]
else:
Expand Down Expand Up @@ -297,11 +299,24 @@ def filter_books(self, keep: tuple = ()) -> AlignmentGroup:


# copied from gc2sb.manager.write_alignment_group with minor changes
def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True) -> None:
def write_alignment_group(
group: AlignmentGroup,
f: TextIO,
hoist: bool = True,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> None:
"""Write JSON data for an arbitrary group in Scripture Burrito format.

Writes some of the JSON by hand to get records on the same line.
Record meta.id values are assigned sequentially per BCV, e.g. "40001001.1".

With source_tokens provided as a dict mapping bare token IDs to token
objects, source selectors are written as tokenstr representations
("{id}|{text}") instead of plain IDs.

With target_tokens provided as a dict mapping token IDs to token objects,
target selectors are written as tokenstr representations ("{id}|{text}").
"""

def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
Expand All @@ -324,7 +339,7 @@ def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[st
"""
bcv = arec.source_bcv
bcv_counters[bcv] = bcv_counters.get(bcv, 0) + 1
recdict = arec.asdict()
recdict = arec.asdict(source_tokens=source_tokens, target_tokens=target_tokens)
recdict["meta"]["id"] = f"{bcv}.{bcv_counters[bcv]:02}"
return recdict

Expand Down
17 changes: 16 additions & 1 deletion biblealignlib/burrito/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,23 @@ def macula_prefixer(bcvwp: str) -> str:
raise ValueError(f"Unable to add macula prefix to {bcvwp}")


def strip_tokenstr(selector: str) -> str:
    """Return only the ID portion of a selector, dropping any tokenstr text suffix.

    A tokenstr selector has the form "{id}|{text}" (e.g. "n41004003001|Ἀκούετε").
    Plain IDs without a '|' are returned unchanged. If the text itself contains
    a '|', only the first field is kept (maxsplit=1 keeps this a single cut).

    Args:
        selector: a plain token ID or a tokenstr ("{id}|{text}").

    Returns:
        The ID portion of the selector.
    """
    # str.split with a missing separator returns [selector], so no
    # membership pre-check is needed: this handles both forms in one call.
    return selector.split("|", 1)[0]


def macula_unprefixer(bcvwp: str) -> str:
"""Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged."""
"""Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged.

Also strips any tokenstr text suffix ("{id}|{text}" → "{id}") before
checking for the prefix, so both plain IDs and tokenstr selectors are
handled correctly.
"""
bcvwp = strip_tokenstr(bcvwp)
if PREFIXRE.match(bcvwp):
return bcvwp[1:]
else:
Expand Down
Loading