Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 74 additions & 9 deletions biblealignlib/burrito/AlignmentGroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import biblealignlib as bal

from .AlignmentType import TranslationType
from .source import macula_prefixer
from .source import macula_prefixer, macula_unprefixer


# hoisting means this can be defined at several different levels, so
Expand Down Expand Up @@ -292,7 +292,12 @@ def update_target_selectors(self, selectors: list[str]) -> None:
self.references["target"].selectors = sorted(selectors)

def asdict(
self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
self,
positional: bool = False,
withmeta: bool = True,
withmaculaprefix: bool = False,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
"""Return a dict of values suitable for serialization.

Expand All @@ -307,6 +312,14 @@ def asdict(
With withmaculaprefix=True (the default is False), prefix
source references with 'o' or 'n' depending on canon.

With source_tokens provided as a dict mapping bare token IDs to token
objects, source selectors are replaced with tokenstr representations
("{id}|{text}"). With withmaculaprefix=True, the prefixed ID is used.

With target_tokens provided as a dict mapping token IDs to token
objects, target selectors are replaced with tokenstr representations
("{id}|{text}").

"""
recdict: dict[str, Any] = {}
if positional:
Expand All @@ -319,12 +332,28 @@ def asdict(
else:
# typical case
sourcerefs: list[str] = self.references["source"].selectors
if withmaculaprefix:
if source_tokens is not None:
# Build tokenstr: use bare ID by default, prefixed ID if withmaculaprefix
bare_ids = [macula_unprefixer(sel) for sel in sourcerefs]
display_ids = (
[macula_prefixer(b) for b in bare_ids] if withmaculaprefix else bare_ids
)
sourcerefs = [
f"{did}|{source_tokens[bare].text}" if bare in source_tokens else did
for bare, did in zip(bare_ids, display_ids)
]
elif withmaculaprefix:
# default: add back the Macula prefix
sourcerefs = [macula_prefixer(srcstr) for srcstr in sourcerefs]
# else leave as is (atypical)
recdict["source"] = sourcerefs
recdict["target"] = self.references["target"].selectors
targetrefs: list[str] = self.references["target"].selectors
if target_tokens is not None:
targetrefs = [
f"{sel}|{target_tokens[sel].text}" if sel in target_tokens else sel
for sel in targetrefs
]
recdict["target"] = targetrefs
if withmeta:
recdict.update(
{
Expand Down Expand Up @@ -380,12 +409,25 @@ def __repr__(self) -> str:
docids: tuple[str, str] = tuple([doc.asdict()["docid"] for doc in self.documents])
return f"<AlignmentGroup{docids}: {len(self.records)} records>"

def asdict(self, hoist: bool = True) -> dict[str, Any]:
def asdict(
self,
hoist: bool = True,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
"""Return a dict of values suitable for serialization.

This is opinionated about the preferred serialization: hoists
as much as possible to upper levels.

With source_tokens provided as a dict mapping bare token IDs to token
objects, source selectors in each record are replaced with tokenstr
representations ("{id}|{text}").

With target_tokens provided as a dict mapping token IDs to token
objects, target selectors in each record are replaced with tokenstr
representations ("{id}|{text}").

"""
# for now
positional: bool = False
Expand All @@ -395,7 +437,13 @@ def asdict(self, hoist: bool = True) -> dict[str, Any]:
"meta": self.meta.asdict(),
"type": self._type,
"records": [
rec.asdict(positional=positional, withmeta=withmeta) for rec in self.records
rec.asdict(
positional=positional,
withmeta=withmeta,
source_tokens=source_tokens,
target_tokens=target_tokens,
)
for rec in self.records
],
}

Expand Down Expand Up @@ -446,10 +494,27 @@ def __repr__(self) -> str:
"""Return a printed representation."""
return f"<TopLevelGroups({self.targetdocid}): {self.sourcedocids}>"

def asdict(self, hoist: bool = True) -> dict[str, Any]:
"""Return an opionated dict of values suitable for serialization."""
def asdict(
self,
hoist: bool = True,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    ) -> dict[str, Any]:
        """Return an opinionated dict of values suitable for serialization.
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in docstring: "opionated" should be "opinionated".

Suggested change
"""Return an opionated dict of values suitable for serialization.
"""Return an opinionated dict of values suitable for serialization.

Copilot uses AI. Check for mistakes.

With source_tokens and target_tokens, passes them to each group's
asdict() so that selectors are replaced with tokenstr representations.

"""
return {
"format": self.format,
"version": self.version,
"groups": [self.groups[0].asdict(hoist=hoist), self.groups[1].asdict(hoist=hoist)],
"groups": [
self.groups[0].asdict(
hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
),
self.groups[1].asdict(
hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
),
],
}
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TopLevelGroups.asdict() now accepts source_tokens/target_tokens and forwards them into each group, but there isn't a test exercising this new behavior. Please add a unit test that calls TopLevelGroups.asdict(source_tokens=..., target_tokens=...) and asserts selectors are tokenstr-formatted in the nested records (and that missing tokens fall back to plain IDs).

Suggested change
}
}
def test_toplevelgroups_asdict_forwards_tokens() -> None:
"""Unit test for TopLevelGroups.asdict token forwarding and selector formatting.
This uses dummy group objects so we do not depend on AlignmentGroup internals.
"""
class _DummyMeta:
def __init__(self, conforms_to: str) -> None:
self.conformsTo = conforms_to
class _DummyDoc:
def __init__(self, docid: str) -> None:
self.docid = docid
class _DummyGroup:
def __init__(self, canon: str, sourcedocid: str, target_docid: str) -> None:
# attributes used in TopLevelGroups.__post_init__
self.roles = ["source", "target"]
self.meta = _DummyMeta(conforms_to="test-conforms-to")
self.documents = (_DummyDoc(sourcedocid), _DummyDoc(target_docid))
self.canon = canon
self.sourcedocid = sourcedocid
def asdict(
self,
hoist: bool = True,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
# Simulate tokenstr formatting with fallback to plain IDs.
source_tokens = source_tokens or {}
target_tokens = target_tokens or {}
def _src(id_: str) -> str:
return source_tokens.get(id_, id_)
def _tgt(id_: str) -> str:
return target_tokens.get(id_, id_)
return {
"alignments": [
{
"source": [
{"selector": _src("s1")}, # has token
{"selector": _src("s_missing")}, # no token
],
"target": [
{"selector": _tgt("t1")}, # has token
{"selector": _tgt("t_missing")}, # no token
],
}
]
}
# Prepare dummy groups: one OT and one NT, sharing target docid.
target_docid = "target-doc"
group_ot = _DummyGroup(canon="ot", sourcedocid="source-ot", target_docid=target_docid)
group_nt = _DummyGroup(canon="nt", sourcedocid="source-nt", target_docid=target_docid)
tgroups = TopLevelGroups(groups=(group_ot, group_nt))
source_tokens = {"s1": "GEN.1.1!1"}
target_tokens = {"t1": "GEN.1.1!1"}
result = tgroups.asdict(source_tokens=source_tokens, target_tokens=target_tokens)
# Ensure we have two groups in the serialized structure.
assert len(result["groups"]) == 2
first_group = result["groups"][0]
align = first_group["alignments"][0]
# Token-present IDs should be replaced with token strings.
assert align["source"][0]["selector"] == "GEN.1.1!1"
assert align["target"][0]["selector"] == "GEN.1.1!1"
# Missing IDs should fall back to plain IDs.
assert align["source"][1]["selector"] == "s_missing"
assert align["target"][1]["selector"] == "t_missing"

Copilot uses AI. Check for mistakes.
3 changes: 2 additions & 1 deletion biblealignlib/burrito/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .manager import Manager, VerseData
from .BaseToken import BaseToken, asbool, bare_id
from .DiffRecord import DiffReason, DiffRecord
from .source import macula_prefixer, macula_unprefixer, Source, SourceReader
from .source import macula_prefixer, macula_unprefixer, strip_tokenstr, Source, SourceReader
from .target import Target, TargetReader
from .util import groupby_key, groupby_bcid, groupby_bcv, token_groupby_bc, filter_by_bcv

Expand Down Expand Up @@ -54,6 +54,7 @@
# source
"macula_prefixer",
"macula_unprefixer",
"strip_tokenstr",
"Source",
"SourceReader",
# target
Expand Down
21 changes: 18 additions & 3 deletions biblealignlib/burrito/alignments.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from .AlignmentSet import AlignmentSet
from .AlignmentType import TranslationType
from .BadRecord import BadRecord, Reason
from .source import SourceReader, macula_unprefixer
from .source import SourceReader, macula_unprefixer, strip_tokenstr
from .target import TargetReader


Expand Down Expand Up @@ -112,8 +112,10 @@ def __init__(
def _targetid(self, targetid: str) -> str:
"""Return a normalized target ID.

Strips any tokenstr text suffix ("{id}|{text}" → "{id}") first.
With self.keeptargetwordpart = False, drop the last digit.
"""
targetid = strip_tokenstr(targetid)
if not self.keeptargetwordpart and len(targetid) == 12:
return targetid[:11]
else:
Expand Down Expand Up @@ -297,11 +299,24 @@ def filter_books(self, keep: tuple = ()) -> AlignmentGroup:


# copied from gc2sb.manager.write_alignment_group with minor changes
def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True) -> None:
def write_alignment_group(
group: AlignmentGroup,
f: TextIO,
hoist: bool = True,
source_tokens: Optional[dict[str, Any]] = None,
target_tokens: Optional[dict[str, Any]] = None,
) -> None:
"""Write JSON data for an arbitrary group in Scripture Burrito format.

Writes some of the JSON by hand to get records on the same line.
Record meta.id values are assigned sequentially per BCV, e.g. "40001001.1".

With source_tokens provided as a dict mapping bare token IDs to token
objects, source selectors are written as tokenstr representations
("{id}|{text}") instead of plain IDs.

With target_tokens provided as a dict mapping token IDs to token objects,
target selectors are written as tokenstr representations ("{id}|{text}").
"""

def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
Expand All @@ -324,7 +339,7 @@ def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[st
"""
bcv = arec.source_bcv
bcv_counters[bcv] = bcv_counters.get(bcv, 0) + 1
recdict = arec.asdict()
recdict = arec.asdict(source_tokens=source_tokens, target_tokens=target_tokens)
recdict["meta"]["id"] = f"{bcv}.{bcv_counters[bcv]:02}"
return recdict

Expand Down
17 changes: 16 additions & 1 deletion biblealignlib/burrito/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,23 @@ def macula_prefixer(bcvwp: str) -> str:
raise ValueError(f"Unable to add macula prefix to {bcvwp}")


def strip_tokenstr(selector: str) -> str:
    """Return only the ID portion of a selector, dropping any tokenstr text suffix.

    A tokenstr selector has the form "{id}|{text}" (e.g. "n41004003001|Ἀκούετε").
    Plain IDs without a '|' are returned unchanged. If the text itself contains
    a '|', only the first field is kept (maxsplit=1 keeps this a single cut).

    Args:
        selector: a plain token ID or a tokenstr ("{id}|{text}").

    Returns:
        The ID portion of the selector.
    """
    # str.split with a missing separator returns [selector], so no
    # membership pre-check is needed: this handles both forms in one call.
    return selector.split("|", 1)[0]


def macula_unprefixer(bcvwp: str) -> str:
"""Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged."""
"""Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged.

Also strips any tokenstr text suffix ("{id}|{text}" → "{id}") before
checking for the prefix, so both plain IDs and tokenstr selectors are
handled correctly.
"""
bcvwp = strip_tokenstr(bcvwp)
if PREFIXRE.match(bcvwp):
return bcvwp[1:]
else:
Expand Down
Loading