Skip to content

Commit b23a807

Browse files
authored
Allow word IDs on synset members list (#256)
The OEWN uses word IDs while OMW uses sense IDs for the synset members attribute. This meant that OEWN synset members were not being returned in the right order. Fixes #255
1 parent 0e3d796 commit b23a807

File tree

4 files changed

+83
-11
lines changed

4 files changed

+83
-11
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22

33
## [Unreleased][unreleased]
44

5+
### Changed
6+
7+
* `wn.add()` allows synset members to be lexical entry IDs for rank
8+
calculations ([#255])
9+
510

611
## [v0.12.0]
712

@@ -771,3 +776,4 @@ abandoned, but this is an entirely new codebase.
771776
[#241]: https://github.com/goodmami/wn/issues/241
772777
[#246]: https://github.com/goodmami/wn/issues/246
773778
[#250]: https://github.com/goodmami/wn/issues/250
779+
[#255]: https://github.com/goodmami/wn/issues/255

tests/data/sense-member-order.xml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.1.dtd">
3+
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
5+
<!-- synset members listed as sense IDs (test-01-n) and as word IDs (test-02-n) -->
6+
7+
<Lexicon id="test"
8+
label="Testing Sense Member Orders"
9+
language="en"
10+
11+
license="https://creativecommons.org/licenses/by/4.0/"
12+
version="1">
13+
14+
<LexicalEntry id="test-foo-n">
15+
<Lemma partOfSpeech="n" writtenForm="foo" />
16+
<Sense id="test-01-foo-n" synset="test-01-n" />
17+
<Sense id="test-02-foo-n" synset="test-02-n" />
18+
</LexicalEntry>
19+
20+
<LexicalEntry id="test-bar-n">
21+
<Lemma partOfSpeech="n" writtenForm="bar" />
22+
<Sense id="test-02-bar-n" synset="test-02-n" />
23+
<Sense id="test-01-bar-n" synset="test-01-n" />
24+
</LexicalEntry>
25+
26+
<!-- sense IDs as members -->
27+
<Synset id="test-01-n" ili="i12345" partOfSpeech="n" members="test-01-bar-n test-01-foo-n"/>
28+
<!-- word IDs as members -->
29+
<Synset id="test-02-n" ili="i12346" partOfSpeech="n" members="test-bar-n test-foo-n" />
30+
31+
</Lexicon>
32+
33+
</LexicalResource>

tests/secondary_query_test.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,25 @@ def test_synset_lexicalized():
146146
def test_synset_translate():
147147
assert len(wn.synset('test-en-0001-n').translate(lang='es')) == 1
148148
assert len(wn.synset('test-es-0001-n').translate(lang='en')) == 1
149+
150+
151+
@pytest.mark.usefixtures('uninitialized_datadir')
152+
def test_word_sense_order(datadir):
153+
wn.add(datadir / 'sense-member-order.xml')
154+
assert [s.id for s in wn.word('test-foo-n').senses()] == [
155+
"test-01-foo-n", "test-02-foo-n",
156+
]
157+
assert [s.id for s in wn.word('test-bar-n').senses()] == [
158+
"test-02-bar-n", "test-01-bar-n",
159+
]
160+
161+
162+
@pytest.mark.usefixtures('uninitialized_datadir')
163+
def test_synset_member_order(datadir):
164+
wn.add(datadir / 'sense-member-order.xml')
165+
assert [s.id for s in wn.synset('test-01-n').senses()] == [
166+
"test-01-bar-n", "test-01-foo-n",
167+
]
168+
assert [s.id for s in wn.synset('test-02-n').senses()] == [
169+
"test-02-bar-n", "test-02-foo-n",
170+
]

wn/_add.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -665,9 +665,11 @@ def _insert_senses(
665665
progress: ProgressHandler
666666
) -> None:
667667
progress.set(status='Senses')
668-
ssrank = {s: i
669-
for ss in _local_synsets(synsets)
670-
for i, s in enumerate(ss.get('members', []))}
668+
ssrank = {
669+
(ss['id'], _id): i
670+
for ss in _local_synsets(synsets)
671+
for i, _id in enumerate(ss.get('members', []))
672+
}
671673
query = f'''
672674
INSERT INTO senses
673675
VALUES (null,
@@ -682,14 +684,23 @@ def _insert_senses(
682684
'''
683685
for batch in _batch(entries):
684686
data = [
685-
(sense['id'],
686-
lexid,
687-
entry['id'], lexidmap.get(entry['id'], lexid),
688-
i,
689-
sense['synset'], lexidmap.get(sense['synset'], lexid),
690-
ssrank.get(sense['id'], DEFAULT_MEMBER_RANK),
691-
sense.get('lexicalized', True),
692-
sense['meta'])
687+
(
688+
sense['id'],
689+
lexid,
690+
entry['id'], lexidmap.get(entry['id'], lexid),
691+
i,
692+
sense['synset'], lexidmap.get(sense['synset'], lexid),
693+
# members can be sense or entry IDs
694+
ssrank.get(
695+
(sense['synset'], sense['id']),
696+
ssrank.get(
697+
(sense['synset'], entry['id']),
698+
DEFAULT_MEMBER_RANK
699+
)
700+
),
701+
sense.get('lexicalized', True),
702+
sense['meta']
703+
)
693704
for entry in batch
694705
for i, sense in enumerate(_local_senses(_senses(entry)))
695706
]

0 commit comments

Comments
 (0)