Commit a2360fa

corrected wrong xNext behavior

Merged suggested changes for issue #1 by saaj:
* Tokenizer.tokenize returns the tokenized text plus its start/end positions in the given text (in bytes), and is now a generator.
* sets position/length in the correct unit (bytes instead of the length of the unicode string)
* skips empty tokens
1 parent 759c34a commit a2360fa
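For orientation, here is a minimal sketch of a tokenizer that follows the contract described in the commit message: tokenize is a generator yielding each token together with its start/end offsets into the UTF-8 encoded input, in bytes. This example is illustrative only (the class name WhitespaceTokenizer is not part of this commit); it assumes the fts.Tokenizer base class changed below.

import re

import sqlitefts.sqlite_tokenizer as fts


class WhitespaceTokenizer(fts.Tokenizer):
    """Hypothetical example following the new tokenize() interface."""
    _words = re.compile(r'\S+')

    def tokenize(self, text):
        for m in self._words.finditer(text):
            # offsets must be byte positions in the UTF-8 encoding of `text`,
            # not character indexes
            start = len(text[:m.start()].encode('utf-8'))
            end = start + len(m.group().encode('utf-8'))
            yield m.group(), start, end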

File tree: 8 files changed (+179, −34 lines)


.gitignore (7 additions, 5 deletions)

@@ -1,5 +1,7 @@
-*.py[co]
-*.sw[po]
-__pycache__
-*~
-
+.tox/
+build/
+dist/
+env/
+sqlitefts.egg-info/
+*.pyc
+*.py~

setup.py (22 additions, 1 deletion)

@@ -1,7 +1,28 @@
 from setuptools import setup
+import sys
+
 
 setup(
     name="sqlitefts",
     version="0.1",
-    packages=["sqlitefts"]
+    packages=["sqlitefts"],
+    description='A Python binding of SQLite Full Text Search Tokenizer',
+    url='https://github.com/hideaki-t/igo-python/',
+    classifiers=[
+        'Development Status :: 2 - Pre-Alpha',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Operating System :: POSIX :: Linux',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Topic :: Database'
+        'Topic :: Software Development :: Libraries :: Python Modules'
+    ],
+    author='Hideaki Takahashi',
+    author_email='[email protected]',
+    license='MIT',
+    keywords=['SQLite', 'Full-text search', 'FTS'],
+    install_requires=['enum34'] if sys.version_info < (3, 4) else []
 )

sqlitefts/sqlite_tokenizer.py (30 additions, 15 deletions)

@@ -9,6 +9,16 @@
 from ctypes import POINTER, CFUNCTYPE
 import struct
 
+try:
+    from enum import Enum
+except:
+    pass
+
+
+class SQLiteResultCodes(Enum):
+    SQLITE_OK = 0
+    SQLITE_DONE = 101
+
 
 class sqlite3_tokenizer_module(ctypes.Structure):
     pass
@@ -46,8 +56,8 @@ class sqlite3_tokenizer_cursor(ctypes.Structure):
 class Tokenizer:
     """ Tokenizer base class """
     def tokenize(text):
-        """ Tokenizer given unicode text. Returns an iterator of token """
-        return text
+        """ Tokenize given unicode text. Yields each tokenized token, start position(in bytes), end positon(in bytes)"""
+        yield text, 0, len(text.encode('utf-8'))
 
 
 tokenizer_modules = {}
@@ -64,11 +74,11 @@ def xcreate(argc, argv, ppTokenizer):
         tkn.t = tokenizer
         tokenizers[ctypes.addressof(tkn)] = tkn
         ppTokenizer[0] = ctypes.pointer(tkn)
-        return 0
+        return SQLiteResultCodes.SQLITE_OK.value
 
     def xdestroy(pTokenizer):
         del(tokenizers[ctypes.addressof(pTokenizer[0])])
-        return 0
+        return SQLiteResultCodes.SQLITE_OK.value
 
     def xopen(pTokenizer, pInput, nInput, ppCursor):
         cur = sqlite3_tokenizer_cursor()
@@ -78,28 +88,33 @@ def xopen(pTokenizer, pInput, nInput, ppCursor):
         cur.offset = 0
         cursors[ctypes.addressof(cur)] = cur
         ppCursor[0] = ctypes.pointer(cur)
-        return 0
+        return SQLiteResultCodes.SQLITE_OK.value
 
     def xnext(pCursor, ppToken, pnBytes,
               piStartOffset, piEndOffset, piPosition):
         try:
             cur = pCursor[0]
-            token = next(cur.tokens).encode('utf-8')
-            tokenlen = len(token)
-            ppToken[0] = token
-            pnBytes[0] = tokenlen
-            piStartOffset[0] = cur.offset
-            cur.offset += tokenlen
-            piEndOffset[0] = cur.offset
+
+            while True:
+                normalized, inputBegin, inputEnd = next(cur.tokens)
+                normalized = normalized.encode('utf-8')
+                if normalized:
+                    break
+
+            ppToken[0] = normalized
+            pnBytes[0] = len(normalized)
+            piStartOffset[0] = inputBegin
+            piEndOffset[0] = inputEnd
+            cur.offset = inputEnd
             piPosition[0] = cur.pos
             cur.pos += 1
         except StopIteration:
-            return 101
-        return 0
+            return SQLiteResultCodes.SQLITE_DONE.value
+        return SQLiteResultCodes.SQLITE_OK.value
 
     def xclose(pCursor):
         del(cursors[ctypes.addressof(pCursor[0])])
-        return 0
+        return SQLiteResultCodes.SQLITE_OK.value
 
     tokenizer_module = sqlite3_tokenizer_module(
         0,
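To see how the module is meant to be wired up, here is a hedged usage sketch (not part of the diff). It reuses the hypothetical WhitespaceTokenizer sketched above; register_tokenizer and make_tokenizer_module are the functions exercised by the tests below, and the tokenizer name 'ws' and table name 'docs' are illustrative.

import sqlite3

import sqlitefts.sqlite_tokenizer as fts

# WhitespaceTokenizer is the illustrative tokenizer from the earlier sketch
conn = sqlite3.connect(':memory:')
module = fts.make_tokenizer_module(WhitespaceTokenizer())
fts.register_tokenizer(conn, 'ws', module)

conn.execute("CREATE VIRTUAL TABLE docs USING FTS4(tokenize=ws)")
conn.execute("INSERT INTO docs VALUES (?)", ('full text search with custom tokenizers',))
print(conn.execute("SELECT * FROM docs WHERE docs MATCH 'tokenizers'").fetchall())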

tests/test_base.py (8 additions, 3 deletions)

@@ -1,16 +1,21 @@
 # coding: utf-8
 from __future__ import print_function, unicode_literals
-import sys
-import os
 import sqlite3
 import ctypes
 import struct
+import re
 
 import sqlitefts.sqlite_tokenizer as fts
 
+
 class SimpleTokenizer(fts.Tokenizer):
+    _p = re.compile(r'\S+')
+
     def tokenize(self, text):
-        return iter(text.split(' '))
+        for m in self._p.finditer(text):
+            s, e = m.span()
+            yield text[s:e], s, e
+
 
 def test_make_tokenizer():
     c = sqlite3.connect(':memory:')
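For illustration (not part of the commit), the updated SimpleTokenizer now yields position information alongside each token; on ASCII input the character offsets from m.span() coincide with the byte offsets the interface expects:

# a rough usage example of the SimpleTokenizer above
tokens = list(SimpleTokenizer().tokenize('hello world'))
assert tokens == [('hello', 0, 5), ('world', 6, 11)]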

tests/test_base2.py (78 additions, 0 deletions, new file)

@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+import unittest
+import sqlite3
+import re
+
+import sqlitefts.sqlite_tokenizer as fts
+
+
+class BaseTokenizer(fts.Tokenizer):
+
+    _spliter = re.compile(r'\s+|\S+')
+    _nonws = re.compile(r'\S+')
+
+    def _normalize(self, token):
+        return token
+
+    def _tokenize(self, text):
+        pos = 0
+        for t in self._spliter.findall(text):
+            byteLen = len(t.encode('utf-8'))
+            if self._nonws.match(t):
+                yield self._normalize(t), pos, pos + byteLen
+            pos += byteLen
+
+    def tokenize(self, text):
+        return self._tokenize(text)
+
+
+class DebugTokenizer(BaseTokenizer):
+
+    _limit = 16
+
+    def _normalize(self, token):
+        if not self._limit:
+            raise RuntimeError()
+        self._limit -= 1
+
+        print(token, token[0:-1])
+        return token[0:-1]
+
+
+class OriginalDebugTokenizer(fts.Tokenizer):
+
+    _limit = 16
+
+    def tokenize(self, text):
+        if not self._limit:
+            raise RuntimeError()
+        self._limit -= 1
+
+        print(text, [w[0:-1] for w in text.split(' ')])
+        return (w[0:-1] for w in text.split(' '))
+
+
+class TestCase(unittest.TestCase):
+
+    def setUp(self):
+        name = 'test'
+        conn = sqlite3.connect(':memory:')
+
+        fts.register_tokenizer(conn, name, fts.make_tokenizer_module(DebugTokenizer()))
+        conn.execute('CREATE VIRTUAL TABLE fts USING FTS4(tokenize={})'.format(name))
+
+        self.testee = conn
+
+    def testZeroLengthToken(self):
+        result = self.testee.executemany('INSERT INTO fts VALUES(?)', [('Make things I',), (u'Some σ φχικλψ',)])
+        self.assertEqual(2, result.rowcount)
+
+    def testInfiniteRecursion(self):
+        contents = [('abc def',), ('abc xyz',)]
+        result = self.testee.executemany('INSERT INTO fts VALUES(?)', contents)
+        self.assertEqual(2, result.rowcount)
+
+        result = self.testee.execute("SELECT * FROM fts WHERE fts MATCH 'abc'").fetchall()
+        self.assertEqual(2, len(result))
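To make the zero-length-token case concrete (illustrative, not part of the commit): DebugTokenizer drops the last character of every token, so a one-character word such as 'I' normalizes to an empty string, which is the kind of empty token the reworked xnext loop now skips.

t = DebugTokenizer()
# DebugTokenizer also prints each token as a side effect; the yielded tuples are
#   [('Mak', 0, 4), ('thing', 5, 11), ('', 12, 13)]
# the trailing empty token is the case testZeroLengthToken exercises
tokens = list(t.tokenize('Make things I'))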

tests/test_igo.py (4 additions, 3 deletions)

@@ -1,7 +1,5 @@
 # coding: utf-8
 from __future__ import print_function, unicode_literals
-import sys
-import os
 import sqlite3
 import ctypes
 import struct
@@ -11,16 +9,19 @@
 import pytest
 igo = pytest.importorskip('igo')
 
+
 class IgoTokenizer(fts.Tokenizer):
     def __init__(self, path=None):
         self.tagger = igo.tagger.Tagger(path)
 
     def tokenize(self, text):
-        return iter([m.surface for m in self.tagger.parse(text)])
+        for m in self.tagger.parse(text):
+            yield m.surface, m.start, m.start + len(m.surface.encode('utf-8'))
 
 
 t = IgoTokenizer('./ipadic')
 
+
 def test_make_tokenizer():
     c = sqlite3.connect(':memory:')
     tokenizer_module = fts.make_tokenizer_module(t)

tests/test_tinysegmenter.py (7 additions, 1 deletion)

@@ -12,16 +12,22 @@
 import pytest
 ts = pytest.importorskip('tinysegmenter')
 
+
 class TinySegmenterTokenizer(fts.Tokenizer):
     def __init__(self, path=None):
         self.segmenter = ts.TinySegmenter()
 
     def tokenize(self, text):
-        return iter(self.segmenter.tokenize(text))
+        p = 0
+        for t in self.segmenter.tokenize(text):
+            np = text[p:].index(t)
+            yield t, np, len(t.encode('utf-8'))
+            p = np
 
 
 t = TinySegmenterTokenizer()
 
+
 def test_make_tokenizer():
     c = sqlite3.connect(':memory:')
     tokenizer_module = fts.make_tokenizer_module(t)

tox.ini (23 additions, 6 deletions)

@@ -1,12 +1,29 @@
 [tox]
-envlist = py27, py33
-#envlist = py27, py33, pypy
+envlist = py27, py33, py34
 
 [testenv]
 changedir=tests
-deps = pytest
-       igo-python
-       git+git://git.tuxfamily.org/gitroot/tinysegmente/tinysegmenter.git
+deps=
+    pytest
+    igo-python
+    git+git://git.tuxfamily.org/gitroot/tinysegmente/tinysegmenter.git
 #tinysegmenter
-commands = py.test -sv
+commands=
+    py.test -sv
+
+[testenv:pp34]
+deps=
+    {[testenv]deps}
+    enum34
+
+[testenv:py33]
+deps=
+    {[testenv:pp34]deps}
+[testenv:py27]
+deps=
+    {[testenv:pp34]deps}
+
+[testenv:pypy]
+deps=
+    {[testenv:pp34]deps}