Skip to content

Commit 83447f4

Browse files
committed
add 'fts5' module to write FTS5 tokenizer
see #5
1 parent 52a8b82 commit 83447f4

File tree

4 files changed

+293
-38
lines changed

4 files changed

+293
-38
lines changed

sqlitefts/__init__.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
from .tokenizer import register_tokenizer
2-
from .fts3 import Tokenizer, make_tokenizer_module
1+
from .fts3 import Tokenizer, make_tokenizer_module, register_tokenizer
32
from . import tokenizer, ranking
43

54
__all__ = ["Tokenizer", "make_tokenizer_module", "register_tokenizer",

sqlitefts/fts3.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,13 @@
33
PoC SQLite FTS5 tokenizer in Python
44
"""
55
from __future__ import print_function, unicode_literals
6+
import sys
7+
import struct
68

7-
from .tokenizer import ffi, SQLITE_OK, SQLITE_DONE
9+
from .tokenizer import (ffi, dll, get_db_from_connection, SQLITE_OK,
10+
SQLITE_DONE)
11+
12+
SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER = 1004
813

914
ffi.cdef('''
1015
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
@@ -127,3 +132,21 @@ def xclose(pCursor):
127132
tokenizer_modules[tokenizer] = (tokenizer_module, xcreate, xdestroy, xopen,
128133
xclose, xnext)
129134
return tokenizer_module
135+
136+
137+
def enable_fts3_tokenizer(c):
    """Enable the two-argument form of fts3_tokenizer() on a connection.

    Issues SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER through sqlite3_db_config
    on the raw sqlite3* handle extracted from the DB-API connection.
    Returns True when SQLite reports success (rc == 0).
    """
    handle = get_db_from_connection(c)
    rc = dll.sqlite3_db_config(
        handle, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER,
        ffi.cast('int', 1), ffi.NULL)
    return rc == 0
142+
143+
144+
def register_tokenizer(c, name, tokenizer_module):
    """Register an FTS3/4 tokenizer module with a SQLite connection.

    The native address of the tokenizer module struct is packed into a
    pointer-sized blob and handed to SQLite via the fts3_tokenizer() SQL
    function. Returns the cursor from that SELECT.
    """
    # SQLite wants the module's raw address as a pointer-sized blob ("P").
    addr = int(ffi.cast('uintptr_t', tokenizer_module))
    blob = struct.pack("P", addr)
    if sys.version_info.major == 2:
        # Python 2's sqlite3 binds BLOBs from buffer objects, not str.
        blob = buffer(blob)  # noqa: F821
    # Best-effort: newer SQLite disables two-arg fts3_tokenizer() by default.
    enable_fts3_tokenizer(c)
    return c.execute('SELECT fts3_tokenizer(?, ?)', (name, blob))

sqlitefts/tokenizer.py

+8-35
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,13 @@
33
a proof of concept implementation of SQLite FTS tokenizers in Python
44
"""
55
from __future__ import print_function, unicode_literals
6-
76
import sys
8-
import struct
97

108
from cffi import FFI
119

1210
SQLITE_OK = 0
1311
SQLITE_DONE = 101
1412

15-
if sys.version_info.major == 2:
16-
global buffer
17-
else:
18-
buffer = lambda x: x
19-
2013
ffi = FFI()
2114
ffi.cdef('''
2215
typedef struct sqlite3 sqlite3;
@@ -43,31 +36,11 @@
4336
dll = ffi.dlopen(find_library("sqlite3"))
4437

4538

46-
def f():
47-
SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER = 1004
48-
49-
def enable_fts3_tokenizer(c):
50-
db = getattr(c, '_db', None)
51-
if db:
52-
# pypy's SQLite3 connection has _db using cffi
53-
db = ffi.cast('sqlite3*', db)
54-
else:
55-
db = ffi.cast('PyObject *', id(c)).db
56-
rc = dll.sqlite3_db_config(db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER,
57-
ffi.cast('int', 1), ffi.NULL)
58-
return rc == 0
59-
60-
return enable_fts3_tokenizer
61-
62-
63-
enable_fts3_tokenizer = f()
64-
del f
65-
66-
67-
def register_tokenizer(c, name, tokenizer_module):
68-
""" register tokenizer module with SQLite connection. """
69-
module_addr = int(ffi.cast('uintptr_t', tokenizer_module))
70-
address_blob = buffer(struct.pack("P", module_addr))
71-
enable_fts3_tokenizer(c)
72-
r = c.execute('SELECT fts3_tokenizer(?, ?)', (name, address_blob))
73-
return r
39+
def get_db_from_connection(c):
    """Return the underlying ``sqlite3*`` handle of a DB-API connection.

    PyPy exposes the handle directly as a cffi pointer on ``_db``; on
    CPython we cast the connection object's address and read the ``db``
    field of its C-level struct.
    """
    raw = getattr(c, '_db', None)
    if raw:
        # pypy's SQLite3 connection has _db using cffi
        return ffi.cast('sqlite3*', raw)
    # CPython: reach into the sqlite3.Connection C struct via the
    # object's address to pull out the db pointer.
    return ffi.cast('PyObject *', id(c)).db

tests/test_fts5.py

+260
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
# coding: utf-8
2+
from __future__ import print_function, unicode_literals
3+
import sqlite3
4+
import re
5+
6+
from cffi import FFI
7+
8+
from sqlitefts import fts5
9+
from sqlitefts import Tokenizer
10+
11+
ffi = FFI()
12+
13+
14+
class SimpleTokenizer(Tokenizer):
    """Minimal word tokenizer used by the FTS5 tests.

    tokenize() yields (token, start, end) tuples where start/end are byte
    offsets into the UTF-8 encoding of the input, as the FTS5 tokenizer
    API expects.
    """
    _p = re.compile(r'\w+', re.UNICODE)

    def tokenize(self, text):
        for match in self._p.finditer(text):
            begin, end = match.span()
            token = text[begin:end]
            token_len = len(token.encode('utf-8'))
            byte_start = len(text[:begin].encode('utf-8'))
            yield token, byte_start, byte_start + token_len
24+
25+
26+
def test_fts5_api_from_db():
    """The fts5_api pointer pulled from a connection looks sane."""
    conn = sqlite3.connect(':memory:')
    api = fts5.fts5_api_from_db(conn)
    print(api)
    assert api.iVersion == 2
    assert api.xCreateTokenizer
    conn.close()
33+
34+
35+
def test_make_tokenizer():
    """make_fts5_tokenizer fills in all three tokenizer callbacks."""
    conn = sqlite3.connect(':memory:')
    module = fts5.make_fts5_tokenizer(SimpleTokenizer())
    for callback in ('xCreate', 'xDelete', 'xTokenize'):
        assert getattr(module, callback) is not None
    conn.close()
42+
43+
44+
def test_register_tokenizer():
    """A tokenizer module can be registered under an arbitrary name."""
    name = 'simpe'
    conn = sqlite3.connect(':memory:')
    module = fts5.make_fts5_tokenizer(SimpleTokenizer())
    assert fts5.register_tokenizer(conn, name, module)
    conn.close()
50+
51+
52+
def test_register_tokenizer_with_destroy():
    """on_destroy receives the registration context when the connection closes.

    Fix: the original captured the callback argument with ``nonlocal``,
    which is a SyntaxError on Python 2 even though this project still
    supports it (cf. the ``buffer``/``__future__`` handling elsewhere).
    A one-element list works on both major versions.
    """
    name = 'simpe'
    c = sqlite3.connect(':memory:')

    # Mutable container so the closure can report back without nonlocal.
    arg_on_destroy = []
    context = "hello"

    def on_destroy(x):
        arg_on_destroy.append(x)

    tm = fts5.make_fts5_tokenizer(SimpleTokenizer())
    assert fts5.register_tokenizer(
        c, name, tm, context=context, on_destroy=on_destroy)
    c.close()
    # Closing the connection must have triggered the destructor exactly once.
    assert arg_on_destroy == [context]
68+
69+
70+
def test_createtable():
    """CREATE VIRTUAL TABLE with a custom tokenizer shows up in sqlite_master."""
    conn = sqlite3.connect(':memory:')
    conn.row_factory = sqlite3.Row
    name = 'super_simple'
    sql = "CREATE VIRTUAL TABLE fts USING fts5(w, tokenize={})".format(name)
    fts5.register_tokenizer(
        conn, name, fts5.make_fts5_tokenizer(SimpleTokenizer()))
    conn.execute(sql)

    row = conn.execute(
        "SELECT * FROM sqlite_master WHERE type='table' AND name='fts'"
    ).fetchone()
    assert row
    assert row[str('type')] == 'table'
    assert row[str('name')] == 'fts'
    assert row[str('tbl_name')] == 'fts'
    # SQLite stores the original CREATE statement (case may differ).
    assert row[str('sql')].upper() == sql.upper()
    conn.close()
87+
88+
89+
def test_insert():
    """A row round-trips through an FTS5 table using the custom tokenizer."""
    conn = sqlite3.connect(':memory:')
    conn.row_factory = sqlite3.Row
    name = 'super_simple'
    content = 'これは日本語で書かれています'
    fts5.register_tokenizer(
        conn, name, fts5.make_fts5_tokenizer(SimpleTokenizer()))
    conn.execute(
        "CREATE VIRTUAL TABLE fts USING FTS5(content, tokenize={})".format(
            name))
    cursor = conn.execute('INSERT INTO fts VALUES(?)', (content, ))
    assert cursor.rowcount == 1
    row = conn.execute("SELECT * FROM fts").fetchone()
    assert row
    assert row[str('content')] == content
    conn.close()
105+
106+
107+
def test_match():
    """MATCH honours tokens from the custom tokenizer (ASCII and Japanese)."""
    conn = sqlite3.connect(':memory:')
    conn.row_factory = sqlite3.Row
    name = 'super_simple'
    contents = [('abc def', ), ('abc xyz', ), ('あいうえお かきくけこ', ),
                ('あいうえお らりるれろ', )]
    fts5.register_tokenizer(
        conn, name, fts5.make_fts5_tokenizer(SimpleTokenizer()))
    conn.execute(
        "CREATE VIRTUAL TABLE fts USING FTS5(content, tokenize={})".format(
            name))
    cursor = conn.executemany('INSERT INTO fts VALUES(?)', contents)
    assert cursor.rowcount == 4
    assert len(conn.execute("SELECT * FROM fts").fetchall()) == 4

    # Terms present in exactly one row come back with the right content.
    rows = conn.execute("SELECT * FROM fts WHERE fts MATCH 'abc'").fetchall()
    assert len(rows) == 2
    rows = conn.execute("SELECT * FROM fts WHERE fts MATCH 'def'").fetchall()
    assert len(rows) == 1
    assert rows[0][str('content')] == contents[0][0]
    rows = conn.execute("SELECT * FROM fts WHERE fts MATCH 'xyz'").fetchall()
    assert len(rows) == 1
    assert rows[0][str('content')] == contents[1][0]
    rows = conn.execute("SELECT * FROM fts WHERE fts MATCH 'zzz'").fetchall()
    assert len(rows) == 0

    # Same checks for Japanese tokens (byte offsets exercised).
    rows = conn.execute("SELECT * FROM fts WHERE fts MATCH 'あいうえお'").fetchall()
    assert len(rows) == 2
    rows = conn.execute("SELECT * FROM fts WHERE fts MATCH 'かきくけこ'").fetchall()
    assert len(rows) == 1
    assert rows[0][str('content')] == contents[2][0]
    rows = conn.execute("SELECT * FROM fts WHERE fts MATCH 'らりるれろ'").fetchall()
    assert len(rows) == 1
    assert rows[0][str('content')] == contents[3][0]
    rows = conn.execute("SELECT * FROM fts WHERE fts MATCH 'まみむめも'").fetchall()
    assert len(rows) == 0
    conn.close()
139+
140+
141+
def test_full_text_index_queries():
    """Exercise FTS5 query syntax against a table using the custom tokenizer:
    plain terms, column filters, prefixes, phrases, '+' and NEAR.

    Fix: the original contained an assert comparing the result of the
    'body:binding' query with itself — a copy-paste tautology that could
    never fail — which has been removed.
    """
    name = 'super_simple'
    docs = [(
        'README',
        'sqlitefts-python provides binding for tokenizer of SQLite Full-Text search(FTS3/4). It allows you to write tokenizers in Python.'
    ), ('LICENSE',
        '''Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:'''),
        ('日本語', 'あいうえお かきくけこ さしすせそ たちつてと なにぬねの')]
    with sqlite3.connect(':memory:') as c:
        c.row_factory = sqlite3.Row
        fts5.register_tokenizer(
            c, name, fts5.make_fts5_tokenizer(SimpleTokenizer()))
        c.execute(
            "CREATE VIRTUAL TABLE docs USING FTS5(title, body, tokenize={})".format(
                name))
        c.executemany("INSERT INTO docs(title, body) VALUES(?, ?)", docs)

        # Plain terms — the tokenizer does no stemming ('bind' != 'binding').
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'Python'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'bind'").fetchall()) == 0
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'binding'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'to'").fetchall()) == 2
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'あいうえお'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'らりるれろ'").fetchall()) == 0

        # A column-filtered query returns the same row as the unfiltered one.
        assert (c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'binding'").fetchall()[0]
            == c.execute(
                "SELECT * FROM docs WHERE docs MATCH 'body:binding'").fetchall()[0])
        assert (c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'あいうえお'").fetchall()[0]
            == c.execute(
                "SELECT * FROM docs WHERE docs MATCH 'body:あいうえお'").fetchall()[0])
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'title:bind'").fetchall()) == 0
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'title:README'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'title:日本語'").fetchall()) == 1

        # Two terms without quotes are an implicit AND.
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'to in'").fetchall()) == 2

        # Prefix queries.
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'Py*'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'Z*'").fetchall()) == 0
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'あ*'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'ん*'").fetchall()) == 0

        # Quoted phrases require adjacency; bare terms do not.
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'tokenizer SQLite'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH '\"tokenizer SQLite\"'").fetchall()) == 0
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'あいうえお たちつてと'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH '\"あいうえお たちつてと\"'").fetchall()) == 0

        # '+' joins two terms into a phrase.
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'tok* + SQL*'").fetchall()) == 0
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'tok* of SQL*'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'あ* + さ*'").fetchall()) == 0
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'あ* かきくけこ さ*'").fetchall()) == 1

        # NEAR with the default and explicit token distances.
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'NEAR(tokenizer SQLite)'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'NEAR(binding SQLite, 2)'").fetchall()) == 0
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'NEAR(binding SQLite, 3)'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'NEAR(あいうえお たちつてと)'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'NEAR(あいうえお たちつてと, 2)'").fetchall()) == 1
        assert len(c.execute(
            "SELECT * FROM docs WHERE docs MATCH 'NEAR(あいうえお たちつてと, 3)'").fetchall()) == 1

0 commit comments

Comments
 (0)