Skip to content

Commit c52b6f3

Browse files
committed
add 'fts5' module for writing FTS5 tokenizers
see #5
1 parent 52a8b82 commit c52b6f3

File tree

5 files changed

+409
-38
lines changed

5 files changed

+409
-38
lines changed

sqlitefts/__init__.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
from .tokenizer import register_tokenizer
2-
from .fts3 import Tokenizer, make_tokenizer_module
1+
from .fts3 import Tokenizer, make_tokenizer_module, register_tokenizer
32
from . import tokenizer, ranking
43

54
__all__ = ["Tokenizer", "make_tokenizer_module", "register_tokenizer",

sqlitefts/fts3.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,13 @@
33
PoC SQLite FTS5 tokenizer in Python
44
"""
55
from __future__ import print_function, unicode_literals
6+
import sys
7+
import struct
68

7-
from .tokenizer import ffi, SQLITE_OK, SQLITE_DONE
9+
from .tokenizer import (ffi, dll, get_db_from_connection, SQLITE_OK,
10+
SQLITE_DONE)
11+
12+
SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER = 1004
813

914
ffi.cdef('''
1015
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
@@ -127,3 +132,24 @@ def xclose(pCursor):
127132
tokenizer_modules[tokenizer] = (tokenizer_module, xcreate, xdestroy, xopen,
128133
xclose, xnext)
129134
return tokenizer_module
135+
136+
137+
def enable_fts3_tokenizer(c):
    """Enable the two-argument form of ``fts3_tokenizer()`` on connection *c*.

    Recent SQLite builds disable passing a tokenizer-module pointer to
    ``fts3_tokenizer()`` by default for security reasons; this flips the
    ``SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER`` switch back on so that
    :func:`register_tokenizer` can register a module address.

    :param c: a DB-API connection (CPython sqlite3 or PyPy) wrapping sqlite3*
    :return: True if ``sqlite3_db_config`` reported success
    """
    db = get_db_from_connection(c)
    rc = dll.sqlite3_db_config(db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER,
                               ffi.cast('int', 1), ffi.NULL)
    # Use the imported SQLITE_OK constant rather than a bare 0 for
    # consistency with the rest of the module.
    return rc == SQLITE_OK
142+
143+
144+
def register_tokenizer(c, name, tokenizer_module):
    """ register tokenizer module with SQLite connection. """
    # fts3_tokenizer() takes the tokenizer module as a pointer-sized blob:
    # pack the module's address into native pointer format.
    addr = int(ffi.cast('uintptr_t', tokenizer_module))
    blob = struct.pack("P", addr)
    if sys.version_info.major == 2:
        # Python 2's sqlite3 needs a buffer() to bind BLOB parameters.
        blob = buffer(blob)  # noqa: F821
    # The two-argument fts3_tokenizer() form is disabled by default.
    enable_fts3_tokenizer(c)
    return c.execute('SELECT fts3_tokenizer(?, ?)', (name, blob))


__all__ = ["Tokenizer", "make_tokenizer_module", "register_tokenizer"]

sqlitefts/fts5.py

+113
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# coding: utf-8
2+
"""
3+
PoC SQLite FTS5 tokenizer in Python
4+
"""
5+
from __future__ import print_function, unicode_literals
6+
import struct
7+
8+
from .tokenizer import ffi, SQLITE_OK
9+
10+
# Tokenize-reason flags passed to xTokenize (mirror the constants in fts5.h).
FTS5_TOKENIZE_QUERY = 0x0001
FTS5_TOKENIZE_PREFIX = 0x0002
FTS5_TOKENIZE_DOCUMENT = 0x0004
FTS5_TOKENIZE_AUX = 0x0008
# xToken() flag: token occupies the same position as the previous token.
FTS5_TOKEN_COLOCATED = 0x0001

# Minimal cdef of the FTS5 extension API needed to register a tokenizer.
# NOTE(review): only a leading prefix of the real fts5_api struct is
# declared here, so only iVersion and xCreateTokenizer may be accessed
# through it — confirm against the fts5.h of the linked SQLite.
ffi.cdef('''
typedef struct fts5_api fts5_api;
typedef struct fts5_tokenizer fts5_tokenizer;
typedef struct Fts5Tokenizer Fts5Tokenizer;

struct fts5_api {
  int iVersion;
  int (*xCreateTokenizer)(
    fts5_api *pApi, const char *zName, void *pContext,
    fts5_tokenizer *pTokenizer,void (*xDestroy)(void*));
};

struct fts5_tokenizer {
  int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut);
  void (*xDelete)(Fts5Tokenizer*);
  int (*xTokenize)(
    Fts5Tokenizer*, void *pCtx, int flags, const char *pText, int nText,
    int (*xToken)(
      void *pCtx, int tflags,const char *pToken,
      int nToken, int iStart, int iEnd));
};
''')

# Keyed by tokenizer object (make_fts5_tokenizer) and by name
# (register_tokenizer); keeps cffi callbacks/handles alive while SQLite
# may still call into them.
fts5_tokenizers = {}
"""hold references to prevent GC"""
41+
42+
43+
def fts5_api_from_db(c):
    """Obtain the ``fts5_api*`` of connection *c* via ``SELECT fts5()``.

    The fts5() SQL function returns the API struct's address packed into a
    blob; unpack it and cast it back to a usable cffi pointer.
    """
    cur = c.cursor()
    try:
        cur.execute('SELECT fts5()')
        (blob,) = cur.fetchone()
        api_addr = struct.unpack("P", blob)[0]
        return ffi.cast('fts5_api*', api_addr)
    finally:
        # Always release the cursor, even if the query or unpack fails.
        cur.close()
52+
53+
54+
def register_tokenizer(c, name, tokenizer, context=None, on_destroy=None):
    """Register *tokenizer* under *name* with FTS5 on connection *c*.

    References to the context handle and destroy callback are stashed in
    the module-level ``fts5_tokenizers`` dict so they are not garbage
    collected while SQLite may still use them.

    :return: True if xCreateTokenizer returned SQLITE_OK
    """
    api = fts5_api_from_db(c)
    ctx_handle = ffi.new_handle(context)

    xDestroy = ffi.NULL
    if on_destroy is not None:

        @ffi.callback('void(void*)')
        def xDestroy(handle):
            on_destroy(ffi.from_handle(handle))

    # Keep the callback machinery alive for as long as the module lives.
    fts5_tokenizers[name] = (tokenizer, ctx_handle, xDestroy)
    rc = api.xCreateTokenizer(api, name.encode('utf-8'), ctx_handle,
                              tokenizer, xDestroy)
    return rc == SQLITE_OK
70+
71+
72+
def make_fts5_tokenizer(tokenizer):
    """Build an FTS5 ``fts5_tokenizer`` struct delegating to *tokenizer*.

    The returned cffi ``fts5_tokenizer *`` has three callbacks that forward
    to ``tokenizer.tokenize``.  The struct and callbacks are also stored in
    the module-level ``fts5_tokenizers`` dict so they outlive this call.
    """
    # Handle that keeps the Python tokenizer reachable from the C side.
    t = ffi.new_handle(tokenizer)
    # Maps the integer address of each handed-out Fts5Tokenizer* back to
    # its cdata, so xdelete can drop the reference later.
    tokenizers = {}

    @ffi.callback('int(void*, const char **, int, Fts5Tokenizer **)')
    def xcreate(ctx, argv, argc, ppOut):
        # is keeping ctor instead of keeping tokenizer instance good idea?
        # i.e. t = ctor(context, argv)
        # NOTE(review): every xcreate call hands out the SAME handle `t`,
        # so all FTS5 tables share one tokenizer instance, and an xdelete
        # following a second xcreate removes the shared entry — confirm
        # this is acceptable for a PoC.
        tkn = ffi.cast('Fts5Tokenizer *', t)
        tokenizers[int(ffi.cast('intptr_t', tkn))] = tkn
        ppOut[0] = tkn
        return SQLITE_OK

    @ffi.callback('void(Fts5Tokenizer *)')
    def xdelete(pTokenizer):
        # Drop the reference recorded by xcreate for this pointer value.
        del tokenizers[int(ffi.cast('intptr_t', pTokenizer))]
        return None

    @ffi.callback('int(Fts5Tokenizer *, void *, int, const char *, int, '
                  'int(void*, int, const char *, int, int, int))')
    def xtokenize(pTokenizer, pCtx, flags, pText, nText, xToken):
        # Recover the Python tokenizer from the handle handed out in xcreate.
        tokenizer = ffi.from_handle(ffi.cast('void *', pTokenizer))
        # NOTE(review): pText[0:nText] already yields bytes in cffi —
        # verify that the extra ffi.string() wrapper is needed/valid here.
        text = ffi.string(pText[0:nText]).decode('utf-8')
        for normalized, inputBegin, inputEnd in tokenizer.tokenize(text):
            normalized = normalized.encode('utf-8')
            if not normalized:
                # Skip empty tokens instead of emitting a zero-length one.
                continue

            # TODO: Synonym Support
            r = xToken(pCtx, 0, ffi.new('char[]', normalized), len(normalized),
                       inputBegin, inputEnd)
            if r != SQLITE_OK:
                # Propagate SQLite's error code to abort tokenization early.
                return r
        return SQLITE_OK

    # Keep the struct and its callbacks referenced so they are not GC'd
    # while SQLite can still invoke them.
    fts5_tokenizer = ffi.new("fts5_tokenizer *", [xcreate, xdelete, xtokenize])
    fts5_tokenizers[tokenizer] = (fts5_tokenizer, xcreate, xdelete, xtokenize)
    return fts5_tokenizer


__all__ = ["register_tokenizer", "make_fts5_tokenizer"]

sqlitefts/tokenizer.py

+8-35
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,13 @@
33
a proof of concept implementation of SQLite FTS tokenizers in Python
44
"""
55
from __future__ import print_function, unicode_literals
6-
76
import sys
8-
import struct
97

108
from cffi import FFI
119

1210
SQLITE_OK = 0
1311
SQLITE_DONE = 101
1412

15-
if sys.version_info.major == 2:
16-
global buffer
17-
else:
18-
buffer = lambda x: x
19-
2013
ffi = FFI()
2114
ffi.cdef('''
2215
typedef struct sqlite3 sqlite3;
@@ -43,31 +36,11 @@
4336
dll = ffi.dlopen(find_library("sqlite3"))
4437

4538

46-
def f():
47-
SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER = 1004
48-
49-
def enable_fts3_tokenizer(c):
50-
db = getattr(c, '_db', None)
51-
if db:
52-
# pypy's SQLite3 connection has _db using cffi
53-
db = ffi.cast('sqlite3*', db)
54-
else:
55-
db = ffi.cast('PyObject *', id(c)).db
56-
rc = dll.sqlite3_db_config(db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER,
57-
ffi.cast('int', 1), ffi.NULL)
58-
return rc == 0
59-
60-
return enable_fts3_tokenizer
61-
62-
63-
enable_fts3_tokenizer = f()
64-
del f
65-
66-
67-
def register_tokenizer(c, name, tokenizer_module):
68-
""" register tokenizer module with SQLite connection. """
69-
module_addr = int(ffi.cast('uintptr_t', tokenizer_module))
70-
address_blob = buffer(struct.pack("P", module_addr))
71-
enable_fts3_tokenizer(c)
72-
r = c.execute('SELECT fts3_tokenizer(?, ?)', (name, address_blob))
73-
return r
39+
def get_db_from_connection(c):
    """Return the underlying ``sqlite3*`` of DB-API connection *c*.

    On PyPy the connection exposes the handle directly as ``_db`` (a cffi
    object); on CPython the pointer is read out of the Connection object's
    C struct via the PyObject layout declared in this module's ffi.cdef.
    """
    raw = getattr(c, '_db', None)
    if raw:
        # pypy's SQLite3 connection has _db using cffi
        return ffi.cast('sqlite3*', raw)
    # CPython: reinterpret the object's address as the declared PyObject
    # layout and pull out its `db` member.
    return ffi.cast('PyObject *', id(c)).db

0 commit comments

Comments
 (0)