
Commit 4fd146b

moved FTS3 related code to fts3.py
1 parent e858a5a commit 4fd146b

3 files changed (+133, −122 lines)

sqlitefts/__init__.py

+4 −2

@@ -1,4 +1,6 @@
-from .tokenizer import Tokenizer, make_tokenizer_module, register_tokenizer
+from .tokenizer import register_tokenizer
+from .fts3 import Tokenizer, make_tokenizer_module
 from . import tokenizer, ranking
 
-__all__ = ["Tokenizer", "make_tokenizer_module", "register_tokenizer", "tokenizer", "ranking"]
+__all__ = ["Tokenizer", "make_tokenizer_module", "register_tokenizer",
+           "tokenizer", "ranking"]

sqlitefts/fts3.py

+129
@@ -0,0 +1,129 @@
+# coding: utf-8
+"""
+PoC SQLite FTS3 tokenizer in Python
+"""
+from __future__ import print_function, unicode_literals
+
+from .tokenizer import ffi, SQLITE_OK, SQLITE_DONE
+
+ffi.cdef('''
+typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
+typedef struct sqlite3_tokenizer sqlite3_tokenizer;
+typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
+struct sqlite3_tokenizer_module {
+    int iVersion;
+    int (*xCreate)(
+        int argc, const char *const*argv, sqlite3_tokenizer **ppTokenizer);
+    int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
+    int (*xOpen)(
+        sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes,
+        sqlite3_tokenizer_cursor **ppCursor);
+    int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
+    int (*xNext)(
+        sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes,
+        int *piStartOffset, int *piEndOffset, int *piPosition);
+    int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
+};
+
+struct sqlite3_tokenizer {
+    const sqlite3_tokenizer_module *pModule;
+    void *t;
+};
+
+struct sqlite3_tokenizer_cursor {
+    sqlite3_tokenizer *pTokenizer;
+    void *tokens;
+    size_t pos;
+    size_t offset;
+};
+''')
+
+
+class Tokenizer:
+    """ Tokenizer base class """
+
+    def tokenize(self, text):
+        """
+        Tokenize the given unicode text. Yields each token along with its
+        start position (in bytes) and end position (in bytes).
+        """
+        yield text, 0, len(text.encode('utf-8'))
+
+
+tokenizer_modules = {}
+"""hold references to prevent GC"""
+
+
+def make_tokenizer_module(tokenizer):
+    """ make tokenizer module """
+    if tokenizer in tokenizer_modules:
+        return tokenizer_modules[tokenizer]
+
+    t = ffi.new_handle(tokenizer)
+    tokenizers = {}
+    cursors = {}
+
+    @ffi.callback('int(int, const char *const*, sqlite3_tokenizer **)')
+    def xcreate(argc, argv, ppTokenizer):
+        tkn = ffi.new('sqlite3_tokenizer *')
+        tkn.t = t
+        tokenizers[int(ffi.cast('intptr_t', tkn))] = tkn
+        ppTokenizer[0] = tkn
+        return SQLITE_OK
+
+    @ffi.callback('int(sqlite3_tokenizer *)')
+    def xdestroy(pTokenizer):
+        del tokenizers[int(ffi.cast('intptr_t', pTokenizer))]
+        return SQLITE_OK
+
+    @ffi.callback(
+        'int(sqlite3_tokenizer*, const char *, int, sqlite3_tokenizer_cursor **)'
+    )
+    def xopen(pTokenizer, pInput, nInput, ppCursor):
+        cur = ffi.new('sqlite3_tokenizer_cursor *')
+        tokenizer = ffi.from_handle(pTokenizer.t)
+        tokens = tokenizer.tokenize(ffi.string(pInput).decode('utf-8'))
+        tknh = ffi.new_handle(tokens)
+        cur.pTokenizer = pTokenizer
+        cur.tokens = tknh
+        cur.pos = 0
+        cur.offset = 0
+        cursors[int(ffi.cast('intptr_t', cur))] = cur, tknh
+        ppCursor[0] = cur
+        return SQLITE_OK
+
+    @ffi.callback(
+        'int(sqlite3_tokenizer_cursor*, const char **, int *, int *, int *, int *)'
+    )
+    def xnext(pCursor, ppToken, pnBytes, piStartOffset, piEndOffset,
+              piPosition):
+        try:
+            cur = pCursor[0]
+            tokens = ffi.from_handle(cur.tokens)
+            while True:
+                normalized, inputBegin, inputEnd = next(tokens)
+                normalized = normalized.encode('utf-8')
+                if normalized:
+                    break
+
+            ppToken[0] = ffi.new('char []', normalized)  # ?? cffi buffer may be freed once this callback returns
+            pnBytes[0] = len(normalized)
+            piStartOffset[0] = inputBegin
+            piEndOffset[0] = inputEnd
+            cur.offset = inputEnd
+            piPosition[0] = cur.pos
+            cur.pos += 1
+        except StopIteration:
+            return SQLITE_DONE
+        return SQLITE_OK
+
+    @ffi.callback('int(sqlite3_tokenizer_cursor *)')
+    def xclose(pCursor):
+        del cursors[int(ffi.cast('intptr_t', pCursor))]
+        return SQLITE_OK
+
+    tokenizer_module = ffi.new("sqlite3_tokenizer_module *",
+                               [0, xcreate, xdestroy, xopen, xclose, xnext])
+    tokenizer_modules[tokenizer] = (tokenizer_module, xcreate, xdestroy, xopen,
+                                    xclose, xnext)
+    return tokenizer_module
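
For context, a sketch of how the moved API fits together: subclass Tokenizer, wrap an instance with make_tokenizer_module, and register the module on a connection with register_tokenizer. The whitespace tokenizer and the name simple_py below are illustrative, not part of this commit; and since register_tokenizer uses buffer(), this assumes Python 2 and an SQLite build that permits fts3_tokenizer() (see enable_fts3_tokenizer in tokenizer.py).

    import sqlite3

    import sqlitefts


    class SimpleTokenizer(sqlitefts.Tokenizer):
        """Illustrative tokenizer: splits on whitespace, lowercases tokens."""
        def tokenize(self, text):
            pos = 0
            for token in text.split():
                start = text.index(token, pos)
                pos = start + len(token)
                # FTS3 expects byte offsets into the UTF-8 encoded input.
                b_start = len(text[:start].encode('utf-8'))
                b_end = b_start + len(token.encode('utf-8'))
                yield token.lower(), b_start, b_end


    conn = sqlite3.connect(':memory:')
    tm = sqlitefts.make_tokenizer_module(SimpleTokenizer())
    sqlitefts.register_tokenizer(conn, 'simple_py', tm)
    conn.execute("CREATE VIRTUAL TABLE docs USING fts4(content, tokenize=simple_py)")
    conn.execute("INSERT INTO docs VALUES ('Hello FTS World')")
    print(conn.execute("SELECT content FROM docs WHERE docs MATCH 'fts'").fetchall())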

sqlitefts/tokenizer.py

-120
@@ -34,36 +34,6 @@
     void *ob_type;
     sqlite3 *db;
 } PyObject;
-
-typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
-typedef struct sqlite3_tokenizer sqlite3_tokenizer;
-typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
-struct sqlite3_tokenizer_module {
-    int iVersion;
-    int (*xCreate)(
-        int argc, const char *const*argv, sqlite3_tokenizer **ppTokenizer);
-    int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
-    int (*xOpen)(
-        sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes,
-        sqlite3_tokenizer_cursor **ppCursor);
-    int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
-    int (*xNext)(
-        sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes,
-        int *piStartOffset, int *piEndOffset, int *piPosition);
-    int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
-};
-
-struct sqlite3_tokenizer {
-    const sqlite3_tokenizer_module *pModule;
-    void *t;
-};
-
-struct sqlite3_tokenizer_cursor {
-    sqlite3_tokenizer *pTokenizer;
-    void *tokens;
-    size_t pos;
-    size_t offset;
-};
 ''')
 
 if sys.platform == 'win32':
@@ -94,100 +64,10 @@ def enable_fts3_tokenizer(c):
     del f
 
 
-class Tokenizer:
-    """ Tokenizer base class """
-
-    def tokenize(text):
-        """
-        Tokenize given unicode text. Yields each tokenized token,
-        start position(in bytes), end positon(in bytes)
-        """
-        yield text, 0, len(text.encode('utf-8'))
-
-
-tokenizer_modules = {}
-"""hold references to prevent GC"""
-
-
-def make_tokenizer_module(tokenizer):
-    """ make tokenizer module """
-    if tokenizer in tokenizer_modules:
-        return tokenizer_modules[tokenizer]
-
-    t = ffi.new_handle(tokenizer)
-    tokenizers = {}
-    cursors = {}
-
-    @ffi.callback('int(int, const char *const*, sqlite3_tokenizer **)')
-    def xcreate(argc, argv, ppTokenizer):
-        tkn = ffi.new('sqlite3_tokenizer *')
-        tkn.t = t
-        tokenizers[int(ffi.cast('intptr_t', tkn))] = tkn
-        ppTokenizer[0] = tkn
-        return SQLITE_OK
-
-    @ffi.callback('int(sqlite3_tokenizer *)')
-    def xdestroy(pTokenizer):
-        del tokenizers[int(ffi.cast('intptr_t', pTokenizer))]
-        return SQLITE_OK
-
-    @ffi.callback(
-        'int(sqlite3_tokenizer*, const char *, int, sqlite3_tokenizer_cursor **)'
-    )
-    def xopen(pTokenizer, pInput, nInput, ppCursor):
-        cur = ffi.new('sqlite3_tokenizer_cursor *')
-        tokenizer = ffi.from_handle(pTokenizer.t)
-        tokens = tokenizer.tokenize(ffi.string(pInput).decode('utf-8'))
-        tknh = ffi.new_handle(tokens)
-        cur.pTokenizer = pTokenizer
-        cur.tokens = tknh
-        cur.pos = 0
-        cur.offset = 0
-        cursors[int(ffi.cast('intptr_t', cur))] = cur, tknh
-        ppCursor[0] = cur
-        return SQLITE_OK
-
-    @ffi.callback(
-        'int(sqlite3_tokenizer_cursor*, const char **, int *, int *, int *, int *)'
-    )
-    def xnext(pCursor, ppToken, pnBytes, piStartOffset, piEndOffset,
-              piPosition):
-        try:
-            cur = pCursor[0]
-            tokens = ffi.from_handle(cur.tokens)
-            while True:
-                normalized, inputBegin, inputEnd = next(tokens)
-                normalized = normalized.encode('utf-8')
-                if normalized:
-                    break
-
-            ppToken[0] = ffi.new('char []', normalized)  # ??
-            pnBytes[0] = len(normalized)
-            piStartOffset[0] = inputBegin
-            piEndOffset[0] = inputEnd
-            cur.offset = inputEnd
-            piPosition[0] = cur.pos
-            cur.pos += 1
-        except StopIteration:
-            return SQLITE_DONE
-        return SQLITE_OK
-
-    @ffi.callback('int(sqlite3_tokenizer_cursor *)')
-    def xclose(pCursor):
-        del cursors[int(ffi.cast('intptr_t', pCursor))]
-        return SQLITE_OK
-
-    tokenizer_module = ffi.new("sqlite3_tokenizer_module *",
-                               [0, xcreate, xdestroy, xopen, xclose, xnext])
-    tokenizer_modules[tokenizer] = (xcreate, xdestroy, xopen, xclose, xnext)
-    return tokenizer_module
-
-
 def register_tokenizer(c, name, tokenizer_module):
     """ register tokenizer module with SQLite connection. """
     module_addr = int(ffi.cast('uintptr_t', tokenizer_module))
     address_blob = buffer(struct.pack("P", module_addr))
     enable_fts3_tokenizer(c)
     r = c.execute('SELECT fts3_tokenizer(?, ?)', (name, address_blob))
-    tokenizer_modules[module_addr] = tokenizer_module
     return r
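
One observation on register_tokenizer: it hands the tokenizer module to SQLite by packing the raw pointer value into a blob, and the buffer() call ties it to Python 2. A sketch of the equivalent address packing on Python 3 (hypothetical helper, not part of this commit):

    import struct

    from sqlitefts.tokenizer import ffi


    def address_blob(tokenizer_module):
        # Pack the C pointer value of the tokenizer module into a
        # native-pointer-sized blob, as 'SELECT fts3_tokenizer(?, ?)' expects.
        addr = int(ffi.cast('uintptr_t', tokenizer_module))
        return struct.pack('P', addr)  # bytes acts as a BLOB on Python 3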
