   void *ob_type;
   sqlite3 *db;
 } PyObject;
-
-typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
-typedef struct sqlite3_tokenizer sqlite3_tokenizer;
-typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
-struct sqlite3_tokenizer_module {
-  int iVersion;
-  int (*xCreate)(
-    int argc, const char *const*argv, sqlite3_tokenizer **ppTokenizer);
-  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
-  int (*xOpen)(
-    sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes,
-    sqlite3_tokenizer_cursor **ppCursor);
-  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
-  int (*xNext)(
-    sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes,
-    int *piStartOffset, int *piEndOffset, int *piPosition);
-  int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
-};
-
-struct sqlite3_tokenizer {
-  const sqlite3_tokenizer_module *pModule;
-  void *t;
-};
-
-struct sqlite3_tokenizer_cursor {
-  sqlite3_tokenizer *pTokenizer;
-  void *tokens;
-  size_t pos;
-  size_t offset;
-};
 ''')

 if sys.platform == 'win32':
@@ -94,100 +64,10 @@ def enable_fts3_tokenizer(c):
     del f


-class Tokenizer:
-    """ Tokenizer base class """
-
-    def tokenize(text):
-        """
-        Tokenize given unicode text. Yields each tokenized token,
-        start position (in bytes), end position (in bytes)
-        """
-        yield text, 0, len(text.encode('utf-8'))
-
-
-tokenizer_modules = {}
-"""hold references to prevent GC"""
-
-
-def make_tokenizer_module(tokenizer):
-    """ make tokenizer module """
-    if tokenizer in tokenizer_modules:
-        return tokenizer_modules[tokenizer]
-
-    t = ffi.new_handle(tokenizer)
-    tokenizers = {}
-    cursors = {}
-
-    @ffi.callback('int(int, const char *const*, sqlite3_tokenizer **)')
-    def xcreate(argc, argv, ppTokenizer):
-        tkn = ffi.new('sqlite3_tokenizer *')
-        tkn.t = t
-        tokenizers[int(ffi.cast('intptr_t', tkn))] = tkn
-        ppTokenizer[0] = tkn
-        return SQLITE_OK
-
-    @ffi.callback('int(sqlite3_tokenizer *)')
-    def xdestroy(pTokenizer):
-        del tokenizers[int(ffi.cast('intptr_t', pTokenizer))]
-        return SQLITE_OK
-
-    @ffi.callback(
-        'int(sqlite3_tokenizer*, const char *, int, sqlite3_tokenizer_cursor **)'
-    )
-    def xopen(pTokenizer, pInput, nInput, ppCursor):
-        cur = ffi.new('sqlite3_tokenizer_cursor *')
-        tokenizer = ffi.from_handle(pTokenizer.t)
-        tokens = tokenizer.tokenize(ffi.string(pInput).decode('utf-8'))
-        tknh = ffi.new_handle(tokens)
-        cur.pTokenizer = pTokenizer
-        cur.tokens = tknh
-        cur.pos = 0
-        cur.offset = 0
-        cursors[int(ffi.cast('intptr_t', cur))] = cur, tknh
-        ppCursor[0] = cur
-        return SQLITE_OK
-
-    @ffi.callback(
-        'int(sqlite3_tokenizer_cursor*, const char **, int *, int *, int *, int *)'
-    )
-    def xnext(pCursor, ppToken, pnBytes, piStartOffset, piEndOffset,
-              piPosition):
-        try:
-            cur = pCursor[0]
-            tokens = ffi.from_handle(cur.tokens)
-            while True:
-                normalized, inputBegin, inputEnd = next(tokens)
-                normalized = normalized.encode('utf-8')
-                if normalized:
-                    break
-
-            ppToken[0] = ffi.new('char []', normalized)  # ??
-            pnBytes[0] = len(normalized)
-            piStartOffset[0] = inputBegin
-            piEndOffset[0] = inputEnd
-            cur.offset = inputEnd
-            piPosition[0] = cur.pos
-            cur.pos += 1
-        except StopIteration:
-            return SQLITE_DONE
-        return SQLITE_OK
-
-    @ffi.callback('int(sqlite3_tokenizer_cursor *)')
-    def xclose(pCursor):
-        del cursors[int(ffi.cast('intptr_t', pCursor))]
-        return SQLITE_OK
-
-    tokenizer_module = ffi.new("sqlite3_tokenizer_module *",
-                               [0, xcreate, xdestroy, xopen, xclose, xnext])
-    tokenizer_modules[tokenizer] = (xcreate, xdestroy, xopen, xclose, xnext)
-    return tokenizer_module
-
-
 def register_tokenizer(c, name, tokenizer_module):
     """ register tokenizer module with SQLite connection. """
     module_addr = int(ffi.cast('uintptr_t', tokenizer_module))
     address_blob = buffer(struct.pack("P", module_addr))
     enable_fts3_tokenizer(c)
     r = c.execute('SELECT fts3_tokenizer(?, ?)', (name, address_blob))
-    tokenizer_modules[module_addr] = tokenizer_module
     return r
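
For context, here is a minimal usage sketch of the `register_tokenizer` entry point this file keeps. It assumes the `Tokenizer` base class and `make_tokenizer_module` helper removed above remain importable elsewhere in the package; the import path, the `simple` tokenizer name, and the `docs` table are illustrative, not part of this diff.

# Hedged sketch: registering a custom FTS3 tokenizer with a connection.
# Assumes Tokenizer, make_tokenizer_module and register_tokenizer are
# still importable from the sqlitefts package after this change.
import sqlite3

from sqlitefts import Tokenizer, make_tokenizer_module, register_tokenizer  # assumed import path


class SimpleTokenizer(Tokenizer):
    def tokenize(self, text):
        # yield (normalized_token, start_byte_offset, end_byte_offset)
        pos = 0
        for word in text.split():
            start = text.index(word, pos)
            begin = len(text[:start].encode('utf-8'))
            end = begin + len(word.encode('utf-8'))
            pos = start + len(word)
            yield word.lower(), begin, end


conn = sqlite3.connect(':memory:')
module = make_tokenizer_module(SimpleTokenizer())
register_tokenizer(conn, 'simple', module)
conn.execute("CREATE VIRTUAL TABLE docs USING fts3(content, tokenize=simple)")
conn.execute("INSERT INTO docs VALUES ('Hello FTS3 tokenizer')")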