|
34 | 34 | void *ob_type; |
35 | 35 | sqlite3 *db; |
36 | 36 | } PyObject; |
37 | | -
|
38 | | -typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; |
39 | | -typedef struct sqlite3_tokenizer sqlite3_tokenizer; |
40 | | -typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; |
41 | | -struct sqlite3_tokenizer_module { |
42 | | - int iVersion; |
43 | | - int (*xCreate)( |
44 | | - int argc, const char *const*argv, sqlite3_tokenizer **ppTokenizer); |
45 | | - int (*xDestroy)(sqlite3_tokenizer *pTokenizer); |
46 | | - int (*xOpen)( |
47 | | - sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes, |
48 | | - sqlite3_tokenizer_cursor **ppCursor); |
49 | | - int (*xClose)(sqlite3_tokenizer_cursor *pCursor); |
50 | | - int (*xNext)( |
51 | | - sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes, |
52 | | - int *piStartOffset, int *piEndOffset, int *piPosition); |
53 | | - int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid); |
54 | | -}; |
55 | | -
|
56 | | -struct sqlite3_tokenizer { |
57 | | - const sqlite3_tokenizer_module *pModule; |
58 | | - void *t; |
59 | | -}; |
60 | | -
|
61 | | -struct sqlite3_tokenizer_cursor { |
62 | | - sqlite3_tokenizer *pTokenizer; |
63 | | - void *tokens; |
64 | | - size_t pos; |
65 | | - size_t offset; |
66 | | -}; |
67 | 37 | ''') |
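The part of the cdef that survives this commit is the small stand-in ending in "} PyObject;" above: it mirrors just enough of the C-level connection object that the raw sqlite3 * handle can be read off a DB-API connection, while the fts3 tokenizer declarations below it are what the commit removes. A hedged sketch of how such a stand-in is typically used; the helper name and the cast are illustrative assumptions, not code from this commit:

import sqlite3

def get_db_handle(conn):
    # id(conn) is the address of the C-level connection object; the fake
    # PyObject layout in the module-level ffi cdef above lines its 'db'
    # member up with the real struct, so casting the address and reading
    # .db yields the sqlite3 * pointer.
    return ffi.cast('PyObject *', id(conn)).db

conn = sqlite3.connect(':memory:')
db = get_db_handle(conn)   # e.g. usable for sqlite3_db_config() calls through ffi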
68 | 38 |
|
69 | 39 | if sys.platform == 'win32': |
@@ -94,100 +64,10 @@ def enable_fts3_tokenizer(c): |
94 | 64 | del f |
95 | 65 |
|
96 | 66 |
|
97 | | -class Tokenizer: |
98 | | - """ Tokenizer base class """ |
99 | | - |
100 | | - def tokenize(self, text): |
101 | | - """ |
102 | | - Tokenize given unicode text. Yields each tokenized token, |
103 | | - start position (in bytes), end position (in bytes) |
104 | | - """ |
105 | | - yield text, 0, len(text.encode('utf-8')) |
106 | | - |
107 | | - |
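The Tokenizer base class removed above fixes the Python-side contract: tokenize() yields (token, start, end) tuples, with start and end as byte offsets into the UTF-8 encoding of the input. A minimal concrete tokenizer might look like the following; this is an illustrative sketch, not part of the original module:

class SimpleTokenizer(Tokenizer):
    """Split on whitespace and report UTF-8 byte offsets."""

    def tokenize(self, text):
        pos = 0
        for word in text.split():
            start = text.index(word, pos)      # locate the word in the original text
            pos = start + len(word)
            begin = len(text[:start].encode('utf-8'))
            end = begin + len(word.encode('utf-8'))
            yield word.lower(), begin, end     # normalized token, start byte, end byte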
108 | | -tokenizer_modules = {} |
109 | | -"""hold references to prevent GC""" |
110 | | - |
111 | | - |
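The removed tokenizer_modules dict existed because cffi callback objects must stay referenced from Python for as long as C code may call them; once the Python object is collected, its C trampoline is released and SQLite would be left calling freed memory. A standalone sketch of that rule, with hypothetical names:

from cffi import FFI

_ffi = FFI()
_keep_alive = {}   # plays the role of tokenizer_modules above

def make_counter_callback(key):
    @_ffi.callback('int(int)')
    def increment(x):
        return x + 1
    _keep_alive[key] = increment   # drop this line and C may later call a freed trampoline
    return increment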
112 | | -def make_tokenizer_module(tokenizer): |
113 | | - """ make tokenizer module """ |
114 | | - if tokenizer in tokenizer_modules: |
115 | | - return tokenizer_modules[tokenizer] |
116 | | - |
117 | | - t = ffi.new_handle(tokenizer) |
118 | | - tokenizers = {} |
119 | | - cursors = {} |
120 | | - |
121 | | - @ffi.callback('int(int, const char *const*, sqlite3_tokenizer **)') |
122 | | - def xcreate(argc, argv, ppTokenizer): |
123 | | - tkn = ffi.new('sqlite3_tokenizer *') |
124 | | - tkn.t = t |
125 | | - tokenizers[int(ffi.cast('intptr_t', tkn))] = tkn |
126 | | - ppTokenizer[0] = tkn |
127 | | - return SQLITE_OK |
128 | | - |
129 | | - @ffi.callback('int(sqlite3_tokenizer *)') |
130 | | - def xdestroy(pTokenizer): |
131 | | - del tokenizers[int(ffi.cast('intptr_t', pTokenizer))] |
132 | | - return SQLITE_OK |
133 | | - |
134 | | - @ffi.callback( |
135 | | - 'int(sqlite3_tokenizer*, const char *, int, sqlite3_tokenizer_cursor **)' |
136 | | - ) |
137 | | - def xopen(pTokenizer, pInput, nInput, ppCursor): |
138 | | - cur = ffi.new('sqlite3_tokenizer_cursor *') |
139 | | - tokenizer = ffi.from_handle(pTokenizer.t) |
140 | | - tokens = tokenizer.tokenize(ffi.string(pInput).decode('utf-8')) |
141 | | - tknh = ffi.new_handle(tokens) |
142 | | - cur.pTokenizer = pTokenizer |
143 | | - cur.tokens = tknh |
144 | | - cur.pos = 0 |
145 | | - cur.offset = 0 |
146 | | - cursors[int(ffi.cast('intptr_t', cur))] = cur, tknh |
147 | | - ppCursor[0] = cur |
148 | | - return SQLITE_OK |
149 | | - |
150 | | - @ffi.callback( |
151 | | - 'int(sqlite3_tokenizer_cursor*, const char **, int *, int *, int *, int *)' |
152 | | - ) |
153 | | - def xnext(pCursor, ppToken, pnBytes, piStartOffset, piEndOffset, |
154 | | - piPosition): |
155 | | - try: |
156 | | - cur = pCursor[0] |
157 | | - tokens = ffi.from_handle(cur.tokens) |
158 | | - while True: |
159 | | - normalized, inputBegin, inputEnd = next(tokens) |
160 | | - normalized = normalized.encode('utf-8') |
161 | | - if normalized: |
162 | | - break |
163 | | - |
164 | | - ppToken[0] = ffi.new('char []', normalized) # ?? nothing keeps this buffer referenced after xnext returns |
165 | | - pnBytes[0] = len(normalized) |
166 | | - piStartOffset[0] = inputBegin |
167 | | - piEndOffset[0] = inputEnd |
168 | | - cur.offset = inputEnd |
169 | | - piPosition[0] = cur.pos |
170 | | - cur.pos += 1 |
171 | | - except StopIteration: |
172 | | - return SQLITE_DONE |
173 | | - return SQLITE_OK |
174 | | - |
175 | | - @ffi.callback('int(sqlite3_tokenizer_cursor *)') |
176 | | - def xclose(pCursor): |
177 | | - del cursors[int(ffi.cast('intptr_t', pCursor))] |
178 | | - return SQLITE_OK |
179 | | - |
180 | | - tokenizer_module = ffi.new("sqlite3_tokenizer_module *", |
181 | | - [0, xcreate, xdestroy, xopen, xclose, xnext]) |
182 | | - tokenizer_modules[tokenizer] = (xcreate, xdestroy, xopen, xclose, xnext) |
183 | | - return tokenizer_module |
184 | | - |
185 | | - |
186 | 67 | def register_tokenizer(c, name, tokenizer_module): |
187 | 68 | """ register tokenizer module with SQLite connection. """ |
188 | 69 | module_addr = int(ffi.cast('uintptr_t', tokenizer_module)) |
189 | 70 | address_blob = buffer(struct.pack("P", module_addr)) |
190 | 71 | enable_fts3_tokenizer(c) |
191 | 72 | r = c.execute('SELECT fts3_tokenizer(?, ?)', (name, address_blob)) |
192 | | - tokenizer_modules[module_addr] = tokenizer_module |
193 | 73 | return r |
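Before this change, the removed make_tokenizer_module and the surviving register_tokenizer were used together roughly as follows; the connection, the SimpleTokenizer class, and the table and tokenizer names are illustrative assumptions:

import sqlite3

conn = sqlite3.connect(':memory:')
tokenizer_module = make_tokenizer_module(SimpleTokenizer())
register_tokenizer(conn, 'simple_py', tokenizer_module)

# FTS3/FTS4 tables created on this connection can now name the tokenizer.
conn.execute("CREATE VIRTUAL TABLE docs USING fts3(body, tokenize=simple_py)")
conn.execute("INSERT INTO docs (body) VALUES ('Hello Tokenizer World')")
hits = conn.execute("SELECT body FROM docs WHERE body MATCH 'hello'").fetchall()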