Added ranking functions (simple rank and BM25) adapted from https://gist.github.com/saaj/fdc8e6351d07fbb1a511

hideaki-t · hideaki-t · commit c4d78d9613da · 2014-06-10T08:28:11.000-04:00
a part of issue #2
diff --git a/sqlitefts/ranking.py b/sqlitefts/ranking.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+'''
+Ranking code based on:
+  https://github.com/coleifer/peewee/blob/master/playhouse/sqlite_ext.py
+'''
+
+
+import struct
+import math
+
+
+def parseMatchInfo(buf):
+    '''
+    Decode a matchinfo() blob into a list of integers.
+
+    The buffer is consumed in 4-byte slices, each unpacked with the struct
+    format '@I' (native unsigned int). An empty buffer yields an empty list.
+    See http://sqlite.org/fts3.html#matchinfo for the meaning of the values.
+    '''
+    values = []
+    offset = 0
+    total = len(buf)  # length in bytes
+    while offset < total:
+        chunk = buf[offset:offset + 4]
+        values.append(struct.unpack('@I', chunk)[0])
+        offset += 4
+    return values
+
+
+def simple(raw_match_info):
+    '''
+    Rank a row from matchinfo() output produced with the default 'pcx'
+    format, following the example rank function at
+    http://sqlite.org/fts3.html#appendix_a
+
+    For every phrase/column pair with at least one hit in the current row,
+    the ratio of the first to the second value of its triple is added to
+    the score. Returns the accumulated score as a float.
+    '''
+    info = parseMatchInfo(raw_match_info)
+    phrase_count, column_count = info[:2]
+    total = 0.0
+    # After the two header values, each phrase/column pair owns three
+    # consecutive integers, laid out phrase by phrase.
+    for pair in range(phrase_count * column_count):
+        base = 2 + (pair * 3)
+        hits_here, hits_everywhere = info[base:base + 2]
+        if hits_here > 0:
+            total += float(hits_here) / hits_everywhere
+    return total
+
+
+def bm25(raw_match_info, column_index, k1=1.2, b=0.75):
+    """
+    Okapi BM25 ranking function for FTS4 tables.
+
+    Usage:
+
+        # Format string *must* be pcxnal
+        # Second parameter to bm25 specifies the index of the column, on
+        # the table being queried.
+
+        bm25(matchinfo(document_tbl, 'pcxnal'), 1) AS rank
+
+    raw_match_info is the blob returned by matchinfo(), column_index is the
+    zero-based index of the column to score, and k1 / b are the BM25 tuning
+    parameters. Returns the score summed over all query phrases as a float.
+    An out-of-range column_index raises IndexError.
+    """
+    match_info = parseMatchInfo(raw_match_info)
+    score = 0.0
+    # Layout of the 'pcxnal' array:
+    # p, 1 --> number of phrases in the query
+    # c, 1 --> number of columns in the table
+    # x, (3 * p * c) --> for each phrase/column pair,
+    #     hits for the phrase in this column of the current row
+    #     hits for the phrase in this column across all rows
+    #     number of rows whose column contains the phrase
+    # n, 1 --> total rows in table
+    # a, c --> for each column, avg number of tokens in this column
+    # l, c --> for each column, length of value for this column (in this row)
+    p, c = match_info[:2]
+    n_idx = 2 + (3 * p * c)
+    a_idx = n_idx + 1
+    l_idx = a_idx + c
+    total_docs = match_info[n_idx]
+    avg_lengths = match_info[a_idx: a_idx + c]
+    doc_lengths = match_info[l_idx: l_idx + c]
+
+    avg_length = float(avg_lengths[column_index])
+    doc_length = float(doc_lengths[column_index])
+    if avg_length == 0:
+        # Avoid dividing by zero when the column has no tokens on average
+        D = 0
+    else:
+        D = 1 - b + (b * (doc_length / avg_length))
+
+    for phrase in range(p):
+        # p, c, p0c01, p0c02, p0c03, p0c11, p0c12, p0c13, p1c01, p1c02, p1c03..
+        # The x section is phrase-major: each phrase owns c consecutive
+        # triples, so the triple for (phrase, column) starts at
+        # 2 + 3 * (phrase * c + column).
+        x_idx = 2 + (3 * (phrase * c + column_index))
+        term_freq = float(match_info[x_idx])
+        term_matches = float(match_info[x_idx + 2])
+
+        # The `max` check here is based on a suggestion in the Wikipedia
+        # article. For terms that are common to a majority of documents, the
+        # idf function can return negative values. Applying the max() here
+        # weeds out those values.
+        idf = max(
+            math.log(
+                (total_docs - term_matches + 0.5) /
+                (term_matches + 0.5)),
+            0)
+
+        denom = term_freq + (k1 * D)
+        if denom == 0:
+            rhs = 0
+        else:
+            rhs = (term_freq * (k1 + 1)) / denom
+
+        score += (idf * rhs)
+
+    return score
diff --git a/tests/test_ranking.py b/tests/test_ranking.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+
+
+import unittest
+import sqlite3
+import re
+
+import sqlitefts as fts
+from sqlitefts import ranking
+
+
+class Tokenizer(fts.Tokenizer):
+    '''
+    Whitespace tokenizer used by the tests.
+
+    Yields (lower-cased token, start, end) tuples where start and end are
+    offsets measured in UTF-8 encoded bytes.
+    '''
+
+    # Splits text into alternating runs of whitespace and non-whitespace
+    _spliter = re.compile(r'\s+|\S+', re.UNICODE)
+    # Recognises the runs that are real tokens
+    _nonws = re.compile(r'\S+', re.UNICODE)
+
+    def _normalize(self, token):
+        return token.lower()
+
+    def _tokenize(self, text):
+        start = 0
+        for found in self._spliter.finditer(text):
+            piece = found.group()
+            end = start + len(piece.encode('utf-8'))
+            if not self._nonws.match(piece):
+                # Whitespace only moves the byte offset forward
+                start = end
+                continue
+            yield self._normalize(piece), start, end
+            start = end
+
+    def tokenize(self, text):
+        return self._tokenize(text)
+
+
+class TestCase(unittest.TestCase):
+    '''Runs the ranking functions against small in-memory FTS3/FTS4 tables.'''
+
+    def setUp(self):
+        tokenizer_name = 'test'
+        db = sqlite3.connect(':memory:')
+        db.row_factory = sqlite3.Row
+
+        fts.register_tokenizer(db, tokenizer_name, fts.make_tokenizer_module(Tokenizer()))
+
+        db.execute('CREATE VIRTUAL TABLE fts3 USING FTS3(tokenize={})'.format(tokenizer_name))
+        db.execute('CREATE VIRTUAL TABLE fts4 USING FTS4(tokenize={})'.format(tokenizer_name))
+
+        # Only the first two rows contain the word searched for in the tests
+        rows = [
+            (u'Make thing I',),
+            (u'Some thing φχικλψ thing',),
+            (u'Fusce volutpat hendrerit sem. Fusce sit amet vulputate dui. '
+             u'Sed posuere mi a nisl aliquet tempor. Praesent tincidunt vel nunc ac pharetra.',),
+            (u'Nam molestie euismod leo id aliquam. In hac habitasse platea dictumst.',),
+            (u'Vivamus tincidunt feugiat tellus ac bibendum. In rhoncus dignissim suscipit.',),
+            (u'Pellentesque hendrerit nulla rutrum luctus rutrum. Fusce hendrerit fermentum nunc at posuere.',),
+        ]
+        for table in ('fts3', 'fts4'):
+            cursor = db.executemany('INSERT INTO {0} VALUES(?)'.format(table), rows)
+            assert cursor.rowcount == len(rows)
+
+        # Expose the Python ranking functions to SQL
+        db.create_function('bm25', 2, ranking.bm25)
+        db.create_function('rank', 1, ranking.simple)
+
+        self.testee = db
+
+    def testSimple(self):
+        query = '''
+      SELECT content, rank(matchinfo(fts3)) AS rank
+      FROM fts3
+      WHERE fts3 MATCH :query
+      ORDER BY rank DESC
+    '''
+        found = [dict(row) for row in self.testee.execute(query, {'query': u'thing'})]
+
+        # The row mentioning the word twice must come first
+        best = {'content': u'Some thing φχικλψ thing', 'rank': 0.6666666666666666}
+        runner_up = {'content': u'Make thing I', 'rank': 0.3333333333333333}
+
+        self.assertEqual(2, len(found))
+        self.assertEqual(best, found[0])
+        self.assertEqual(runner_up, found[1])
+
+    def testBm25(self):
+        query = '''
+      SELECT content, bm25(matchinfo(fts4, 'pcxnal'), 0) AS rank
+      FROM fts4
+      WHERE fts4 MATCH :query
+      ORDER BY rank DESC
+    '''
+        found = [dict(row) for row in self.testee.execute(query, {'query': u'thing'})]
+
+        # The row mentioning the word twice must come first
+        best = {'content': u'Some thing φχικλψ thing', 'rank': 0.9722786938230542}
+        runner_up = {'content': u'Make thing I', 'rank': 0.8236501036844982}
+
+        self.assertEqual(2, len(found))
+        self.assertEqual(best, found[0])
+        self.assertEqual(runner_up, found[1])