simple test for ExampleIndex nearest neighbor search

AInixProject · Aug 30, 2018 · 74ea647 · 74ea647
1 parent 195e26d
commit 74ea647
Show file tree

Hide file tree

Showing 9 changed files with 264 additions and 115 deletions.
diff --git a/.gitignore b/.gitignore
@@ -115,3 +115,5 @@ venv.bak/
 
 .idea/
 .vscode/
+
+.DS_STORE
diff --git a/ainix_common/.DS_Store b/ainix_common/.DS_Store
diff --git a/ainix_common/parsing/typecontext.py b/ainix_common/parsing/typecontext.py
@@ -290,7 +290,6 @@ def fill_default_parsers(self):
             if no_children and no_default:
                 self._link_no_args_obj_parser(object_)
 
-
     def verify(self):
         """After you have instantiated all the types and objects you need in this
         context, you can call this method to verify all referenced types are

diff --git a/ainix_kernel/indexing/exampleindex.py b/ainix_kernel/indexing/exampleindex.py
@@ -0,0 +1,114 @@
+from typing import List
+import attr
+import indexing.whooshbackend
+from indexing.index import IndexBackendScheme, IndexBackendFields, IndexBackendABC
+import parseast
+from parseast import StringParser
+from typecontext import TypeContext, AInixType
+from whoosh.query import Term, And, Or
+from whoosh.analysis.analyzers import Analyzer, StandardAnalyzer
+from whoosh.analysis.tokenizers import RegexTokenizer
+from whoosh.analysis.filters import LowercaseFilter
+
+
+@attr.s(auto_attribs=True, frozen=True)
+class Example:
+    xquery: str
+    ytext: str
+    xtype: str
+    ytype: str
+    weight: float
+    yindexable: str = None
+
+
+class ExamplesIndex:
+    """Provides a higher level interface around an IndexBackendABC specifically
+    related to the domain of AInix examples"""
+    DEFAULT_X_TYPE = "WordSequence"
+    # TODO (DNGros): this shouldn't really be here. Should not depend on whoosh
+    x_tokenizer = RegexTokenizer() | LowercaseFilter()
+
+    def __init__(self, type_context: TypeContext, backend: IndexBackendABC = None):
+        scheme = self.get_scheme()
+        self.backend = backend if backend else \
+            indexing.whooshbackend.WhooshIndexBackend(scheme)
+        self.type_context = type_context
+
+    @staticmethod
+    def get_scheme() -> 'IndexBackendScheme':
+        return IndexBackendScheme(
+            xquery=IndexBackendFields.TEXT,
+            ytext=IndexBackendFields.TEXT,
+            xtype=IndexBackendFields.ID,
+            ytype=IndexBackendFields.ID,
+            yindexable=IndexBackendFields.SPACE_UNSTORED_TEXT,
+            weight=IndexBackendFields.TEXT
+        )
+
+    def _get_yparsed_rep(self, y_string: str, y_type: str) -> str:
+        parser = StringParser(self.type_context.get_type_by_name(y_type))
+        # TODO (DNGros): cache the parsers for each type
+        ast = parser.create_parse_tree(y_string)
+        return ast.indexable_repr()
+
+    def add_example(self, example: Example) -> None:
+        self.backend.add_documents([attr.asdict(example)])
+
+    def add_many_to_many_with_weighted(
+        self,
+        x_values: List[str],
+        y_values: List[str],
+        x_type: str,
+        y_type: str,
+        weights: List[float],
+    ) -> None:
+        for x in x_values:
+            for y, weight in zip(y_values, weights):
+                new_example = Example(x, y, x_type, y_type, weight,
+                                      self._get_yparsed_rep(y, y_type))
+                self.add_example(new_example)
+
+    def _default_weight(self, i: int, n: int):
+        """Gets a default weight for a value. Each value in the sequence
+        is half as preferable as the one before it
+
+        Args:
+            i : index in the sequence of values (zero indexed)
+            n : total number of values in sequence
+        """
+        if i+1 > n:
+            raise ValueError()
+        sequence_sum = 2**n-1
+        return (2**(n-i-1))/sequence_sum
+
+    def add_many_to_many_default_weight(
+        self,
+        x_values: List[str],
+        y_values: List[str],
+        x_type: str,
+        y_type: str
+    ) -> None:
+        """Adds several examples with the y_values default weighted."""
+        y_count = len(y_values)
+        weights = [self._default_weight(i, y_count)
+                   for i, y in enumerate(y_values)]
+        self.add_many_to_many_with_weighted(x_values, y_values,
+                                            x_type, y_type, weights)
+
+    def get_nearest_examples(
+        self,
+        x_value: str,
+        choose_type: AInixType = None
+    ) -> List[Example]:
+        tokenized_x_value = (tok.text for tok in  self.x_tokenizer(x_value))
+        query = Or([Term("xquery", term) for term in tokenized_x_value])
+        if choose_type:
+            y_type_indexable_rep = parseast.indexable_repr_classify_type(choose_type)
+            print(y_type_indexable_rep)
+            query &= Term("yindexable", y_type_indexable_rep)
+        print("Query is")
+        print(query)
+        query_result = self.backend.query(query)
+        list_result = [Example(**hit.doc) for hit in query_result]
+        print(self.backend.query(Term("xquery", "what")))
+        return list_result
diff --git a/ainix_kernel/indexing/exampleloader.py b/ainix_kernel/indexing/exampleloader.py
@@ -4,6 +4,9 @@
 # TODO (DNGros): this kinda repeats code from the type loader. Figure out
 # how to make this DRYer...
 import yaml
+
+import indexing.exampleindex
+
 try:
     from yaml import CLoader as Loader, CDumper as Dumper
 except ImportError:
@@ -15,7 +18,7 @@
 
 def load_path(
     path : str,
-    index: index.ExamplesIndex,
+    index: indexing.exampleindex.ExamplesIndex,
 ) -> None:
     """Loads a *.ainix.yaml file and registers and definesgT
     or objects with the supplied type_context"""
@@ -25,14 +28,14 @@ def load_path(
         load_yaml(f, index)
 
 
-def load_yaml(filelike: IO, index: index.ExamplesIndex) -> None:
+def load_yaml(filelike: IO, index: indexing.exampleindex.ExamplesIndex) -> None:
     doc = yaml.safe_load(filelike)
     _load(doc, index)
 
 
 def _load(
     parsed_doc: Dict,
-    index: index.ExamplesIndex,
+    index: indexing.exampleindex.ExamplesIndex,
 ) -> None:
     """
     Args:
@@ -60,7 +63,7 @@ def _load_single_example(
     example_dict: Dict,
     xtype: str,
     ytype: str,
-    load_index: index.ExamplesIndex
+    load_index: indexing.exampleindex.ExamplesIndex
 ):
     x = example_dict['x']
     if not isinstance(x, list):
@@ -73,7 +76,7 @@ def _load_single_example(
     load_index.add_many_to_many_default_weight(x, y, xtype, ytype)
 
 
-def _load_example_set(define: Dict, load_index: index.ExamplesIndex):
+def _load_example_set(define: Dict, load_index: indexing.exampleindex.ExamplesIndex):
     y_type = define['y_type']
     x_type = define.get('x_type', load_index.DEFAULT_X_TYPE)
     examples = define['examples']

diff --git a/ainix_kernel/indexing/index.py b/ainix_kernel/indexing/index.py
@@ -1,111 +1,40 @@
 from abc import ABC, abstractmethod
-import attr
 from typing import Iterable, Dict, List
-from ainix_common.parsing import examplecontext
 import enum
+import whoosh.query
+import whoosh.searching
 import attr
-from typecontext import TypeContext, AInixType, AInixArgument
-from parseast import StringParser
-
-
-@attr.s(auto_attribs=True, frozen=True)
-class Example:
-    xquery: str
-    ytext: str
-    xtype: str
-    ytype: str
-    yparsed_rep: str
-    weight: float
-
-
-class ExamplesIndex:
-    """Provides a higher level interface around an IndexBackendABC specifically
-    related to the domain of AInix examples"""
-    DEFAULT_X_TYPE = "WordSequence"
-
-    def __init__(self, type_context: TypeContext):
-        import indexing.whooshbackend
-        scheme = self._create_scheme()
-        self.backend = indexing.whooshbackend.WhooshIndexBackend(scheme)
-        self.type_context = type_context
 
-    @staticmethod
-    def _create_scheme() -> 'IndexBackendScheme':
-        return IndexBackendScheme(
-            xquery=IndexBackendFields.TEXT,
-            ytext=IndexBackendFields.TEXT,
-            xtype=IndexBackendFields.ID,
-            ytype=IndexBackendFields.ID,
-            yparsed_rep=IndexBackendFields.TEXT,
-            weight=IndexBackendFields.TEXT
-        )
 
-    def _get_yparsed_rep(self, y_string: str, y_type: str) -> str:
-        parser = StringParser(self.type_context.get_type_by_name(y_type))
-        # TODO (DNGros): cache the parsers for each type
-        ast = parser.create_parse_tree(y_string)
-        return ast.indexable_repr()
+class Query(whoosh.query.Query):
+    """An query on IndexBackendj. For now to save time it will just exactly
+    copy a whoosh query. However, this should likely eventually be abstracted
+    away and tied to specific backend (or it might work to make other backends
+    just convert from whoosh to their query scheme. We'll figure it out later)"""
+    pass
 
-    def add_example(self, example: Example) -> None:
-        self.backend.add_documents([attr.asdict(example)])
 
-    def add_many_to_many_with_weighted(
-            self,
-            x_values: List[str],
-            y_values: List[str],
-            x_type: str,
-            y_type: str,
-            weights: List[float],
-    ) -> None:
-        for x in x_values:
-            for y, weight in zip(y_values, weights):
-                new_example = Example(x, y, x_type, y_type,
-                                      self._get_yparsed_rep(y, y_type),
-                                      weight)
-                self.add_example(new_example)
+# TODO (DNGros): figure out how we want to do results and whether we need a
+# special object for them
+#class Results(whoosh.searching.Results):
+#    """Results from on IndexBackend. For now to save time it will just exactly
+#    copy a whoosh query. However, this should likely eventually be abstracted
+#    away and tied to specific backend"""
+#    pass
 
-    def _default_weight(self, i: int, n: int):
-        """Gets a default weight for a value. Each value in the sequence
-        is half as preferable as the one before it
-
-        Args:
-            i : index in the sequence of values (zero indexed)
-            n : total number of values in sequence
-        """
-        if i+1 > n:
-            raise ValueError()
-        sequence_sum = 2**n-1
-        return (2**(n-i-1))/sequence_sum
-
-    def add_many_to_many_default_weight(
-        self,
-        x_values: List[str],
-        y_values: List[str],
-        x_type: str,
-        y_type: str
-    ) -> None:
-        """Adds several examples with the y_values default weighted."""
-        y_count = len(y_values)
-        weights = [self._default_weight(i, y_count)
-                   for i, y in enumerate(y_values)]
-        self.add_many_to_many_with_weighted(x_values, y_values,
-                                            x_type, y_type, weights)
-
-    def get_nearest_examples(
-        self,
-        x_value: str,
-        x_type: str = DEFAULT_X_TYPE,
-        y_type: str = None
-    ):
-        tokenized_x_value = x_value.split(" ")
-        return self.backend.field_or_terms("xquery", tokenized_x_value)
+@attr.s(auto_attribs=True)
+class SearchHit:
+    doc: Dict
+    score: float = None
 
 
 class IndexBackendFields(enum.Enum):
     TEXT = "TEXT_FIELD"
     ID = "ID_FILED"
     NUMBER = "NUMBER_FIELD"
     UNSTORED_TEXT = "UNSTORED_TEXT_FIELD"
+    # An unstored text field that tokenizes purely on spaces
+    SPACE_UNSTORED_TEXT = "UNSTORED_TEXT_FIELD_SPACE_TOKENIZE"
 
 
 class IndexBackendScheme:
@@ -124,3 +53,7 @@ class IndexBackendABC(ABC):
     @abstractmethod
     def add_documents(self, documents: Iterable[Dict]):
         pass
+
+    @abstractmethod
+    def query(self, query: Query) -> List[Dict]:
+        pass
Original file line number	Diff line number	Diff line change
Expand Up		@@ -115,3 +115,5 @@ venv.bak/

		.idea/
		.vscode/

		.DS_STORE