Commit
simple test for ExampleIndex nearest neighbor search
DNGros committed Aug 30, 2018
1 parent 195e26d commit 74ea647
Showing 9 changed files with 264 additions and 115 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -115,3 +115,5 @@ venv.bak/

.idea/
.vscode/

.DS_Store
Binary file removed ainix_common/.DS_Store
1 change: 0 additions & 1 deletion ainix_common/parsing/typecontext.py
@@ -290,7 +290,6 @@ def fill_default_parsers(self):
if no_children and no_default:
self._link_no_args_obj_parser(object_)


def verify(self):
"""After you have instantiated all the types and objects you need in this
context, you can call this method to verify all referenced types are
114 changes: 114 additions & 0 deletions ainix_kernel/indexing/exampleindex.py
@@ -0,0 +1,114 @@
from typing import List, Optional
import attr
import indexing.whooshbackend
from indexing.index import IndexBackendScheme, IndexBackendFields, IndexBackendABC
import parseast
from parseast import StringParser
from typecontext import TypeContext, AInixType
from whoosh.query import Term, And, Or
from whoosh.analysis.analyzers import Analyzer, StandardAnalyzer
from whoosh.analysis.tokenizers import RegexTokenizer
from whoosh.analysis.filters import LowercaseFilter


@attr.s(auto_attribs=True, frozen=True)
class Example:
xquery: str
ytext: str
xtype: str
ytype: str
weight: float
    yindexable: Optional[str] = None


class ExamplesIndex:
"""Provides a higher level interface around an IndexBackendABC specifically
related to the domain of AInix examples"""
DEFAULT_X_TYPE = "WordSequence"
# TODO (DNGros): this shouldn't really be here. Should not depend on whoosh
x_tokenizer = RegexTokenizer() | LowercaseFilter()

    def __init__(self, type_context: TypeContext, backend: Optional[IndexBackendABC] = None):
scheme = self.get_scheme()
self.backend = backend if backend else \
indexing.whooshbackend.WhooshIndexBackend(scheme)
self.type_context = type_context

@staticmethod
def get_scheme() -> 'IndexBackendScheme':
return IndexBackendScheme(
xquery=IndexBackendFields.TEXT,
ytext=IndexBackendFields.TEXT,
xtype=IndexBackendFields.ID,
ytype=IndexBackendFields.ID,
yindexable=IndexBackendFields.SPACE_UNSTORED_TEXT,
weight=IndexBackendFields.TEXT
)

def _get_yparsed_rep(self, y_string: str, y_type: str) -> str:
parser = StringParser(self.type_context.get_type_by_name(y_type))
# TODO (DNGros): cache the parsers for each type
ast = parser.create_parse_tree(y_string)
return ast.indexable_repr()

def add_example(self, example: Example) -> None:
self.backend.add_documents([attr.asdict(example)])

def add_many_to_many_with_weighted(
self,
x_values: List[str],
y_values: List[str],
x_type: str,
y_type: str,
weights: List[float],
) -> None:
for x in x_values:
for y, weight in zip(y_values, weights):
new_example = Example(x, y, x_type, y_type, weight,
self._get_yparsed_rep(y, y_type))
self.add_example(new_example)

def _default_weight(self, i: int, n: int):
"""Gets a default weight for a value. Each value in the sequence
        is half as preferable as the one before it.
Args:
i : index in the sequence of values (zero indexed)
n : total number of values in sequence
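        Example:
            n=3 gives weights 4/7, 2/7, 1/7 (sequence_sum = 2**3 - 1 = 7)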
"""
        if i + 1 > n:
            raise ValueError(f"index {i} out of range for a sequence of {n} values")
sequence_sum = 2**n-1
return (2**(n-i-1))/sequence_sum

def add_many_to_many_default_weight(
self,
x_values: List[str],
y_values: List[str],
x_type: str,
y_type: str
) -> None:
"""Adds several examples with the y_values default weighted."""
y_count = len(y_values)
weights = [self._default_weight(i, y_count)
for i, y in enumerate(y_values)]
self.add_many_to_many_with_weighted(x_values, y_values,
x_type, y_type, weights)

def get_nearest_examples(
self,
x_value: str,
            choose_type: Optional[AInixType] = None
) -> List[Example]:
tokenized_x_value = (tok.text for tok in self.x_tokenizer(x_value))
query = Or([Term("xquery", term) for term in tokenized_x_value])
if choose_type:
y_type_indexable_rep = parseast.indexable_repr_classify_type(choose_type)
            query &= Term("yindexable", y_type_indexable_rep)
        query_result = self.backend.query(query)
        return [Example(**hit.doc) for hit in query_result]
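
Taken together, the new module supports a simple add-then-search flow. A minimal
usage sketch, assuming a TypeContext already populated with type definitions
("CommandSequence" is an invented type name, not something this commit defines):

# Hypothetical usage sketch -- not part of this commit.
from typecontext import TypeContext
from indexing.exampleindex import ExamplesIndex

context = TypeContext()
# ... register types/objects in `context` (e.g. via the type loader), then:
context.verify()

index = ExamplesIndex(context)  # no backend passed, so a WhooshIndexBackend is built
index.add_many_to_many_default_weight(
    x_values=["list all files", "show the files here"],
    y_values=["ls", "ls -a"],
    x_type=ExamplesIndex.DEFAULT_X_TYPE,
    y_type="CommandSequence",  # assumed type name
)
for example in index.get_nearest_examples("show files"):
    print(example.xquery, example.ytext, example.weight)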
13 changes: 8 additions & 5 deletions ainix_kernel/indexing/exampleloader.py
@@ -4,6 +4,9 @@
# TODO (DNGros): this kinda repeats code from the type loader. Figure out
# how to make this DRYer...
import yaml

import indexing.exampleindex

try:
from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
@@ -15,7 +18,7 @@

def load_path(
path : str,
index: index.ExamplesIndex,
index: indexing.exampleindex.ExamplesIndex,
) -> None:
"""Loads a *.ainix.yaml file and registers and definesgT
or objects with the supplied type_context"""
@@ -25,14 +28,14 @@ def load_path(
load_yaml(f, index)


def load_yaml(filelike: IO, index: index.ExamplesIndex) -> None:
def load_yaml(filelike: IO, index: indexing.exampleindex.ExamplesIndex) -> None:
doc = yaml.safe_load(filelike)
_load(doc, index)


def _load(
parsed_doc: Dict,
index: index.ExamplesIndex,
index: indexing.exampleindex.ExamplesIndex,
) -> None:
"""
Args:
@@ -60,7 +63,7 @@ def _load(
example_dict: Dict,
xtype: str,
ytype: str,
load_index: index.ExamplesIndex
load_index: indexing.exampleindex.ExamplesIndex
):
x = example_dict['x']
if not isinstance(x, list):
@@ -73,7 +76,7 @@ def _load_single_example(
load_index.add_many_to_many_default_weight(x, y, xtype, ytype)


def _load_example_set(define: Dict, load_index: index.ExamplesIndex):
def _load_example_set(define: Dict, load_index: indexing.exampleindex.ExamplesIndex):
y_type = define['y_type']
x_type = define.get('x_type', load_index.DEFAULT_X_TYPE)
examples = define['examples']
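
The YAML shape the loader expects can be read off _load_example_set and
_load_single_example. A sketch under those assumptions (the top-level
defines/define_new wrapper is a guess, since that part of _load is collapsed
in this diff):

# Hypothetical example-set document; the wrapper keys are assumptions.
import io
from indexing import exampleloader

EXAMPLES_YAML = """
defines:
  - define_new: example_set     # assumed discriminator; not visible in the diff
    y_type: CommandSequence     # required by _load_example_set
    x_type: WordSequence        # optional; defaults to ExamplesIndex.DEFAULT_X_TYPE
    examples:
      - x: ["list all files", "show the files here"]  # bare strings get wrapped in lists
        y: ["ls", "ls -a"]
"""

exampleloader.load_yaml(io.StringIO(EXAMPLES_YAML), index)  # `index` as built above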
117 changes: 25 additions & 92 deletions ainix_kernel/indexing/index.py
@@ -1,111 +1,40 @@
from abc import ABC, abstractmethod
import attr
from typing import Iterable, Dict, List, Optional
from ainix_common.parsing import examplecontext
import enum
import whoosh.query
import whoosh.searching
import attr
from typecontext import TypeContext, AInixType, AInixArgument
from parseast import StringParser


@attr.s(auto_attribs=True, frozen=True)
class Example:
xquery: str
ytext: str
xtype: str
ytype: str
yparsed_rep: str
weight: float


class ExamplesIndex:
"""Provides a higher level interface around an IndexBackendABC specifically
related to the domain of AInix examples"""
DEFAULT_X_TYPE = "WordSequence"

def __init__(self, type_context: TypeContext):
import indexing.whooshbackend
scheme = self._create_scheme()
self.backend = indexing.whooshbackend.WhooshIndexBackend(scheme)
self.type_context = type_context

@staticmethod
def _create_scheme() -> 'IndexBackendScheme':
return IndexBackendScheme(
xquery=IndexBackendFields.TEXT,
ytext=IndexBackendFields.TEXT,
xtype=IndexBackendFields.ID,
ytype=IndexBackendFields.ID,
yparsed_rep=IndexBackendFields.TEXT,
weight=IndexBackendFields.TEXT
)

def _get_yparsed_rep(self, y_string: str, y_type: str) -> str:
parser = StringParser(self.type_context.get_type_by_name(y_type))
# TODO (DNGros): cache the parsers for each type
ast = parser.create_parse_tree(y_string)
return ast.indexable_repr()
class Query(whoosh.query.Query):
"""An query on IndexBackendj. For now to save time it will just exactly
copy a whoosh query. However, this should likely eventually be abstracted
away and tied to specific backend (or it might work to make other backends
just convert from whoosh to their query scheme. We'll figure it out later)"""
pass

def add_example(self, example: Example) -> None:
self.backend.add_documents([attr.asdict(example)])

def add_many_to_many_with_weighted(
self,
x_values: List[str],
y_values: List[str],
x_type: str,
y_type: str,
weights: List[float],
) -> None:
for x in x_values:
for y, weight in zip(y_values, weights):
new_example = Example(x, y, x_type, y_type,
self._get_yparsed_rep(y, y_type),
weight)
self.add_example(new_example)
# TODO (DNGros): figure out how want to do result and if need special object
# for it
#class Results(whoosh.searching.Results):
# """Results from on IndexBackend. For now to save time it will just exactly
# copy a whoosh query. However, this should likely eventually be abstracted
# away and tied to specific backend"""
# pass

def _default_weight(self, i: int, n: int):
"""Gets a default weight for a value. Each value in the sequence
is half as preferable as the one before it
Args:
i : index in the sequence of values (zero indexed)
n : total number of values in sequence
"""
if i+1 > n:
raise ValueError()
sequence_sum = 2**n-1
return (2**(n-i-1))/sequence_sum

def add_many_to_many_default_weight(
self,
x_values: List[str],
y_values: List[str],
x_type: str,
y_type: str
) -> None:
"""Adds several examples with the y_values default weighted."""
y_count = len(y_values)
weights = [self._default_weight(i, y_count)
for i, y in enumerate(y_values)]
self.add_many_to_many_with_weighted(x_values, y_values,
x_type, y_type, weights)

def get_nearest_examples(
self,
x_value: str,
x_type: str = DEFAULT_X_TYPE,
y_type: str = None
):
tokenized_x_value = x_value.split(" ")
return self.backend.field_or_terms("xquery", tokenized_x_value)
@attr.s(auto_attribs=True)
class SearchHit:
doc: Dict
    score: Optional[float] = None


class IndexBackendFields(enum.Enum):
TEXT = "TEXT_FIELD"
ID = "ID_FILED"
NUMBER = "NUMBER_FIELD"
UNSTORED_TEXT = "UNSTORED_TEXT_FIELD"
    # An unstored text field that tokenizes purely on spaces
SPACE_UNSTORED_TEXT = "UNSTORED_TEXT_FIELD_SPACE_TOKENIZE"


class IndexBackendScheme:
@@ -124,3 +53,7 @@ class IndexBackendABC(ABC):
@abstractmethod
def add_documents(self, documents: Iterable[Dict]):
pass

@abstractmethod
    def query(self, query: Query) -> List[SearchHit]:
pass
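
For reference, a backend only has to satisfy these two abstract methods. A
throwaway in-memory sketch against the names defined above (the match-everything
query logic is a placeholder, not how WhooshIndexBackend actually behaves):

# Minimal in-memory sketch of the IndexBackendABC contract.
class InMemoryBackend(IndexBackendABC):
    def __init__(self, scheme: IndexBackendScheme):
        self.scheme = scheme
        self.docs: List[Dict] = []

    def add_documents(self, documents: Iterable[Dict]):
        self.docs.extend(documents)

    def query(self, query: Query) -> List[SearchHit]:
        # A real backend would evaluate `query` against an inverted index
        # and return scored hits; this just returns every stored document.
        return [SearchHit(doc) for doc in self.docs]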