-
Notifications
You must be signed in to change notification settings - Fork 63
Transmit BayesDB stochasticity to crosscat engine #382
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ae3f370
1b0d67c
d4dd912
f6d6fb9
cfc8475
4ed9183
6af8e2b
b91e024
b2becd8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,9 @@ | |
interface for Crosscat. | ||
""" | ||
|
||
# For the EngineTemplate import | ||
from __future__ import absolute_import | ||
|
||
import apsw | ||
import itertools | ||
import json | ||
|
@@ -43,6 +46,9 @@ | |
from bayeslite.util import cursor_value | ||
from bayeslite.util import unique | ||
|
||
from crosscat.EngineTemplate import EngineTemplate | ||
from crosscat.MultiprocessingEngine import MultiprocessingEngine, Pool | ||
|
||
crosscat_schema_1 = ''' | ||
INSERT INTO bayesdb_metamodel (name, version) VALUES ('crosscat', 1); | ||
|
||
|
@@ -220,15 +226,55 @@ class CrosscatMetamodel(metamodel.IBayesDBMetamodel): | |
|
||
Internally, the Crosscat metamodel adds SQL tables to the database | ||
with names that begin with ``bayesdb_crosscat_``. | ||
|
||
`crosscat` can be a crosscat engine instance or a constructor callable | ||
which takes a seed. Prefer to pass a constructor, since that way a new | ||
crosscat engine is constructed for each query, and its random seed is taken | ||
from the BayesDB object. The 'crosscat as instance' case is kept for | ||
backwards compatibility. | ||
|
||
If using a crosscat MultiprocessingEngine, the process pool must be passed | ||
as a 'pool' argument. It would not do for a new process pool to be created | ||
every time a new query is made. | ||
|
||
E.g. | ||
|
||
>>> from crosscat import MultiprocessingEngine as mpe | ||
>>> import bayeslite.metamodels.crosscat as cc | ||
>>> from bayeslite import bayesdb_register_metamodel as register_metamodel | ||
>>> from bayeslite import bayesdb_open | ||
>>> pool = mpe.Pool(4) | ||
>>> engine_factory = lambda s: mpe.MultiprocessingEngine(seed=s, pool=pool) | ||
>>> metamodel = cc.CrosscatMetamodel(engine_factory) | ||
>>> bdb = bayesdb_open(builtin_metamodels=False) | ||
>>> register_metamodel(bdb, metamodel) | ||
|
||
Since people will often want in particular to create MultiprocessingEngine | ||
factories with a fixed process pool, | ||
crosscat.MultiprocessingEngine.MultiprocessingEngineFactoryFromPool is | ||
provided as a convenience function: | ||
|
||
>>> engine_factory = mpe.MultiprocessingEngineFactoryFromPool(pool) | ||
>>> metamodel = cc.CrosscatMetamodel(engine_factory) | ||
>>> register_metamodel(bdb, metamodel) | ||
|
||
""" | ||
|
||
def __init__(self, crosscat, subsample=None): | ||
if subsample is None: | ||
subsample = False | ||
self._crosscat = crosscat | ||
self._subsample = subsample | ||
self._subsample = False if subsample is None else subsample | ||
self._theta_validator = crosscat_theta_validator.Validator() | ||
|
||
def _crosscat_engine(self, bdb): | ||
if isinstance(self._crosscat, EngineTemplate): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, not resolved. @riastradh-probcomp why not use isinstance? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It makes the interface incoherent and needlessly multiplies the possible disjoint branches you need to consider when thinking about the code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Concrete suggestion then is to drop backwards compatibility, or to add a different named interface for this case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Concrete suggestion is to summarize what the intended change in interface is up front, in the pull request comment. Too many conceptual changes in the control and data flow here to fit into my head in order to make a local suggestion, I'm afraid. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Read above: `
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added the following to the PR comment:
|
||
return self._crosscat | ||
# XXX with so few seeds, the optimistic scenario is that someone is | ||
# going to birthday-paradox themselves. We need an RNG on the crosscat | ||
# side with a bigger seed space. Python's randint can take an arbitrary | ||
# range, here, but crosscat can't take a bigger seed. | ||
seed = bdb._py_prng.randint(0, 2**32-1) | ||
return self._crosscat(seed=seed) | ||
|
||
def _crosscat_cache_nocreate(self, bdb): | ||
if bdb.cache is None: | ||
return None | ||
|
@@ -403,7 +449,7 @@ def _crosscat_get_rows(self, bdb, generator_id, rowids, X_L_list, | |
if len(rows) > 0: | ||
# Need to put more stuff into the subsample temporarily | ||
T = self._crosscat_data(bdb, generator_id, M_c) | ||
X_L_list, X_D_list, T = self._crosscat.insert( | ||
X_L_list, X_D_list, T = self._crosscat_engine(bdb).insert( | ||
M_c=M_c, | ||
T=T, | ||
X_L_list=X_L_list, | ||
|
@@ -794,7 +840,7 @@ def initialize_models(self, bdb, generator_id, modelnos, model_config): | |
'row_initialization': 'from_the_prior', | ||
} | ||
M_c = self._crosscat_metadata(bdb, generator_id) | ||
X_L_list, X_D_list = self._crosscat.initialize( | ||
X_L_list, X_D_list = self._crosscat_engine(bdb).initialize( | ||
M_c=M_c, | ||
M_r=None, # XXX | ||
T=self._crosscat_data(bdb, generator_id, M_c), | ||
|
@@ -811,7 +857,8 @@ def initialize_models(self, bdb, generator_id, modelnos, model_config): | |
for colno1, colno2, dep in | ||
crosscat_gen_column_dependencies(bdb, generator_id)] | ||
if 0 < len(dep_constraints): | ||
X_L_list, X_D_list = self._crosscat.ensure_col_dep_constraints( | ||
engine = self._crosscat_engine(bdb) | ||
X_L_list, X_D_list = engine.ensure_col_dep_constraints( | ||
M_c=M_c, | ||
M_r=None, | ||
T=self._crosscat_data(bdb, generator_id, M_c), | ||
|
@@ -944,7 +991,8 @@ def analyze_models(self, bdb, generator_id, modelnos=None, iterations=1, | |
iterations_in_ckpt = 0 | ||
while True: | ||
X_L_list_0 = X_L_list | ||
X_L_list, X_D_list, diagnostics = self._crosscat.analyze( | ||
engine = self._crosscat_engine(bdb) | ||
X_L_list, X_D_list, diagnostics = engine.analyze( | ||
M_c=M_c, | ||
T=T, | ||
do_diagnostics=True, | ||
|
@@ -1056,7 +1104,7 @@ def column_mutual_information(self, bdb, generator_id, modelno, colno0, | |
X_D_list = self._crosscat_latent_data(bdb, generator_id, modelno) | ||
cc_colno0 = crosscat_cc_colno(bdb, generator_id, colno0) | ||
cc_colno1 = crosscat_cc_colno(bdb, generator_id, colno1) | ||
r = self._crosscat.mutual_information( | ||
r = self._crosscat_engine(bdb).mutual_information( | ||
M_c=self._crosscat_metadata(bdb, generator_id), | ||
X_L_list=X_L_list, | ||
X_D_list=X_D_list, | ||
|
@@ -1078,7 +1126,7 @@ def row_similarity(self, bdb, generator_id, modelno, rowid, target_rowid, | |
[given_row_id, target_row_id], X_L_list, X_D_list = \ | ||
self._crosscat_get_rows(bdb, generator_id, [rowid, target_rowid], | ||
X_L_list, X_D_list) | ||
return self._crosscat.similarity( | ||
return self._crosscat_engine(bdb).similarity( | ||
M_c=self._crosscat_metadata(bdb, generator_id), | ||
X_L_list=X_L_list, | ||
X_D_list=X_D_list, | ||
|
@@ -1100,7 +1148,7 @@ def predict_confidence(self, bdb, generator_id, modelno, colno, rowid, | |
self._crosscat_get_row(bdb, generator_id, rowid, X_L_list, | ||
X_D_list) | ||
cc_colno = crosscat_cc_colno(bdb, generator_id, colno) | ||
code, confidence = self._crosscat.impute_and_confidence( | ||
code, confidence = self._crosscat_engine(bdb).impute_and_confidence( | ||
M_c=M_c, | ||
X_L=X_L_list, | ||
X_D=X_D_list, | ||
|
@@ -1124,7 +1172,8 @@ def simulate_joint(self, bdb, generator_id, targets, constraints, | |
X_D_list = self._crosscat_latent_data(bdb, generator_id, modelno) | ||
Q, Y, X_L_list, X_D_list = self._crosscat_remap_two( | ||
bdb, generator_id, X_L_list, X_D_list, targets, constraints) | ||
raw_outputs = self._crosscat.simple_predictive_sample( | ||
engine = self._crosscat_engine(bdb) | ||
raw_outputs = engine.simple_predictive_sample( | ||
M_c=M_c, | ||
X_L=X_L_list, | ||
X_D=X_D_list, | ||
|
@@ -1155,7 +1204,7 @@ def logpdf_joint(self, bdb, generator_id, targets, constraints, | |
X_D_list = self._crosscat_latent_data(bdb, generator_id, modelno) | ||
Q, Y, X_L_list, X_D_list = self._crosscat_remap_two( | ||
bdb, generator_id, X_L_list, X_D_list, targets, constraints) | ||
r = self._crosscat.predictive_probability_multistate( | ||
r = self._crosscat_engine(bdb).predictive_probability_multistate( | ||
M_c=M_c, | ||
X_L_list=X_L_list, | ||
X_D_list=X_D_list, | ||
|
@@ -1215,7 +1264,7 @@ def insertmany(self, bdb, generator_id, rows): | |
modelnos = [modelno for modelno, _theta_json in models] | ||
thetas = [json.loads(theta_json) | ||
for _modelno, theta_json in models] | ||
X_L_list, X_D_list, T = self._crosscat.insert( | ||
X_L_list, X_D_list, T = self._crosscat_engine(bdb).insert( | ||
M_c=M_c, | ||
T=T, | ||
X_L_list=[theta['X_L'] for theta in thetas], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
dup import