diff --git a/pygsti/forwardsims/torchfwdsim.py b/pygsti/forwardsims/torchfwdsim.py
index 1285e51de..5079b26e1 100644
--- a/pygsti/forwardsims/torchfwdsim.py
+++ b/pygsti/forwardsims/torchfwdsim.py
@@ -31,6 +31,7 @@
 try:
     import torch
+    from torch.profiler import profile, record_function, ProfilerActivity
     TORCH_ENABLED = True
 except ImportError:
     TORCH_ENABLED = False
@@ -89,6 +90,7 @@ def __init__(self, model: ExplicitOpModel, layout: CircuitOutcomeProbabilityArra
         # framed in terms of the "layout._element_indicies" dict.
         eind = layout._element_indices
         assert isinstance(eind, dict)
+        assert len(eind) > 0
         items = iter(eind.items())
         k_prev, v_prev = next(items)
         assert k_prev == 0
@@ -159,6 +161,17 @@ def get_torch_bases(self, free_params: Tuple[torch.Tensor]) -> Dict[Label, torch
         fp in free_params. This can be done by calling fp._requires_grad(True) before
         calling this function.
         """
+        # The closest analog to this function in tgst is the first couple lines in
+        # tgst.gst.MachineModel.circuit_outcome_probs(...).
+        # Those lines just assign values a-la new_machine.params[i][:] = fp[:].
+        #
+        # The variables new_machine.params[i] are just references to Tensors
+        # that are attached to tgst.abstractions objects (Gate, Measurement, State).
+        #
+        # Calling abstr.rep_array for a given abstraction performs a computation on
+        # its attached Tensor, and that computation is roughly analogous to
+        # torchable.torch_base(...).
+        #
         assert len(free_params) == len(self.param_metadata)
         # ^ A sanity check that we're being called with the correct number of arguments.
         torch_bases = dict()
@@ -202,8 +215,23 @@ def circuit_probs_from_free_params(self, *free_params: Tuple[torch.Tensor], enab
         if enable_backward:
             for fp in free_params:
                 fp._requires_grad(True)
-        torch_bases = self.get_torch_bases(free_params)
-        probs = self.circuit_probs_from_torch_bases(torch_bases)
+
+        torch_bases = dict()
+        for i, val in enumerate(free_params):
+            label, type_handle, stateless_data = self.param_metadata[i]
+            param_t = type_handle.torch_base(stateless_data, val)
+            torch_bases[label] = param_t
+
+        probs = []
+        for c in self.circuits:
+            superket = torch_bases[c.prep_label]
+            superops = [torch_bases[ol] for ol in c.op_labels]
+            povm_mat = torch_bases[c.povm_label]
+            for superop in superops:
+                superket = superop @ superket
+            circuit_probs = povm_mat @ superket
+            probs.append(circuit_probs)
+        probs = torch.concat(probs)
         return probs
@@ -248,8 +276,10 @@ def _bulk_fill_dprobs(self, array_to_fill, layout, pr_array_to_fill) -> None:
         if slm.default_to_reverse_ad:
             # Then slm.circuit_probs_from_free_params will automatically construct the
             # torch_base dict to support reverse-mode AD.
+            # print('USING REVERSE-MODE AD')
             J_func = torch.func.jacrev(slm.circuit_probs_from_free_params, argnums=argnums)
         else:
+            # print('USING FORWARD-MODE AD')
             # Then slm.circuit_probs_from_free_params will automatically skip the extra
             # steps needed for torch_base to support reverse-mode AD.
             J_func = torch.func.jacfwd(slm.circuit_probs_from_free_params, argnums=argnums)
@@ -258,7 +288,14 @@ def _bulk_fill_dprobs(self, array_to_fill, layout, pr_array_to_fill) -> None:
         # have a need to override the default in the future then we'd need to override
         # the ForwardSimulator function(s) that call self._bulk_fill_dprobs(...).

+        # import time
+        # print('Calling J_func at current free_params')
+        # tic = time.time()
+        # with profile(activities=[ProfilerActivity.CPU], profile_memory=True) as prof:
         J_val = J_func(*free_params)
+        # toc = time.time()
+        # print()
+        # print(f'Done! --> {toc - tic} seconds elapsed')
         J_val = torch.column_stack(J_val)
         array_to_fill[:] = J_val.cpu().detach().numpy()
         return
diff --git a/pygsti/modelmembers/operations/fulltpop.py b/pygsti/modelmembers/operations/fulltpop.py
index 16866b893..f77307ad7 100644
--- a/pygsti/modelmembers/operations/fulltpop.py
+++ b/pygsti/modelmembers/operations/fulltpop.py
@@ -164,16 +164,18 @@ def from_vector(self, v, close=False, dirty_value=True):
         self._ptr_has_changed()  # because _rep.base == _ptr (same memory)
         self.dirty = dirty_value

-    def stateless_data(self) -> Tuple[int]:
-        return (self.dim,)
-
-    @staticmethod
-    def torch_base(sd: Tuple[int], t_param: _torch.Tensor) -> _torch.Tensor:
-        dim = sd[0]
+    def stateless_data(self) -> Tuple[int, _torch.Tensor]:
+        dim = self.dim
         t_const = _torch.zeros(size=(1, dim), dtype=_torch.double)
         t_const[0,0] = 1.0
-        t_param_mat = t_param.reshape((dim - 1, dim))
+        return (dim, t_const)
+
+    @staticmethod
+    def torch_base(sd: Tuple[int, _torch.Tensor], t_param: _torch.Tensor) -> _torch.Tensor:
+        dim, t_const = sd
+        t_param_mat = t_param.view(dim - 1, dim)
         t = _torch.row_stack((t_const, t_param_mat))
+        # TODO: cache the row of all zeros?
         return t
diff --git a/pygsti/modelmembers/povms/conjugatedeffect.py b/pygsti/modelmembers/povms/conjugatedeffect.py
index 5af305a44..3b0b5ddec 100644
--- a/pygsti/modelmembers/povms/conjugatedeffect.py
+++ b/pygsti/modelmembers/povms/conjugatedeffect.py
@@ -80,6 +80,12 @@ def __setitem__(self, key, val):
         ret = self.columnvec.__setitem__(key, val)
         self._ptr_has_changed()
         return ret
+
+    def __getstate__(self):
+        return self.__dict__
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)

     def __getattr__(self, attr):
         #use __dict__ so no chance for recursive __getattr__
diff --git a/pygsti/modelmembers/povms/tppovm.py b/pygsti/modelmembers/povms/tppovm.py
index 80753385f..1183f5e3e 100644
--- a/pygsti/modelmembers/povms/tppovm.py
+++ b/pygsti/modelmembers/povms/tppovm.py
@@ -102,29 +102,28 @@ def to_vector(self):
         vec = _np.concatenate(effect_vecs)
         return vec

-    def stateless_data(self) -> Tuple[int, _np.ndarray]:
+    def stateless_data(self) -> Tuple[int, _torch.Tensor, int]:
         num_effects = len(self)
         complement_effect = self[self.complement_label]
         identity = complement_effect.identity.to_vector()
-        return (num_effects, identity)
-
-    @staticmethod
-    def torch_base(sd: Tuple[int, _np.ndarray], t_param: _torch.Tensor) -> _torch.Tensor:
-        num_effects, identity = sd
+        identity = identity.reshape((1, -1))  # make into a row vector
+        t_identity = _torch.from_numpy(identity)
+
         dim = identity.size
-
-        first_basis_vec = _np.zeros(dim)
-        first_basis_vec[0] = dim ** 0.25
+        first_basis_vec = _np.zeros((1,dim))
+        first_basis_vec[0,0] = dim ** 0.25

         TOL = 1e-15 * _np.sqrt(dim)
         if _np.linalg.norm(first_basis_vec - identity) > TOL:
             # Don't error out. The documentation for the class
             # clearly indicates that the meaning of "identity"
             # can be nonstandard.
             warnings.warn('Unexpected normalization!')
+        return (num_effects, t_identity, dim)

-        identity = identity.reshape((1, -1))  # make into a row vector
-        t_identity = _torch.from_numpy(identity)
-        t_param_mat = t_param.reshape((num_effects - 1, dim))
+    @staticmethod
+    def torch_base(sd: Tuple[int, _torch.Tensor, int], t_param: _torch.Tensor) -> _torch.Tensor:
+        num_effects, t_identity, dim = sd
+        t_param_mat = t_param.view(num_effects - 1, dim)
         t_func = t_identity - t_param_mat.sum(axis=0, keepdim=True)
         t = _torch.row_stack((t_param_mat, t_func))
         return t
diff --git a/pygsti/modelmembers/states/densestate.py b/pygsti/modelmembers/states/densestate.py
index 3c7df543f..2d9b17fc0 100644
--- a/pygsti/modelmembers/states/densestate.py
+++ b/pygsti/modelmembers/states/densestate.py
@@ -100,6 +100,12 @@ def __setitem__(self, key, val):
         ret = self.columnvec.__setitem__(key, val)
         self._ptr_has_changed()
         return ret
+
+    def __getstate__(self):
+        return self.__dict__
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)

     def __getattr__(self, attr):
         #use __dict__ so no chance for recursive __getattr__
diff --git a/pygsti/modelmembers/states/tpstate.py b/pygsti/modelmembers/states/tpstate.py
index 659d6da24..c74c49d78 100644
--- a/pygsti/modelmembers/states/tpstate.py
+++ b/pygsti/modelmembers/states/tpstate.py
@@ -166,13 +166,14 @@ def from_vector(self, v, close=False, dirty_value=True):
         self._ptr_has_changed()
         self.dirty = dirty_value

-    def stateless_data(self) -> Tuple[int]:
-        return (self.dim,)
+    def stateless_data(self) -> Tuple[_torch.Tensor]:
+        dim = self.dim
+        t_const = (dim ** -0.25) * _torch.ones(1, dtype=_torch.double)
+        return (t_const,)

     @staticmethod
-    def torch_base(sd: Tuple[int], t_param: _torch.Tensor) -> _torch.Tensor:
-        dim = sd[0]
-        t_const = (dim ** -0.25) * _torch.ones(1, dtype=_torch.double)
+    def torch_base(sd: Tuple[_torch.Tensor], t_param: _torch.Tensor) -> _torch.Tensor:
+        t_const = sd[0]
         t = _torch.concat((t_const, t_param))
         return t
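
Note on the stateless_data / torch_base split in the hunks above: constant tensors (the fixed first row of a TP operation, the TPPOVM identity row, the dim ** -0.25 entry of a TP state) are now built once in stateless_data and only reused inside torch_base, so they are not reconstructed on every evaluation that torch.func.jacfwd / jacrev traces. The sketch below is a simplified, self-contained illustration of that pattern; the free-standing stateless_data / torch_base functions and the driver code are hypothetical stand-ins for this illustration, not pyGSTi API.

# Illustrative sketch only (hypothetical names, not pyGSTi API).
import torch


def stateless_data(dim):
    # Constant part: built once, outside the function being differentiated.
    t_const = (dim ** -0.25) * torch.ones(1, dtype=torch.double)
    return (t_const,)


def torch_base(sd, t_param):
    # Differentiable part: only tensor algebra on the traced parameter tensor.
    (t_const,) = sd
    return torch.concat((t_const, t_param))


dim = 4
sd = stateless_data(dim)                      # reused across every Jacobian call
free_params = torch.zeros(dim - 1, dtype=torch.double)

# Forward-mode Jacobian of the parameter -> superket map (jacrev works the same way).
J = torch.func.jacfwd(lambda fp: torch_base(sd, fp))(free_params)
print(J.shape)  # torch.Size([4, 3])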