diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index b7f93ea7..810edf5f 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -11,13 +11,16 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] + python-version: ['3.12', '3.13', '3.14'] name: Tests - Python ${{ matrix.python-version }} steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v3 with: - python-version: ${{ matrix.python-version}} + enable-cache: true + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4.1.0 with: @@ -29,8 +32,8 @@ jobs: - name: Who am I? run: aws sts get-caller-identity - name: Install buildstock_query - run: pip install -e .[dev] + run: uv sync --group dev --python ${{ matrix.python-version }} - name: Pytest - run: python -m pytest -vv + run: uv run --python ${{ matrix.python-version }} pytest -vv - name: Lint - run: flake8 buildstock_query + run: uv run --python ${{ matrix.python-version }} flake8 buildstock_query diff --git a/.gitignore b/.gitignore index 03a5f798..1cfeebaf 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,23 @@ poetry.lock build/ *.egg-info *.parquet +# Track snapshot-cache parquets so the data-check survives a fresh clone. +!tests/query_snapshots/**/*.parquet +!tests/query_snapshots/**/*.sql +!tests/query_snapshots/**/*.json *.txt *.csv *.yml .bsq_cache +# Transient query-execution audit log written by query_core; not snapshot data. +.execution_history +# Per-session SqlCache hash log — populated by SqlCache.get/put, consumed by +# tests/cleanup_stale_caches.py to identify orphaned cache entries. 
+.cache_usage_log +# Per-session record_query JSONL log — populated by tests/snapshot_recorder.py, +# consumed by tests/normalize_invariant_snapshot.py to update from_invariants.json. +.from_invariants_log.jsonl +# Local-only test cache: full metadata parquets downloaded from S3 (~hundreds +# of MB) for pure-pandas methods like get_applied_options. Tests that use this +# cache require --include-local; default behavior is to skip them in CI. +tests/local_only/cache/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..74c6b89a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: check-added-large-files + args: ["--maxkb=500"] + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 + hooks: + - id: ruff + args: ["--fix"] + - id: ruff-format diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..e4fba218 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/buildstock_query/__init__.py b/buildstock_query/__init__.py index 66f7aba7..1f93b68a 100644 --- a/buildstock_query/__init__.py +++ b/buildstock_query/__init__.py @@ -3,31 +3,27 @@ - - - - - - - - - A library to run AWS Athena queries to get various data from a BuildStock run. The main class is called BuildStockQuery. An object of BuildStockQuery needs to be created to perform various queries. In addition to supporting various -query member functions, the BuildStockQuery object contains 4 member objects that can be used to perform certain -class of queries and analysis. These 4 member objects can be accessed as follows:: +query member functions, the BuildStockQuery object contains 3 member objects that can be used to perform certain +class of queries and analysis. 
These member objects can be accessed as follows:: -bsq = BuildStockQuery(...) `BuildStockQuery` object -bsq.agg `buildstock_query.aggregate_query.BuildStockAggregate` -bsq.report `buildstock_query.report_query.BuildStockReport` -bsq.savings `buildstock_query.savings_query.BuildStockSavings` -bsq.utility `buildstock_query.utility_query.BuildStockUtility` +bsq = BuildStockQuery(...) `BuildStockQuery` object +bsq.agg `buildstock_query.aggregate_query.BuildStockAggregate` +bsq.report `buildstock_query.report_query.BuildStockReport` +bsq.utility `buildstock_query.utility_query.BuildStockUtility` ``` -# Some basic query can be done directly using the BuildStockQuery object. For example: -from buildstock_query import BuildStockQuery +# The core query API lives on the BuildStockQuery object itself: +from buildstock_query import BuildStockQuery bsq = BuildStockQuery(...) bsq.get_results_csv() bsq.get_upgrades_csv() +bsq.query(enduses=[...], annual_only=True, ...) # annual baseline / upgrade results +bsq.query(enduses=[...], annual_only=False, ...) # timeseries aggregations +bsq.query(enduses=[...], upgrade_id="1", include_savings=True) # savings shape -# Other more specific queries can be done using specific query class objects. For example: -bsq.agg.aggregate_annual(...) -bsq.agg.aggregate_timeseries(...) -... +# Reports and utility-specific helpers: bsq.report.get_success_report(...) bsq.report.get_successful_simulation_count(...) -... -bsq.savings.savings_shape(...) -... bsq.utility.aggregate_annual_by_eiaid(...) 
``` diff --git a/buildstock_query/aggregate_query.py b/buildstock_query/aggregate_query.py index 72202b7e..a15c2b43 100644 --- a/buildstock_query/aggregate_query.py +++ b/buildstock_query/aggregate_query.py @@ -4,12 +4,14 @@ import numpy as np import logging from buildstock_query import main -from buildstock_query.schema.query_params import BaseQuery, TSQuery, Query +from buildstock_query.schema.query_params import Query import pandas as pd from buildstock_query.schema.helpers import gather_params -from typing import Union +from typing import Literal, Optional, Union from collections.abc import Sequence -from buildstock_query.schema.utilities import DBColType, RestrictTuple, validate_arguments +from buildstock_query.schema.utilities import ( + DBColType, RestrictTuple, SALabel, typed_literal, validate_arguments, +) from pydantic import Field logging.basicConfig(level=logging.INFO) @@ -17,6 +19,13 @@ FUELS = ["electricity", "natural_gas", "propane", "fuel_oil", "coal", "wood_cord", "wood_pellets"] +class UnsupportedQueryShape(NotImplementedError): + """Raised when the requested query shape is known to be unsupported on the + current schema. Caught by the snapshot harness and treated as a skip rather + than a failure. 
+ """ + + class BuildStockAggregate: """A class to do aggregation queries for both timeseries and annual results.""" @@ -30,325 +39,409 @@ def __get_timeseries_bs_up_table( upgrade_id: str, applied_only: bool | None, restrict: Sequence[RestrictTuple] = Field(default_factory=list), + avoid: Sequence[RestrictTuple] = Field(default_factory=list), + bs_restrict: Sequence[RestrictTuple] = Field(default_factory=list), + bs_avoid: Sequence[RestrictTuple] = Field(default_factory=list), group_by: Sequence[DBColType] = Field(default_factory=list), + upgrade_only: bool = False, + timestamp_grouping_func: Optional[str] = None, + total_weight=None, + extra_bs_cols: Optional[Sequence[sa.Column]] = None, + skip_bs_per_bldg: bool = False, + join_list: Optional[Sequence[tuple]] = None, + join_list_restrict: Optional[Sequence[RestrictTuple]] = None, + join_list_group_by_cols: Optional[Sequence[sa.Column]] = None, ): if self._bsq.ts_table is None: raise ValueError("No timeseries table found in database.") ts = self._bsq.ts_table - base = self._bsq.bs_table - ucol = self._bsq._ts_upgrade_col + base = self._bsq.bs_table # canonical alias of md_table + + # Push any user-supplied bs_restrict (e.g. comstock `state='CO'`) into the + # inner ts ⋈ bs join condition. Without this, Athena scans the full metadata + # table before applying user filters — for comstock's tract-denormalized + # metadata that's the difference between minutes and timeouts. Adding to + # the JOIN ON clause (rather than wrapping bs in another subquery) keeps + # the SELECT list clean and lets Athena push the predicate into the bs + # table scan without enumerating all columns. + bs_restrict_clauses = self._bsq._get_restrict_clauses(bs_restrict, annual_only=True) + # bs-side avoid clauses (NOT IN / != predicates targeting metadata-side + # columns) get folded into bs_per_bldg's WHERE the same way bs_restrict + # is. 
Without this, an outer-level _add_avoid would resolve the bs col + # to self.bs_table and SA would introduce that table via a comma-join + # against the outer FROM (which is ts_aggr ⋈ bs_per_bldg, no bs). + bs_avoid_clauses = self._bsq._get_avoid_clauses(bs_avoid, annual_only=True) + + # Unified two-level shape used for both single-upgrade and + # upgrade-pair queries. + # + # ts_flat: per-row scalar projection. Each enduse expression + # (whether bare ts column or calc-col Label) is materialized as + # `ts__`. This pushes arithmetic into the scan layer. + # ts_aggr: per-(bldg_id, bucketed_time, state, ...) aggregate. + # Single-upgrade: SUM(ts__name) → bs__. Upgrade-pair: + # SUM(...) FILTER (WHERE upgrade=0/N) → bs__ / up__. + # outer: JOIN to bs (once) for weights/metadata, then user's GROUP BY. + # + # Pre-bucketing time at ts_aggr cuts the per-bldg shuffle key + # cardinality by 4×/96×/720×/35000× for hourly/daily/monthly/yearly. + # The shuffle is what made the old upgrade-pair pivot time out on + # national hourly queries and what slows down baseline TS queries + # at the same scale. 
+ single_upgrade = upgrade_id == "0" or upgrade_only + ts_upgrade_ids = [upgrade_id] if single_upgrade else ["0", upgrade_id] - if upgrade_id == "0": - # For baseline, return original tables with group_by as-is - if self._bsq.up_table is None: # There are no upgrades so just return the timeseries table as is - tbljoin = ts.join( - base, - sa.and_( - self._bsq.bs_bldgid_column == self._bsq.ts_bldgid_column, - *self._bsq._get_restrict_clauses(restrict, annual_only=True), - ), - ) - else: - tbljoin = ts.join( - base, - sa.and_( - self._bsq.bs_bldgid_column == self._bsq.ts_bldgid_column, - ucol == upgrade_id, - *self._bsq._get_restrict_clauses(restrict, annual_only=True), - ), - ) - return ts, ts, tbljoin, list(group_by) - - # For upgrades, create subqueries with proper joins - # Split group_by into columns from timeseries vs baseline tables ts_group_by = [g for g in group_by if g.name in ts.columns] bs_group_by = [g for g in group_by if g.name not in ts.columns] - # Build column list for subquery - must_have_col_names = [self._bsq.building_id_column_name, self._bsq.timestamp_column_name] - must_have_cols = [ts.c[col_name] for col_name in must_have_col_names] - ts_group_cols = [g for g in ts_group_by if g.name not in must_have_col_names] - group_col_names = [g.name for g in ts_group_cols] - enduse_cols = [e for e in enduses if e.name not in must_have_col_names + group_col_names] - - # Include all necessary columns in the subquery - subquery_cols = must_have_cols + ts_group_cols + bs_group_by + enduse_cols - - # Create subquery with proper join to baseline table - subquery_base = sa.select(*subquery_cols).select_from( - ts.join(base, ts.c[self._bsq.building_id_column_name] == base.c[self._bsq.building_id_column_name]) - ) - ts_b = self._bsq._add_restrict(subquery_base, [[ucol, "0"], *restrict]).alias("ts_b") - ts_u = self._bsq._add_restrict(subquery_base, [[ucol, upgrade_id], *restrict]).alias("ts_u") - - # Remap group_by columns to reference the subquery alias - 
remapped_group_by = [ts_b.c[g.name] for g in group_by] - - # Create the table join - if applied_only: - tbljoin = ts_b.join( - ts_u, - sa.and_( - ts_b.c[self._bsq.building_id_column_name] == ts_u.c[self._bsq.building_id_column_name], - ts_b.c[self._bsq.timestamp_column_name] == ts_u.c[self._bsq.timestamp_column_name], - ), - ).join(base, ts_b.c[self._bsq.building_id_column_name] == base.c[self._bsq.building_id_column_name]) + ts_unique_keys = self._bsq._get_unique_keys("timeseries") + timestamp_col = self._bsq.timestamp_column_name + # When the user asks for year-collapse (`timestamp_grouping_func='year'`) + # the outer query never references timestamp — it's a single value per + # building. Carrying timestamp through the inner ts_flat / ts_aggr + # forces Athena to materialize the truncated timestamp before the + # inner GROUP BY, blocking partial-aggregation pushdown into the scan + # (a year-collapse query that previously scanned 1.1 GB ballooned to + # 4.2 GB after the unification). Skip the timestamp dimension entirely + # at the inner level for this case; the outer query collapses time + # via SUM regardless. + collapse_inner_time = timestamp_grouping_func == "year" + # Order keys for hash distribution: partition columns (typically + # `state`) first, then timestamp, then bldg_id last. Trino hashes + # by leftmost columns when shuffling for GROUP BY; partition-aligned + # ordering lets it distribute work along the parquet's existing + # layout instead of fighting it. 
+ partition_cols = [k for k in ts_unique_keys if k != self._bsq.building_id_column_name] + ts_key_names_pieces = [*partition_cols] + if not collapse_inner_time: + ts_key_names_pieces.append(timestamp_col) + ts_key_names_pieces.append(self._bsq.building_id_column_name) + ts_key_names = list(dict.fromkeys(ts_key_names_pieces)) + ts_extra_group_names = [g.name for g in ts_group_by if g.name not in ts_key_names] + + # Bucketed time expression — pushed into ts_flat so ts_aggr GROUPs BY + # coarse buckets, not raw 15-min timestamps. Only built when we + # actually carry timestamp through the inner shape. + if timestamp_grouping_func and not collapse_inner_time: + sim_info = self._bsq._get_simulation_info() + raw_time = ts.c[timestamp_col] + if sim_info.offset > 0: + bucketed_time_expr = sa.func.date_trunc( + timestamp_grouping_func, + sa.func.date_add(sim_info.unit, -sim_info.offset, raw_time), + ) + else: + bucketed_time_expr = sa.func.date_trunc(timestamp_grouping_func, raw_time) + elif not collapse_inner_time: + bucketed_time_expr = ts.c[timestamp_col] else: - tbljoin = ts_b.outerjoin( - ts_u, - sa.and_( - ts_b.c[self._bsq.building_id_column_name] == ts_u.c[self._bsq.building_id_column_name], - ts_b.c[self._bsq.timestamp_column_name] == ts_u.c[self._bsq.timestamp_column_name], - ), - ).join(base, ts_b.c[self._bsq.building_id_column_name] == base.c[self._bsq.building_id_column_name]) - - return ts_b, ts_u, tbljoin, remapped_group_by - - @validate_arguments - def __get_annual_bs_up_table(self, upgrade_id: str, applied_only: bool | None): - if upgrade_id == "0": - return self._bsq.bs_table, self._bsq.bs_table, self._bsq.bs_table - - if self._bsq.up_table is None: - raise ValueError("No upgrades table found in database.") - if applied_only: - tbljoin = self._bsq.bs_table.join( - self._bsq.up_table, - sa.and_( - self._bsq.bs_table.c[self._bsq.building_id_column_name] - == self._bsq.up_table.c[self._bsq.building_id_column_name], - self._bsq._up_upgrade_col == upgrade_id, - 
self._bsq._up_successful_condition, - ), + bucketed_time_expr = None + + ts_restrict_clauses = self._bsq._get_restrict_clauses(restrict, annual_only=False) + # ts-side avoid (NOT IN / != on ts columns) is symmetric to ts_restrict + # at this layer — apply as additional WHERE clauses on ts_flat. + ts_avoid_clauses = self._bsq._get_avoid_clauses(avoid, annual_only=False) + + # Classify each enduse by which table(s) its leaf columns reference: + # - ts-only: every leaf is on ts. Routed through ts_flat / ts_aggr. + # - pure-bs: every leaf is on bs (no ts refs). Skips ts_flat entirely; + # projected at the outer SELECT directly. This is the right path + # for characteristic columns (sqft, vintage, etc.) — constant per + # bldg, no need to materialize per-15-min and re-aggregate. + # - mixed: at least one ts and one bs leaf. Routed through ts_flat + # with bs joined in (preserves today's inner-join shape). + from sqlalchemy.sql import visitors + + def _classify(expr): + target = expr.element if isinstance(expr, SALabel) else expr + ts_refs, bs_refs = [], [] + + def _visit(elem): + if isinstance(elem, sa.Column): + t = getattr(elem, "table", None) + if t is ts: + ts_refs.append(elem) + elif t is not None: + bs_refs.append(elem) + visitors.traverse(target, {}, {"column": _visit}) + if ts_refs and bs_refs: + return "mixed" + if bs_refs: + return "pure_bs" + return "ts_only" + + ts_only_enduses, bs_only_enduses, mixed_enduses = [], [], [] + for e in enduses: + kind = _classify(e) + if kind == "ts_only": + ts_only_enduses.append(e) + elif kind == "pure_bs": + bs_only_enduses.append(e) + else: + mixed_enduses.append(e) + + flat_enduses = ts_only_enduses + mixed_enduses + needs_bs_in_flat = bool(mixed_enduses) + + # Innermost flat subquery: precomputed scalars per ts row. Pure-bs + # enduses are NOT projected here — they go straight to the outer + # SELECT. 
`upgrade` is projected only for the upgrade-pair case + # (where ts_aggr uses FILTER per side); single-upgrade filters at + # ts_flat WHERE and skips the column. + flat_select_cols = [ + ts.c[k].label(k) for k in ts_key_names if k != timestamp_col + ] + if not collapse_inner_time: + flat_select_cols.append(bucketed_time_expr.label(timestamp_col)) + flat_select_cols.extend([ts.c[name].label(name) for name in ts_extra_group_names]) + if not single_upgrade: + flat_select_cols.append(ts.c["upgrade"].label("upgrade")) + for e in flat_enduses: + value_expr = e.element if isinstance(e, SALabel) else e + flat_select_cols.append(value_expr.label(f"ts__{e.name}")) + + # FROM: ts alone unless we have a mixed enduse referencing bs from + # within an arithmetic expression. _baseline_timeseries_join_condition + # bakes in bs.upgrade=0. + if needs_bs_in_flat: + flat_from = ts.join( + base, + self._bsq._baseline_timeseries_join_condition(base, ts), ) else: - tbljoin = self._bsq.bs_table.outerjoin( - self._bsq.up_table, - sa.and_( - self._bsq.bs_table.c[self._bsq.building_id_column_name] - == self._bsq.up_table.c[self._bsq.building_id_column_name], - self._bsq._up_upgrade_col == upgrade_id, - self._bsq._up_successful_condition, - ), + flat_from = ts + + ts_flat_subq = ( + sa.select(*flat_select_cols) + .select_from(flat_from) + .where( + ts.c["upgrade"].in_([typed_literal(ts.c["upgrade"], u) for u in ts_upgrade_ids]), + *ts_restrict_clauses, + *ts_avoid_clauses, ) + .subquery("ts_flat") + ) - return self._bsq.bs_table, self._bsq.up_table, tbljoin - - @gather_params(BaseQuery) - def aggregate_annual(self, *, params: BaseQuery): - join_list = list(params.join_list) if params.join_list else [] - weights = list(params.weights) if params.weights else [] - restrict = list(params.restrict) if params.restrict else [] - - [self._bsq._get_table(jl[0]) for jl in join_list] # ingress all tables in join list - if params.upgrade_id in {None, 0, "0"}: - enduse_cols = 
self._bsq._get_enduse_cols(params.enduses, table="baseline") - upgrade_id = None + # ts_aggr: per-(bldg, bucket, state, ...) aggregate over flat_enduses. + # Pure-bs enduses are not in flat_enduses, so they don't appear in + # ts_aggr — they get projected directly at the outer SELECT. + flat_group_keys = [ts_flat_subq.c[k] for k in ts_key_names] + flat_extra_group_cols = [ts_flat_subq.c[name] for name in ts_extra_group_names] + + enduse_aggr_cols = [] + if single_upgrade: + for e in flat_enduses: + v = ts_flat_subq.c[f"ts__{e.name}"] + enduse_aggr_cols.append(safunc.sum(v).label(f"bs__{e.name}")) + inner_rows = safunc.count(sa.text("*")).label("_inner_rows") else: - upgrade_id = self._bsq._validate_upgrade(params.upgrade_id) - enduse_cols = self._bsq._get_enduse_cols(params.enduses, table="upgrade") - total_weight = self._bsq._get_weight(weights) - agg_func, agg_weight = self._bsq._get_agg_func_and_weight(weights, params.agg_func) - enduse_selection = [ - agg_func(enduse if agg_weight is None else enduse * agg_weight).label( - self._bsq._simple_label(enduse.name, params.agg_func) - ) - for enduse in enduse_cols - ] - if params.get_quartiles: - enduse_selection += [ - sa.func.approx_percentile(enduse, [0, 0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98, 1]).label( - f"{self._bsq._simple_label(enduse.name)}__quartiles" - ) - for enduse in enduse_cols - ] - enduse_selection += [ - sa.func.approx_percentile(enduse, [0, 0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98, 1]).filter( - enduse != 0 - ).label( - f"{self._bsq._simple_label(enduse.name)}__nonzero_quartiles" - ) - for enduse in enduse_cols - ] - - if params.get_nonzero_count: - enduse_selection += [ - safunc.sum(sa.case((safunc.coalesce(enduse, 0) != 0, 1), else_=0) * total_weight).label( - f"{self._bsq._simple_label(enduse.name)}__nonzero_units_count" - ) - for enduse in enduse_cols - ] - - grouping_metrics_selection = [ - safunc.sum(1).label("sample_count"), - safunc.sum(total_weight).label("units_count"), - ] + bs_filter = 
ts_flat_subq.c["upgrade"] == typed_literal(ts.c["upgrade"], "0") + up_filter = ts_flat_subq.c["upgrade"] == typed_literal(ts.c["upgrade"], upgrade_id) + for e in flat_enduses: + v = ts_flat_subq.c[f"ts__{e.name}"] + enduse_aggr_cols.append(safunc.sum(v).filter(bs_filter).label(f"bs__{e.name}")) + enduse_aggr_cols.append(safunc.sum(v).filter(up_filter).label(f"up__{e.name}")) + inner_rows = safunc.count(sa.text("*")).filter(bs_filter).label("_inner_rows") + + ts_aggr_subq = ( + sa.select(*flat_group_keys, *flat_extra_group_cols, *enduse_aggr_cols, inner_rows) + .select_from(ts_flat_subq) + .group_by(*flat_group_keys, *flat_extra_group_cols) + .subquery("ts_aggr") + ) - if not params.group_by: - query = sa.select(*(grouping_metrics_selection + enduse_selection)) - group_by_selection = [] - else: - group_by_selection = self._bsq._process_groupby_cols(params.group_by, annual_only=True) - query = sa.select(*(group_by_selection + grouping_metrics_selection + enduse_selection)) - # jj = self.bs_table.join(self.ts_table, self.ts_table.c['building_id']==self.bs_table.c['building_id']) - # self._compile(query.select_from(jj)) - if upgrade_id not in [None, 0, "0"]: - if self._bsq.up_table is None: - raise ValueError("The run doesn't contain upgrades") - tbljoin = self._bsq.bs_table.join( - self._bsq.up_table, - sa.and_( - self._bsq.bs_table.c[self._bsq.building_id_column_name] - == self._bsq.up_table.c[self._bsq.building_id_column_name], - self._bsq.up_table.c["upgrade"] == str(upgrade_id), - self._bsq._up_successful_condition, - ), + # Pre-aggregate bs to BUILDING grain (collapse tract fan-out). + # + # ComStock's `*_md_by_state_and_county_parquet` has multiple tract rows + # per (bldg_id, state) pair. A direct `ts ⋈ bs` JOIN fans out each + # ts/ts_aggr row by N_tracts-per-bldg, blowing up the post-join shuffle + # for the outer aggregate (Stage 5 of a national hourly query + # processed 6.28 B rows / 499 GB and aborted at 17h33m before this fix). 
+ # + # All current outer aggregations are linear in `weight`, so we can + # collapse the tract dimension upfront: + # bldg_weight = SUM(weight) per (bldg, state) + # tract_count = COUNT(*) per (bldg, state) + # bldg__weighted = SUM(*weight) per (bldg, state) + # Outer aggregates that used `bs.weight` / `bs. * bs.weight` / + # `count_distinct(md_keys)` translate to references on this + # subquery's pre-summed columns. ResStock's md is one row per bldg, + # so the GROUP BY is a no-op there (sum of one term). + # + # `total_weight` was constructed above as `bs.weight × user_weights` + # bound to bs_table; we pass it in here so its multipliers are + # baked into bldg_weight before the outer SELECT references it. + bs_per_bldg_cols = [base.c[k].label(k) for k in ts_unique_keys] + # Carry bs-side group-by columns as TRUE GROUP BY keys of bs_per_bldg + # — NOT as `arbitrary()` collapsed values. ComStock's md is partitioned + # at (bldg_id, tract, state) granularity with `weight` divided across + # tract rows; a building's tracts can map to different counties. If we + # collapse to one row per (bldg, state) and pick `arbitrary(county)`, + # the outer SUM(weight × value) would attribute the FULL building to + # whichever county was picked — silently dropping the tract-fractional + # disaggregation that the data model encodes. Grouping bs_per_bldg by + # (bldg, state, ) preserves per-tract slices: each + # county/region a building straddles gets its proportional weight. 
+ bs_per_bldg_extra_group_exprs = [] + for g in bs_group_by: + if g.name in ts_unique_keys: + continue + underlying = g.element if isinstance(g, SALabel) else g + bs_per_bldg_cols.append(underlying.label(g.name)) + bs_per_bldg_extra_group_exprs.append(underlying) + weight_expr = total_weight if total_weight is not None else base.c["weight"] + bs_per_bldg_cols.append(safunc.sum(weight_expr).label("bldg_weight")) + bs_per_bldg_cols.append(safunc.count(sa.text("*")).label("tract_count")) + # Pure-bs enduses (e.g. sqft, vintage) are per-bldg constants — pick + # one value per bldg via `arbitrary()` (Trino's any-value aggregate). + # The outer SELECT can then multiply by bldg_weight uniformly with + # everything else, no special path needed. + for e in bs_only_enduses: + value_expr = e.element if isinstance(e, SALabel) else e + bs_per_bldg_cols.append( + safunc.arbitrary(value_expr).label(e.name) ) - query = query.select_from(tbljoin) - - restrict = [(self._bsq._bs_completed_status_col, [self._bsq.db_schema.completion_values.success])] + restrict - query = self._bsq._add_join(query, join_list) - query = self._bsq._add_restrict(query, restrict) - query = self._bsq._add_avoid(query, params.avoid) - query = self._bsq._add_group_by(query, group_by_selection) - query = self._bsq._add_order_by(query, group_by_selection if params.sort else []) - - if params.get_query_only: - return self._bsq._compile(query) - - return self._bsq.execute(query) - - def _aggregate_timeseries_light(self, params: TSQuery): - """ - Lighter version of aggregate_timeseries where each enduse is submitted as a separate query to be light on - Athena. For information on the input parameters, check the documentation on aggregate_timeseries. - """ + # Extra bs columns the outer query needs (typically the left-side + # of join_list joins, e.g. `bs.in.county` for the utility eiaid + # join). 
Same semantics as bs_only_enduses: per-bldg constants + # collapsed via arbitrary(), labeled with the original column name + # so `_add_join`'s `bs.` reference resolves on bs_per_bldg. + for ec in extra_bs_cols or (): + if ec.name in {c.name for c in bs_per_bldg_cols}: + continue + bs_per_bldg_cols.append(safunc.arbitrary(ec).label(ec.name)) + + # Fold any join_list joins (e.g. utility eiaid_weights) INTO bs_per_bldg's + # FROM. They're metadata-side extensions of bs (eiaid is a + # per-county-per-bldg attribute), so absorbing them here keeps the + # outer query a clean ts_aggr ⋈ bs_per_bldg shape with no extra + # outer JOINs. Restricts and group-bys targeting these tables are + # routed via `extra_bs_cols` (per-bldg via arbitrary()) and + # `join_list_restrict` (added to bs_per_bldg's WHERE). + bs_per_bldg_from = base + for new_table_name, baseline_col, new_col in (join_list or ()): + jl_table = self._bsq._get_table(new_table_name) + # Resolve baseline_col on the canonical bs alias (so ON's left + # side binds to base, which is in bs_per_bldg's FROM). 
+ if isinstance(baseline_col, str): + bs_side = base.c[baseline_col] + elif isinstance(baseline_col, sa.Column) and baseline_col.name in base.c: + bs_side = base.c[baseline_col.name] + else: + bs_side = baseline_col + new_side = self._bsq._get_column(new_col, candidate_tables=[jl_table]) + bs_per_bldg_from = bs_per_bldg_from.join(jl_table, bs_side == new_side) - enduse_cols = self._bsq._get_enduse_cols(params.enduses, table="timeseries") - batch_queries_to_submit = [] - for enduse in enduse_cols: - new_query = params.copy() - new_query.enduses = [enduse.name] - new_query.split_enduses = False - query = self.aggregate_timeseries(params=new_query) - batch_queries_to_submit.append(query) + join_list_restrict_clauses = ( + self._bsq._get_restrict_clauses(join_list_restrict, annual_only=True) + if join_list_restrict else [] + ) - if params.get_query_only: - logger.warning( - "Not recommended to use get_query_only and split_enduses used together." - " The results from the queries cannot be directly combined to get the desired result." - " There are further processing done in the function. The queries should be used for" - " information or debugging purpose only. Use get_query_only=False to get proper result." 
+ bs_per_bldg = ( + sa.select(*bs_per_bldg_cols) + .select_from(bs_per_bldg_from) + .where( + self._bsq._upgrade_zero_filter(base), + *bs_restrict_clauses, + *bs_avoid_clauses, + *join_list_restrict_clauses, ) - return batch_queries_to_submit - - batch_query_id = self._bsq.submit_batch_query(batch_queries_to_submit) - - result_dfs = self._bsq.get_batch_query_result(batch_id=batch_query_id, combine=False) - logger.info("Joining the individual enduses result into a single DataFrame") - group_by = self._bsq._clean_group_by(params.group_by) - for res in result_dfs: - res.set_index(group_by, inplace=True) - self.result_dfs = result_dfs - joined_enduses_df = result_dfs[0].drop(columns=["query_id"]) - for enduse, res in list(zip(params.enduses, result_dfs))[1:]: - if not isinstance(enduse, str): - enduse = enduse.name - joined_enduses_df = joined_enduses_df.join(res[[enduse]]) - - logger.info("Joining Completed.") - return joined_enduses_df.reset_index() - - @gather_params(TSQuery) - def aggregate_timeseries(self, params: TSQuery): - if self._bsq.ts_table is None: - raise ValueError("Not timeseries table available") - - upgrade_id = self._bsq._validate_upgrade(params.upgrade_id) - if params.timestamp_grouping_func and params.timestamp_grouping_func not in ["hour", "day", "month"]: - raise ValueError("timestamp_grouping_func must be one of ['hour', 'day', 'month']") - - if params.split_enduses: - return self._aggregate_timeseries_light(params) - [self._bsq._get_table(jl[0]) for jl in params.join_list] # ingress all tables in join list - enduses_cols = self._bsq._get_enduse_cols(params.enduses, table="timeseries") - total_weight = self._bsq._get_weight(params.weights) - agg_func, agg_weight = self._bsq._get_agg_func_and_weight(params.weights, params.agg_func) - enduse_selection = [ - agg_func(enduse if agg_weight is None else enduse * agg_weight).label( - self._bsq._simple_label(enduse.name, params.agg_func) + .group_by( + *(base.c[k] for k in ts_unique_keys), + 
*bs_per_bldg_extra_group_exprs, ) - for enduse in enduses_cols - ] - group_by = list(params.group_by) - if self._bsq.timestamp_column_name not in group_by and params.collapse_ts: - logger.info("Aggregation done across timestamps. Result no longer a timeseries.") - # The aggregation is done across time so we should correct sample_count and units_count - rows_per_building = self._bsq._get_rows_per_building() - grouping_metrics_selection = [ - (safunc.sum(1) / rows_per_building).label("sample_count"), - safunc.sum(total_weight / rows_per_building).label("units_count"), - ] - elif self._bsq.timestamp_column_name not in group_by: - group_by.append(self._bsq.timestamp_column_name) - grouping_metrics_selection = [ - safunc.sum(1).label("sample_count"), - safunc.sum(total_weight).label("units_count"), - ] - elif params.collapse_ts: - raise ValueError("collapse_ts is true, but there is timestamp column in group_by.") - else: - grouping_metrics_selection = [ - safunc.sum(1).label("sample_count"), - safunc.sum(total_weight).label("units_count"), - ] - - if (colname := self._bsq.timestamp_column_name) in group_by and params.timestamp_grouping_func: - # sample_count = count(distinct(building_id)) - # units_count = count(distinct(buuilding_id)) * sum(total_weight) / sum(1) - grouping_metrics_selection = [ - safunc.count(safunc.distinct(self._bsq.ts_bldgid_column)).label("sample_count"), - ( - safunc.count(safunc.distinct(self._bsq.ts_bldgid_column)) * safunc.sum(total_weight) / safunc.sum(1) - ).label("units_count"), - (safunc.sum(1) / safunc.count(safunc.distinct(self._bsq.ts_bldgid_column))).label("rows_per_sample"), - ] - indx = group_by.index(colname) - sim_info = self._bsq._get_simulation_info() - if sim_info.offset > 0: - # If timestamps are not period beginning we should make them so for timestamp_grouping_func aggregation. 
- new_col = sa.func.date_trunc( - params.timestamp_grouping_func, - sa.func.date_add(sim_info.unit, -sim_info.offset, self._bsq.timestamp_column), - ).label(colname) - else: - new_col = sa.func.date_trunc(params.timestamp_grouping_func, self._bsq.timestamp_column).label(colname) - group_by[indx] = new_col + .subquery("bs_per_bldg") + ) - group_by_selection = self._bsq._process_groupby_cols(group_by, annual_only=False) + # Outer JOIN: ts_aggr ⋈ bs_per_bldg on the ts unique keys. + # bs_per_bldg has one row per (bldg, state, ). + # When the user groups by a tract-derived dimension (e.g. county), a + # building straddling counties produces N rows here — each with its + # proportional bldg_weight slice — so the outer SUM correctly + # disaggregates the building across the user's groups. No tract + # fan-out: the inner GROUP BY collapsed equal-county rows already. + bs_join_cond = sa.and_( + *(bs_per_bldg.c[k] == ts_aggr_subq.c[k] for k in ts_unique_keys), + ) + # applied_only=True for the upgrade_only path is enforced upstream by + # _query, which appends a `_build_applied_subquery(all_of=[upgrade_id])` + # filter to bs_restrict (routed into ts_restrict at ts_flat WHERE). + + tbljoin = ts_aggr_subq.join(bs_per_bldg, bs_join_cond) + + # SideView adapter: indexes columns by enduse name across BOTH + # ts_aggr (ts-side and mixed enduses, prefixed `bs__` / `up__`) and + # bs_per_bldg (pure-bs enduses, projected by their original name). + # This way `get_col(bs_tbl, e)` resolves uniformly regardless of + # which side the enduse came from. 
+ class _SideView: + """Adapter exposing aggregate-subquery columns indexed by enduse name.""" + def __init__(self, ts_subq, prefix, ts_enduses, group_cols, bs_subq, bs_enduses): + self._cols_by_name = {} + for e in ts_enduses: + self._cols_by_name[e.name] = ts_subq.c[f"{prefix}__{e.name}"] + for e in bs_enduses: + if e.name in bs_subq.c: + self._cols_by_name[e.name] = bs_subq.c[e.name] + for c in group_cols: + if c.name not in self._cols_by_name: + self._cols_by_name[c.name] = ts_subq.c[c.name] + if "_inner_rows" in ts_subq.c: + self._cols_by_name["_inner_rows"] = ts_subq.c["_inner_rows"] + + @property + def c(self): + return self._cols_by_name + + passthrough_cols = flat_group_keys + flat_extra_group_cols + ts_b = _SideView(ts_aggr_subq, "bs", flat_enduses, passthrough_cols, bs_per_bldg, bs_only_enduses) + # Pure-bs enduses are upgrade-invariant (sqft is sqft regardless of + # upgrade), so up-side resolves to the same bs_per_bldg column. + ts_u = ts_b if single_upgrade else _SideView( + ts_aggr_subq, "up", flat_enduses, passthrough_cols, bs_per_bldg, bs_only_enduses, + ) - query = sa.select(*(group_by_selection + grouping_metrics_selection + enduse_selection)) - query = query.join(self._bsq.bs_table, self._bsq.bs_bldgid_column == self._bsq.ts_bldgid_column) - if params.join_list: - query = self._bsq._add_join(query, params.join_list) + # Remap user's group_by: + # ts-side group_bys → ts_aggr_subq column + # bs-side group_bys → bs_per_bldg column (passed through above) + remapped_group_by = [] + for g in group_by: + if g.name in ts.columns: + remapped_group_by.append(ts_aggr_subq.c[g.name]) + elif g.name in bs_per_bldg.c: + remapped_group_by.append(bs_per_bldg.c[g.name]) + else: + remapped_group_by.append(g) - group_by_names = [g.name for g in group_by_selection] - upgrade_in_restrict = any(entry[0] == "upgrade" for entry in params.restrict) - if self._bsq.up_table is not None and not upgrade_in_restrict and "upgrade" not in group_by_names: - 
logger.info(f"Restricting query to Upgrade {upgrade_id}.") - params.restrict = list(params.restrict) + [(self._bsq._ts_upgrade_col, [upgrade_id])] + return ts_b, ts_u, tbljoin, remapped_group_by, bs_per_bldg - query = self._bsq._add_restrict(query, params.restrict) - query = self._bsq._add_avoid(query, params.avoid) - query = self._bsq._add_group_by(query, group_by_selection) - query = self._bsq._add_order_by(query, group_by_selection if params.sort else []) - query = query.limit(params.limit) if params.limit else query - - if params.get_query_only: - return self._bsq._compile(query) + @validate_arguments + def __get_annual_bs_up_table(self, upgrade_id: str, applied_only: bool | None): + # `self._bsq.bs_table` / `.md_table` / `.md_key` may be routed to + # the alt metadata table by the `_routing_context` swap inside + # `_query`. Reading from `self._bsq.*` thus inherits routing + # automatically — no explicit threading needed here. + bs = self._bsq.bs_table + if upgrade_id == "0": + # Baseline-only path: no join. The caller filters to baseline rows + # via `_md_baseline_successful_condition` in the outer WHERE. 
+ return bs, bs, bs + + up = self._bsq.md_table.alias("up") + up_col = up.c["upgrade"] + up_id = typed_literal(up_col, upgrade_id) + join_cond = sa.and_( + self._bsq._baseline_upgrade_join_condition(bs, up), + up_col == up_id, + self._bsq._get_success_condition(up), + ) + if applied_only: + tbljoin = bs.join(up, join_cond) + else: + tbljoin = bs.outerjoin(up, join_cond) - return self._bsq.execute(query) + return bs, up, tbljoin @validate_arguments def get_building_average_kws_at( @@ -357,6 +450,8 @@ def get_building_average_kws_at( at_hour: Union[list[float], float], at_days: list[float], enduses: list[str], + upgrade_id: Union[int, str] = "0", + restrict: Sequence[RestrictTuple] = Field(default_factory=list), get_query_only: bool = False, ): """ @@ -378,6 +473,14 @@ def get_building_average_kws_at( enduses: The list of enduses for which to calculate the average kWs + upgrade_id: Which upgrade scenario to compute against. Defaults to "0" (baseline). The TS-side join + constrains `ts.upgrade = upgrade_id` so the join doesn't cross-product across all upgrades + present in the TS table — without this filter, the scan multiplies by the number of + upgrades, which on OEDI is 3 TB+ per call. + + restrict: Optional WHERE clauses (e.g. `[("state", ["CO"])]`) to narrow the scan. Strongly recommended + on partitioned TS tables — without a state restrict, the join scans every state's partition. + get_query_only: Skips submitting the query to Athena and just returns the query strings. Useful for batch submitting multiple queries or debugging. 
@@ -410,7 +513,7 @@ def get_building_average_kws_at( for enduse in enduse_cols ] grouping_metrics_selection = [ - safunc.sum(1).label("sample_count"), + safunc.sum(1).label("metadata_rows_count"), safunc.sum(total_weight).label("units_count"), ] @@ -445,10 +548,37 @@ def get_lower_timestamps(day, hour): lower_timestamps = [get_lower_timestamps(d - 1, h) for d, h in zip(at_days, at_hour)] upper_timestamps = [get_upper_timestamps(d - 1, h) for d, h in zip(at_days, at_hour)] - query = sa.select(*[self._bsq.ts_bldgid_column] + grouping_metrics_selection + enduse_selection) - query = query.join(self._bsq.bs_table, self._bsq.bs_bldgid_column == self._bsq.ts_bldgid_column) - query = self._bsq._add_group_by(query, [self._bsq.ts_bldgid_column]) - query = self._bsq._add_order_by(query, [self._bsq.ts_bldgid_column]) + ts_key_cols = self._bsq.ts_key_cols + ts = self._bsq.ts_table + if ts is None: + raise ValueError("No timeseries table found in database.") + ucol = self._bsq._ts_upgrade_col + + # Constrain the TS-side upgrade in the join condition. Without this, the + # join cross-products against every upgrade present in the TS table — + # the bs subquery's `WHERE upgrade = ...` doesn't filter the TS scan. + # Also push any user-supplied restrict into the bs/ts split so partition + # filters (e.g. state='CO') ride the JOIN ON instead of the outer WHERE. 
+ upgrade_str = "0" if upgrade_id in (None, "0") else str(upgrade_id) + bs_restrict_split, ts_restrict_split, extra_restrict_split = self._bsq._split_restrict(list(restrict)) + bs_restrict_clauses = self._bsq._get_restrict_clauses(bs_restrict_split, annual_only=True) + ts_restrict_clauses = self._bsq._get_restrict_clauses(ts_restrict_split, annual_only=False) + + bs = self._bsq.bs_table # canonical alias + query = sa.select(*ts_key_cols + grouping_metrics_selection + enduse_selection) + query = query.join( + bs, + sa.and_( + self._bsq._baseline_timeseries_join_condition(bs, ts), + ucol == typed_literal(ucol, upgrade_str), + *bs_restrict_clauses, + *ts_restrict_clauses, + ), + ) + query = self._bsq._add_group_by(query, ts_key_cols) + query = self._bsq._add_order_by(query, ts_key_cols) + if extra_restrict_split: + query = self._bsq._add_restrict(query, extra_restrict_split, annual_only=False) lower_val_query = self._bsq._add_restrict(query, [(self._bsq.timestamp_column_name, lower_timestamps)]) upper_val_query = self._bsq._add_restrict(query, [(self._bsq.timestamp_column_name, upper_timestamps)]) @@ -477,8 +607,15 @@ def get_lower_timestamps(day, hour): ] ) avg_lower_weight = 1 - avg_upper_weight - # modify the lower vals to make it weighted average of upper and lower vals - lower_vals[enduses] = lower_vals[enduses] * avg_lower_weight + upper_vals[enduses] * avg_upper_weight + # The result columns use the simple-label form (stripped of `out.` + # prefix), not the raw enduse strings the user passed. Translate + # before indexing so the weighted-average update lands on the right + # columns. 
+ enduse_label_cols = [self._bsq._simple_label(e) for e in enduses] + lower_vals[enduse_label_cols] = ( + lower_vals[enduse_label_cols] * avg_lower_weight + + upper_vals[enduse_label_cols] * avg_upper_weight + ) return lower_vals def validate_partition_by(self, partition_by: Sequence[str]) -> Sequence[str]: @@ -501,46 +638,242 @@ def _query( annual_only=params.annual_only, upgrade_id=upgrade_id, ) - bs_restrict = self._bsq._add_applied_in_restrict( - params.restrict, - applied_in=params.applied_in, - annual_only=params.annual_only, - ) + # Route to the smaller alt metadata table when the query is + # eligible. When ineligible — or when the schema declares no + # alt — fall through to today's primary-table behavior. See + # INVESTIGATION_partition_overhead.md for the ~2× engine-time + # win this typically delivers (Fix #2 in the priority list). + # + # Currently restricted to annual queries: the TS-flow joins MD on + # `(bldg_id, state)` where `state` is the bare partition column on + # both TS and primary MD. The alt MD's state column is `in.state` + # (different physical name), so the existing equi-join helpers + # can't bridge it without a cross-name mapping. Until that's + # added, TS-flow stays on the primary table. + if params.annual_only: + # Strip placeholder time tokens — `time` and the timestamp + # column name are reinjected later as bucketing expressions, + # not group-by columns the alt table must support. Only + # apply the strip to plain string entries; MappedColumns and + # SA Columns are passed through unchanged (they're never + # time-marker tokens). 
+ time_aliases = ("time", self._bsq.timestamp_column_name) + routing_group_by = [ + g for g in params.group_by + if not (isinstance(g, str) and g in time_aliases) + ] + md_choice = self._bsq._pick_metadata_table(routing_group_by, params.restrict) + else: + md_choice = "primary" + try: + with self._bsq._routing_context(md_choice): + return self._query_inner( + params=params, upgrade_id=upgrade_id, md_choice=md_choice, + ) + except Exception: + raise + + def _query_inner( + self, + *, + params: Query, + upgrade_id: str, + md_choice: str, + ) -> Union[pd.DataFrame, str]: + # On TS paths, `applied_only=True` must filter the surviving md_keys to + # buildings where the upgrade applied — the annual flow does this via the + # md self-join on (bs.bldg_id = up.bldg_id AND up.applicability=true), but + # the TS flow has no such join in the single-upgrade or upgrade-pair shapes. + # Append a `_build_applied_subquery(all_of=[upgrade_id])` filter to the + # restrict list (which enforces `_md_successful_condition` on the upgrade + # rows). Without this filter, inapplicable buildings (which have TS rows + # under inapplicables_have_ts) would silently inflate totals across all + # `applied_only=True` TS queries. 
+ bs_restrict: list[RestrictTuple] = list(params.restrict) if params.restrict else [] + if not params.annual_only and params.applied_only and upgrade_id != "0": + use_ts_side = self._bsq.ts_table is not None + key_kind: Literal["metadata", "timeseries"] = "timeseries" if use_ts_side else "metadata" + applied_select = self._bsq._build_applied_subquery( + all_of=[upgrade_id], any_of=None, key_kind=key_kind + ) + assert applied_select is not None # all_of=[upgrade_id] is non-empty + bs_restrict.append( + self._bsq._make_applied_filter_tuple(applied_select, key_kind=key_kind) + ) enduse_cols = self._bsq._get_enduse_cols( params.enduses, table="baseline" if params.annual_only else "timeseries" ) partition_by = self.validate_partition_by(params.partition_by) total_weight = self._bsq._get_weight(params.weights) agg_func, agg_weight = self._bsq._get_agg_func_and_weight(params.weights, params.agg_func) - time_indx = 0 - if "time" in params.group_by: # time will be added as necessary later - time_indx = params.group_by.index("time") - params.group_by = [g for g in params.group_by if g != "time"] + # The library accepts both the canonical alias `"time"` and the schema's + # actual timestamp column name (e.g. `"timestamp"` on OEDI) as a marker + # for "insert the time column at this position". Strip whichever one the + # user passed; the time-column expression is re-inserted later, so + # leaving it in `group_by` would project the column twice (Athena + # rejects with DUPLICATE_COLUMN_NAME). + # + # Default placement: AFTER the user's group_by columns (typically + # state/county). Trino hashes by leftmost GROUP BY columns when + # shuffling; leading with the partition column keeps the outer + # aggregate aligned with the parquet's existing layout instead of + # forcing a re-shuffle by timestamp. If the user explicitly + # positions `"time"` in their group_by list, their position wins. 
+ time_indx = len(params.group_by) + time_aliases = {"time", self._bsq.timestamp_column_name} + for alias in time_aliases: + if alias in params.group_by: + time_indx = params.group_by.index(alias) + params.group_by = [g for g in params.group_by if g not in time_aliases] + break group_by_selection = self._bsq._process_groupby_cols(params.group_by, annual_only=params.annual_only) + pivot_bucketed_time = False if params.annual_only: bs_tbl, up_tbl, tbljoin = self.__get_annual_bs_up_table(upgrade_id, params.applied_only) + md_alias = bs_tbl # annual: bs_tbl IS the metadata-side handle + extra_restrict: list = [] + extra_avoid: list = [] else: - bs_restrict, ts_restrict = self._bsq._split_restrict(bs_restrict) - bs_tbl, up_tbl, tbljoin, group_by_selection = self.__get_timeseries_bs_up_table( - enduse_cols, upgrade_id, params.applied_only, ts_restrict, group_by_selection + bs_restrict, ts_restrict, extra_restrict = self._bsq._split_restrict(bs_restrict) + # Split avoid the same way: bs-side avoid clauses (e.g. NOT IN + # applied-buildings subquery on bldg_id) must be folded into + # bs_per_bldg's WHERE because the outer FROM (ts_aggr ⋈ bs_per_bldg) + # has no bs_table — _add_avoid at the outer level would comma-join + # bs against the FROM and silently drop the predicate. + bs_avoid, ts_avoid, extra_avoid = self._bsq._split_restrict( + list(params.avoid) if params.avoid else [] + ) + # When the caller wants only upgrade values (no savings, no baseline column), + # skip the pivot subquery. For `applied_only=True` the only-upgrade-rows + # behavior is the definition. For `applied_only=False`, the pivot's bs side + # exists solely for the COALESCE fallback when a building is missing an + # upgrade row — but `inapplicables_have_ts=True` (forced for this codebase) + # guarantees every building has a TS row for every upgrade, so the fallback + # never fires. 
Taking the single-scan path halves the TS scan and skips the + # CASE/GROUP-BY pivot, restoring the pre-pivot timing for this shape. + upgrade_only = ( + upgrade_id != "0" + and not params.include_savings + and not params.include_baseline + ) + # Folding (below) puts join_list joins INSIDE bs_per_bldg, so + # we don't need to expose the bs-side join columns at the outer + # level via arbitrary(). Leaving as empty. + join_bs_cols = [] + # Utility queries with join_list bring in additional metadata- + # side tables (e.g. eiaid_weights mapping bldg→eiaid). Fold + # those joins INTO bs_per_bldg so the outer query stays a + # clean ts_aggr ⋈ bs_per_bldg shape. Detect: + # - join_list_restrict: extra_restrict clauses targeting any + # of the join_list tables (e.g. eiaid_weights.eiaid IN [...]). + # - extra_restrict_remaining: restricts that don't target any + # join_list table (e.g. utility join_list table extras with + # no relevant clauses) — kept at outer. + # join_list entries: jl[0] can be a string table name or an SA + # Table object. Build a set of name-strings for matching. 
+ jl_name_set = set() + for jl in (params.join_list or ()): + t0 = jl[0] + jl_name_set.add(t0 if isinstance(t0, str) else getattr(t0, "name", None)) + join_list_restrict, extra_restrict = [], extra_restrict + if params.join_list and extra_restrict: + kept = [] + for col_ref, vals in extra_restrict: + targets_jl = False + if isinstance(col_ref, sa.Column): + t = getattr(col_ref, "table", None) + targets_jl = t is not None and getattr(t, "name", None) in jl_name_set + if targets_jl: + join_list_restrict.append([col_ref, vals]) + else: + kept.append([col_ref, vals]) + extra_restrict = kept + bs_tbl, up_tbl, tbljoin, group_by_selection, md_alias = self.__get_timeseries_bs_up_table( + enduse_cols, upgrade_id, params.applied_only, ts_restrict, + avoid=ts_avoid, + bs_restrict=bs_restrict, bs_avoid=bs_avoid, + group_by=group_by_selection, + upgrade_only=upgrade_only, + timestamp_grouping_func=params.timestamp_grouping_func, + total_weight=total_weight, + extra_bs_cols=join_bs_cols, + join_list=params.join_list, + join_list_restrict=join_list_restrict, ) + # md_alias is now the bs_per_bldg subquery (per-(bldg, state) row + # with sum(weight) AS bldg_weight, count(*) AS tract_count, and + # SUM(*weight) AS _w__ for any pure-bs enduses). The + # outer SELECT below references these pre-summed columns instead + # of bs.weight / count_distinct(md_keys) / etc. directly. This + # eliminates ComStock's tract fan-out at the post-join shuffle. + # + # The outer per-row weight becomes md_alias.c["bldg_weight"] — + # already includes sample_wt × user_weights from total_weight, + # pre-summed at building grain. + # + # When skip_bs_per_bldg fired (e.g. utility join_list queries), + # md_alias IS just the canonical bs alias — no `bldg_weight` + # column. Outer SELECT keeps using bs.weight × user_weights + # directly (the pre-refactor shape). 
+ uses_bs_per_bldg = "bldg_weight" in getattr(md_alias, "c", {}) + if uses_bs_per_bldg: + ts_total_weight = md_alias.c["bldg_weight"] + total_weight = ts_total_weight + if agg_weight is not None: + agg_weight = ts_total_weight + # Inner ts_aggr always pre-buckets time when grouping_func is + # set — true for both single-upgrade (upgrade_id=="0" or + # upgrade_only) and upgrade-pair branches. The outer SELECT + # references `ts_aggr.` directly (already bucketed) + # and uses `_inner_rows` instead of raw sum(1) for the + # rows_per_sample / units_count denominator. + inner_bucketed_time = params.timestamp_grouping_func is not None + # Legacy alias kept for the rest of _query() which references + # the prior name. TODO: rename once the dust settles. + pivot_bucketed_time = inner_bucketed_time def get_col(tbl, col): # column could be MappedColumn not available in tbl return tbl.c[col.name] if col.name in tbl.c else col + def rebind_to(col, target_tbl): + """Bind an enduse expression to `target_tbl`. + + For bare columns (Column / SACol): if the column name exists on + `target_tbl`, return that column; otherwise return the original. + For Labels (from get_calculated_column): the underlying expression + references columns on whichever table get_calculated_column was + given (typically bs_tbl). Use SA's ClauseAdapter to rewrite each + column reference to its counterpart on `target_tbl`, then re-label. + Falls through unchanged for `_SideView` (pivot subquery columns + already carry per-side prefix, so target_tbl's `.c[name]` lookup + already returns the correct pivot column — no traversal needed). + """ + if isinstance(col, SALabel): + # _SideView adapters expose calc-col labels directly (already + # per-side); plain Aliases / subqueries don't, so we adapt the + # underlying expression's column refs to point at target_tbl. 
+ if col.name in getattr(target_tbl, "c", {}): + return target_tbl.c[col.name] + from sqlalchemy.sql.util import ClauseAdapter + # adapt_on_names=True needed because bs_tbl and up_tbl are + # both aliases of the same md_table; SA's default + # corresponding_column resolution doesn't bridge cross-alias + # references (it stops at the alias boundary). + adapted = ClauseAdapter(target_tbl, adapt_on_names=True).traverse(col.element) + return adapted.label(col.name) + return get_col(target_tbl, col) + query_cols = [] for col in enduse_cols: if params.annual_only: baseline_col = get_col(bs_tbl, col) if upgrade_id != "0": - # "and not params.include_savings" is added to restore the behavior of savings_shape query. - # Can be removed once savings_shape is removed. - if params.applied_only and not params.include_savings: - upgrade_col = get_col(up_tbl, col) + if params.applied_only: + upgrade_col = rebind_to(col, up_tbl) else: upgrade_col = sa.case( - (self._bsq._get_success_condition(up_tbl), get_col(up_tbl, col)), else_=baseline_col + (self._bsq._get_success_condition(up_tbl), rebind_to(col, up_tbl)), else_=baseline_col ) else: upgrade_col = baseline_col @@ -548,15 +881,17 @@ def get_col(tbl, col): # column could be MappedColumn not available in tbl else: baseline_col = get_col(bs_tbl, col) if upgrade_id != "0": - # "and not params.include_savings" is added to restore the behavior of savings_shape query. - # Can be removed once savings_shape is removed. - if params.applied_only and not params.include_savings: - upgrade_col = get_col(up_tbl, col) + if params.applied_only or bs_tbl is up_tbl: + # Single-scan path (applied_only=True OR the + # upgrade_only short-circuit path which returns + # bs_tbl == up_tbl == ts): the upgrade col is just + # the ts row's value, no COALESCE fallback needed. 
+ upgrade_col = rebind_to(col, up_tbl) else: - upgrade_col = sa.case( - (up_tbl.c[self._bsq.building_id_column_name] == None, baseline_col), # noqa: E711 - else_=get_col(up_tbl, col), - ) + # Pivot path: per-(bldg_id, timestamp) row, up_ + # is NULL when the upgrade didn't produce a row for + # this bldg. Fall back to baseline via COALESCE. + upgrade_col = safunc.coalesce(rebind_to(col, up_tbl), baseline_col) else: upgrade_col = baseline_col savings_col = safunc.coalesce(baseline_col, 0) - safunc.coalesce(upgrade_col, 0) @@ -629,81 +964,186 @@ def get_col(tbl, col): # column could be MappedColumn not available in tbl ) ) + # Helper: model_count = count of distinct simulation models (distinct + # bldg_id) contributing to the outer group. Equals metadata_rows_count + # when the underlying metadata is one-row-per-bldg (ResStock; the + # state_agg-routed ComStock alt at queries that can't trip cross-state + # duplication of bldgs). Smaller than metadata_rows_count otherwise — + # most commonly when ComStock's primary table contributes multiple + # tract/state slices per bldg into the same outer group. + # + # Optimization for the bs_per_bldg-based TS flow: when each outer + # group sees each bldg at most once post-join, `count(*)` equals + # `count(distinct bldg_id)` and is ~22% cheaper at scale (Athena + # does not auto-rewrite count-distinct to count-star even when the + # cardinality is provably equivalent). Safety check: count(*) is OK + # iff every ts_unique_keys partition column other than bldg_id is + # in the outer group_by. For ts_unique_keys=[bldg_id] (ResStock) + # that's vacuously true. For ts_unique_keys=[bldg_id, state] + # (ComStock) we need state in the outer group. + # + # The optimization only applies when bs_per_bldg is in use (TS flow + # with the per-bldg pre-aggregation): bs_per_bldg has at most one + # row per ts_unique_key tuple, so post-join row identity matches + # `(ts_aggr_keys, bs_per_bldg_extra_group_cols)`. 
On the annual + # path or the no-bs_per_bldg TS path, post-join rows can include + # multiple tract rows per bldg per outer group, so count(*) would + # over-count distinct bldgs. + bldg_id_col = self._bsq.building_id_column_name + outer_group_names = { + getattr(g, "name", g) for g in (group_by_selection or ()) + } + ts_partition_keys_minus_bldg = [ + k for k in self._bsq._get_unique_keys("timeseries") + if k != bldg_id_col + ] + bs_per_bldg_in_use = ( + not params.annual_only + and "bldg_weight" in getattr(md_alias, "c", {}) + ) + model_count_via_count_star = ( + bs_per_bldg_in_use + and all(k in outer_group_names for k in ts_partition_keys_minus_bldg) + ) + + def _model_count_from(alias): + if model_count_via_count_star: + # count(*) is exact under the safety predicate above; the + # framework doesn't auto-rewrite count(distinct) -> count(*) + # so we emit it explicitly to skip the distinct-aggregation + # work on the post-join row stream. + return safunc.count(sa.text("*")).label("model_count") + return self._bsq._count_distinct([alias.c[bldg_id_col]]).label("model_count") + if params.annual_only: # Use annual tables grouping_metrics_selection = [ - safunc.sum(1).label("sample_count"), - # '1 *' included in savings query to match existing behavior for testing. - # Can be removed after saving_shape is removed. 
- safunc.sum(1 * total_weight if params.include_savings else total_weight).label("units_count"), + safunc.sum(1).label("metadata_rows_count"), + _model_count_from(bs_tbl), + safunc.sum(total_weight).label("units_count"), ] elif params.timestamp_grouping_func == "year": # Use timeseries tables but collapse timeseries - rows_per_building = self._bsq._get_rows_per_building() - grouping_metrics_selection = [ - (safunc.sum(1) / rows_per_building).label("sample_count"), - safunc.sum(total_weight / rows_per_building).label("units_count"), - ] + uses_bs_per_bldg = "bldg_weight" in getattr(md_alias, "c", {}) + if uses_bs_per_bldg: + # bs_per_bldg shape: pre-summed columns at building grain. + grouping_metrics_selection = [ + safunc.sum(md_alias.c["tract_count"]).label("metadata_rows_count"), + _model_count_from(md_alias), + safunc.sum(md_alias.c["bldg_weight"]).label("units_count"), + (safunc.sum(bs_tbl.c["_inner_rows"]) / self._bsq._count_distinct( + [bs_tbl.c[k] for k in self._bsq._get_unique_keys("timeseries")] + )).label("rows_per_sample"), + ] + else: + # Direct ts ⋈ bs join shape (e.g. utility join_list queries). 
+ md_key_cols = [md_alias.c[k] for k in self._bsq.md_key] + distinct_md_keys = self._bsq._count_distinct(md_key_cols) + grouping_metrics_selection = [ + distinct_md_keys.label("metadata_rows_count"), + _model_count_from(md_alias), + (distinct_md_keys * safunc.sum(total_weight) / safunc.sum(1)).label("units_count"), + (safunc.sum(1) / distinct_md_keys).label("rows_per_sample"), + ] elif params.timestamp_grouping_func: colname = self._bsq.timestamp_column_name - bldg_id_col = bs_tbl.c[self._bsq.building_id_column_name] - grouping_metrics_selection = [ - safunc.count(sa.func.distinct(bldg_id_col)).label("sample_count"), - ( - safunc.count(sa.func.distinct(bldg_id_col)) - * safunc.sum(total_weight) - / safunc.sum(1) - ).label("units_count"), - (safunc.sum(1) / safunc.count(sa.func.distinct(bldg_id_col))).label("rows_per_sample"), - ] - sim_info = self._bsq._get_simulation_info() - time_col = bs_tbl.c[self._bsq.timestamp_column_name] - if sim_info.offset > 0: - # If timestamps are not period beginning we should make them so for timestamp_grouping_func aggregation. - new_col = sa.func.date_trunc( - params.timestamp_grouping_func, sa.func.date_add(sim_info.unit, -sim_info.offset, time_col) - ).label(colname) + uses_bs_per_bldg = "bldg_weight" in getattr(md_alias, "c", {}) + if uses_bs_per_bldg: + grouping_metrics_selection = [ + safunc.sum(md_alias.c["tract_count"]).label("metadata_rows_count"), + _model_count_from(md_alias), + safunc.sum(md_alias.c["bldg_weight"]).label("units_count"), + (safunc.sum(bs_tbl.c["_inner_rows"]) / self._bsq._count_distinct( + [bs_tbl.c[k] for k in self._bsq._get_unique_keys("timeseries")] + )).label("rows_per_sample"), + ] else: - new_col = sa.func.date_trunc(params.timestamp_grouping_func, time_col).label(colname) - - # If include_savings is True, then the order of the columns is different. Do this - # to match the behavior of savings_shape query. Can be simplified after savings_shape function is removed. 
- if params.include_savings: - group_by_selection.append(new_col) + md_key_cols = [md_alias.c[k] for k in self._bsq.md_key] + distinct_md_keys = self._bsq._count_distinct(md_key_cols) + grouping_metrics_selection = [ + distinct_md_keys.label("metadata_rows_count"), + _model_count_from(md_alias), + (distinct_md_keys * safunc.sum(total_weight) / safunc.sum(1)).label("units_count"), + (safunc.sum(1) / distinct_md_keys).label("rows_per_sample"), + ] + time_col = bs_tbl.c[self._bsq.timestamp_column_name] + if pivot_bucketed_time: + # Pivot subquery already date_trunc'd the time at the inner + # GROUP BY (the perf-critical optimization). The outer SELECT + # just references the bucketed column directly. + new_col = time_col.label(colname) else: - group_by_selection.insert(time_indx, new_col) + sim_info = self._bsq._get_simulation_info() + if sim_info.offset > 0: + # If timestamps are not period beginning we should make them so + # for timestamp_grouping_func aggregation. + new_col = sa.func.date_trunc( + params.timestamp_grouping_func, sa.func.date_add(sim_info.unit, -sim_info.offset, time_col) + ).label(colname) + else: + new_col = sa.func.date_trunc(params.timestamp_grouping_func, time_col).label(colname) + group_by_selection.insert(time_indx, new_col) else: + # Raw 15-min TS output (no timestamp_grouping_func). The outer + # SELECT references the ts_aggr (or pivot subquery) timestamp + # column directly. units_count uses bs_per_bldg's pre-summed + # weight; metadata_rows_count counts tract rows via tract_count. time_col = bs_tbl.c[self._bsq.timestamp_column_name].label(self._bsq.timestamp_column_name) - grouping_metrics_selection = [ - safunc.sum(1).label("sample_count"), - # '1 *' included in savings query to match existing behavior for testing. - # Can be removed after saving_shape is removed. 
- safunc.sum(1 * total_weight if params.include_savings else total_weight).label("units_count"), - ] - if params.include_savings: - group_by_selection.append(time_col) + uses_bs_per_bldg = "bldg_weight" in getattr(md_alias, "c", {}) + if uses_bs_per_bldg: + grouping_metrics_selection = [ + safunc.sum(md_alias.c["tract_count"]).label("metadata_rows_count"), + _model_count_from(md_alias), + safunc.sum(md_alias.c["bldg_weight"]).label("units_count"), + ] else: - group_by_selection.insert(time_indx, time_col) - - # If include_savings is True, then the order of the columns is different. Do this - # to match the behavior of savings_shape query. Can be simplified after savings_shape function is removed. - if params.include_savings: - if params.annual_only or params.timestamp_grouping_func == "year": - query_cols = grouping_metrics_selection + query_cols + list(group_by_selection) - else: # time is the first column in this case and needs to be moved to the front to match - # the behavior of savings_shape query - query_cols = [ - group_by_selection[-1], - *grouping_metrics_selection, - *query_cols, - *group_by_selection[:-1], + grouping_metrics_selection = [ + safunc.sum(1).label("metadata_rows_count"), + _model_count_from(bs_tbl), + safunc.sum(total_weight).label("units_count"), ] - else: - query_cols = list(group_by_selection) + grouping_metrics_selection + query_cols + group_by_selection.insert(time_indx, time_col) + + query_cols = list(group_by_selection) + grouping_metrics_selection + query_cols query = sa.select(*query_cols).select_from(tbljoin) - query = self._bsq._add_join(query, params.join_list) + # For TS queries with bs_per_bldg, join_list joins are folded INTO + # bs_per_bldg's FROM (not added at outer). For annual queries, _add_join + # at outer is the only place. Pass an empty list to skip outer + # _add_join when folding happened. 
+ if not params.annual_only and "bldg_weight" in getattr(md_alias, "c", {}): + outer_join_list = [] # already folded into bs_per_bldg + else: + outer_join_list = params.join_list + query = self._bsq._add_join(query, outer_join_list, bs_alias=md_alias) if params.annual_only: - query = query.where(self._bsq._bs_successful_condition) - query = self._bsq._add_restrict(query, bs_restrict) - query = self._bsq._add_avoid(query, params.avoid, annual_only=params.annual_only) + # Successful baseline rows on the bs alias that's in the FROM. For + # upgrade-pair queries the join's ON already enforces bs.upgrade=0 + # — Trino dedupes the duplicate predicate at planning time. + query = query.where( + sa.and_( + self._bsq._get_success_condition(bs_tbl), + self._bsq._upgrade_zero_filter(bs_tbl), + ) + ) + # Annual queries have no inner join helper to fold bs_restrict into, + # so the outer WHERE is the only place to apply it. `_get_column` + # inside resolves against the routed `self.bs_table`, so a + # state='CO' filter resolves to the alt-table column on the + # state_agg path automatically. + query = self._bsq._add_restrict(query, bs_restrict, annual_only=params.annual_only) + # TS queries fold bs_restrict into the inner ts ⋈ bs JOIN ON inside + # __get_timeseries_bs_up_table, so adding it again here would just + # produce duplicate predicates that Trino has to dedupe. + # Restricts on join_list tables (e.g. utility eiaid_weights.eiaid) didn't + # land on bs or ts — they go to the outer WHERE after _add_join has + # introduced their referenced tables. + if extra_restrict: + query = self._bsq._add_restrict(query, extra_restrict, annual_only=params.annual_only) + # On TS, bs_avoid was folded into bs_per_bldg's WHERE and ts_avoid was + # folded into ts_flat's WHERE; only extra_avoid (avoids on join_list + # tables that aren't bs or ts) remains for the outer level. On annual + # the outer FROM has bs_table directly, so all avoid clauses apply + # straightforwardly. 
+ outer_avoid = params.avoid if params.annual_only else extra_avoid + query = self._bsq._add_avoid(query, outer_avoid, annual_only=params.annual_only) query = self._bsq._add_group_by(query, group_by_selection) query = self._bsq._add_order_by(query, group_by_selection if params.sort else []) query = query.limit(params.limit) if params.limit else query diff --git a/buildstock_query/aggregate_query.pyi b/buildstock_query/aggregate_query.pyi index a86b97bc..f3e123f1 100644 --- a/buildstock_query/aggregate_query.pyi +++ b/buildstock_query/aggregate_query.pyi @@ -2,220 +2,12 @@ from typing import Literal from collections.abc import Sequence import pandas as pd import typing -from buildstock_query.schema.query_params import TSQuery, BaseQuery, Query from buildstock_query import main -from buildstock_query.schema.utilities import AnyColType, AnyTableType, RestrictTuple -from pydantic import Field -from typing_extensions import deprecated + class BuildStockAggregate: def __init__(self, buildstock_query: main.BuildStockQuery) -> None: ... - @typing.overload - @deprecated("Please use my_run.query with annual_only=True.") - def aggregate_annual( - self, - *, - enduses: Sequence[AnyColType], - get_query_only: Literal[True], - group_by: Sequence[AnyColType | tuple[str, str]] = [], - sort: bool = False, - upgrade_id: int | str = "0", - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = [], - weights: Sequence[str | tuple] = [], - restrict: Sequence[RestrictTuple] = [], - avoid: Sequence[RestrictTuple] = [], - get_quartiles: bool = False, - get_nonzero_count: bool = False, - agg_func: str | None = "sum", - ) -> str: ... 
- @typing.overload - @deprecated("Please use my_run.query with annual_only=True.") - def aggregate_annual( - self, - *, - enduses: Sequence[AnyColType], - get_query_only: Literal[False] = False, - group_by: Sequence[AnyColType | tuple[str, str]] = [], - sort: bool = False, - upgrade_id: int | str = "0", - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = [], - weights: Sequence[str | tuple] = [], - restrict: Sequence[RestrictTuple] = [], - avoid: Sequence[RestrictTuple] = [], - get_quartiles: bool = False, - get_nonzero_count: bool = False, - agg_func: str | None = "sum", - ) -> pd.DataFrame: ... - @typing.overload - @deprecated("Please use my_run.query with annual_only=True.") - def aggregate_annual( - self, - *, - enduses: Sequence[AnyColType], - get_query_only: bool, - group_by: Sequence[AnyColType | tuple[str, str]] = [], - sort: bool = False, - upgrade_id: int | str = "0", - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = [], - weights: Sequence[str | tuple] = [], - restrict: Sequence[RestrictTuple] = [], - avoid: Sequence[RestrictTuple] = [], - get_quartiles: bool = False, - get_nonzero_count: bool = False, - agg_func: str | None = "sum", - ) -> pd.DataFrame | str: - """ - Aggregates the baseline annual result on select enduses. - Check the argument description below to learn about additional features and options. - Args: - enduses: The list of enduses to aggregate. Defaults to all electricity enduses - group_by: The list of columns to group the aggregation by. - - sort: Whether to sort the results by group_by columns - - upgrade_id: The upgrade to query for. Only valid with runs with upgrade. If not provided, use the baseline - - join_list: Additional table to join to baseline table to perform operation. All the inputs (`enduses`, - `group_by` etc) can use columns from these additional tables. It should be specified as a list of - tuples. 
- Example: `[(new_table_name, baseline_column_name, new_column_name), ...]` - where baseline_column_name and new_column_name are the columns on which the new_table - should be joined to baseline table. - - weights: The additional columns to use as weight. The "build_existing_model.sample_weight" is already used. - It is specified as either list of string or list of tuples. When only string is used, the string - is the column name, when tuple is passed, the second element is the table name. - - restrict: The list of where condition to restrict the results to. It should be specified as a list of tuple. - Example: `[('state',['VA','AZ']), ("build_existing_model.lighting",['60% CFL']), ...]` - avoid: Just like restrict, but the opposite. It will only include rows that do not match (any of) the - conditions. - get_quartiles: If true, return the following quartiles in addition to the sum for each enduses: - [0, 0.02, .25, .5, .75, .98, 1]. The 0% quartile is the minimum and the 100% quartile - is the maximum. - get_nonzero_count: If true, return the number of non-zero rows for each enduses. Useful, for example, for - finding the number of natural gas customers by using natural gas total fuel use as the enduse. - - get_query_only: Skips submitting the query to Athena and just returns the query string. Useful for batch - submitting multiple queries or debugging - - agg_func: The aggregation function to use. Defaults to 'sum'. - See other options in https://prestodb.io/docs/current/functions/aggregate.html - - Returns: - if get_query_only is True, returns the query_string, otherwise returns the dataframe - """ - - @typing.overload - @deprecated("Please use my_run.query with annual_only=True.") - def aggregate_annual(self, *, params: BaseQuery) -> str | pd.DataFrame: ... 
- @typing.overload - @deprecated("Please use my_run.query with annual_only=False.") - def aggregate_timeseries( - self, - *, - enduses: Sequence[AnyColType], - get_query_only: Literal[True], - group_by: Sequence[AnyColType | tuple[str, str]] = [], - upgrade_id: int | str = "0", - sort: bool = False, - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = [], - weights: Sequence[str | tuple] = [], - restrict: Sequence[RestrictTuple] = [], - avoid: Sequence[RestrictTuple] = [], - split_enduses: bool = False, - collapse_ts: bool = False, - timestamp_grouping_func: str | None = None, - limit: int | None = None, - agg_func: str | None = "sum", - ) -> str: ... - @typing.overload - @deprecated("Please use my_run.query with annual_only=False.") - def aggregate_timeseries( - self, - *, - enduses: Sequence[AnyColType], - group_by: Sequence[AnyColType | tuple[str, str]] = [], - upgrade_id: int | str = "0", - sort: bool = False, - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = [], - weights: Sequence[str | tuple] = [], - restrict: Sequence[RestrictTuple] = [], - avoid: Sequence[RestrictTuple] = [], - split_enduses: bool = False, - collapse_ts: bool = False, - timestamp_grouping_func: str | None = None, - get_query_only: Literal[False] = False, - limit: int | None = None, - agg_func: str | None = "sum", - ) -> pd.DataFrame: ... 
- @typing.overload - @deprecated("Please use my_run.query with annual_only=False.") - def aggregate_timeseries( - self, - *, - enduses: Sequence[AnyColType], - get_query_only: bool, - group_by: Sequence[AnyColType | tuple[str, str]] = [], - upgrade_id: int | str = "0", - sort: bool = False, - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = [], - weights: Sequence[str | tuple] = [], - restrict: Sequence[RestrictTuple] = [], - avoid: Sequence[RestrictTuple] = [], - split_enduses: bool = False, - collapse_ts: bool = False, - timestamp_grouping_func: str | None = None, - limit: int | None = None, - agg_func: str | None = "sum", - ) -> str | pd.DataFrame: - """ - Aggregates the timeseries result on select enduses. - Check the argument description below to learn about additional features and options. - Args: - enduses: The list of enduses to aggregate. Defaults to all electricity enduses - - group_by: The list of columns to group the aggregation by. - - upgrade_id: The upgrade to query for. Only valid with runs with upgrade. If not provided, use the baseline - - order_by: The columns by which to sort the result. - - join_list: Additional table to join to baseline table to perform operation. All the inputs (`enduses`, - `group_by` etc) can use columns from these additional tables. It should be specified as a list of - tuples. - Example: `[(new_table_name, baseline_column_name, new_column_name), ...]` - where baseline_column_name and new_column_name are the columns on which the new_table - should be joined to baseline table. - - weights: The additional column to use as weight. The "build_existing_model.sample_weight" is already used. - - restrict: The list of where condition to restrict the results to. It should be specified as a list of tuple. 
- Example: `[('state',['VA','AZ']), ("build_existing_model.lighting",['60% CFL']), ...]` - limit: The maximum number of rows to query - - split_enduses: Whether to query for each enduses in a separate query to reduce Athena load for query. Useful - when Athena runs into "Query exhausted resources ..." errors. - timestamp_grouping_func: One of 'hour', 'day' or 'month' or None. If provided, perform timeseries - aggregation of specified granularity. - get_query_only: Skips submitting the query to Athena and just returns the query string. Useful for batch - submitting multiple queries or debugging - - agg_func: The aggregation function to use. Defaults to 'sum'. - See other options in https://prestodb.io/docs/current/functions/aggregate.html - Returns: - if get_query_only is True, returns the query_string, otherwise, returns the DataFrame - - """ - - @typing.overload - @deprecated("Please use my_run.query with annual_only=False.") - def aggregate_timeseries( - self, - *, - params: TSQuery, - ) -> str | pd.DataFrame: ... @typing.overload def get_building_average_kws_at( self, diff --git a/buildstock_query/db_schema/comstock_default.toml b/buildstock_query/db_schema/comstock_default.toml deleted file mode 100644 index d29a96c8..00000000 --- a/buildstock_query/db_schema/comstock_default.toml +++ /dev/null @@ -1,31 +0,0 @@ -[table_suffix] -baseline = "_baseline" -timeseries = "_timeseries" -upgrades = "_upgrades" - -[column_prefix] -characteristics = "build_existing_model." -output = "simulation_output_report." 
- - -[column_names] -building_id = "building_id" -sample_weight = "" -sqft = "build_existing_model.geometry_floor_area" -timestamp = "time" -completed_status = "completed_status" -unmet_hours_cooling_hr = "" -unmet_hours_heating_hr = "" -fuel_totals = [ - "simulation_output_report.total_site_electricity_kwh", - "simulation_output_report.total_site_energy_mbtu", - "simulation_output_report.total_site_natural_gas_therm", - "simulation_output_report.total_site_other_fuel_mbtu"] - -[completion_values] -success = "Success" -fail = "Fail" -inapplicable = "Invalid" - -[structure] -inapplicables_have_ts = "false" diff --git a/buildstock_query/db_schema/comstock_oedi_agg_state_and_county.toml b/buildstock_query/db_schema/comstock_oedi_agg_state_and_county.toml new file mode 100644 index 00000000..e10daa2d --- /dev/null +++ b/buildstock_query/db_schema/comstock_oedi_agg_state_and_county.toml @@ -0,0 +1,36 @@ +[table_suffix] +annual_and_metadata = "_md_agg_by_state_and_county_parquet" +timeseries = "_ts_by_state" + + +[column_prefix] +characteristics = "in." +output = "out." 
+ +[column_names] +building_id = "bldg_id" +sample_weight = "weight" +sqft = "in.sqft" +timestamp = "timestamp" +completed_status = "applicability" +unmet_hours_cooling_hr = "" +unmet_hours_heating_hr = "" +upgrade_name = "in.upgrade_name" +fuel_totals = ["out.electricity.total.energy_consumption..kwh", + "out.natural_gas.total.energy_consumption..kwh", + "out.other_fuel.total.energy_consumption..kwh", + "out.district_cooling.total.energy_consumption..kwh", + "out.district_heating.total.energy_consumption..kwh" + ] + +[completion_values] +success = "true" +fail = "" +inapplicable = "false" + +[structure] +inapplicables_have_ts = "true" + +[unique_keys] +metadata = ["bldg_id", "county", "state"] +timeseries = ["bldg_id", "state"] diff --git a/buildstock_query/db_schema/comstock_oedi_state_and_county.toml b/buildstock_query/db_schema/comstock_oedi_state_and_county.toml new file mode 100644 index 00000000..11005411 --- /dev/null +++ b/buildstock_query/db_schema/comstock_oedi_state_and_county.toml @@ -0,0 +1,36 @@ +[table_suffix] +annual_and_metadata = "_md_by_state_and_county_parquet" +timeseries = "_ts_by_state" + + +[column_prefix] +characteristics = "in." +output = "out." 
+ +[column_names] +building_id = "bldg_id" +sample_weight = "weight" +sqft = "in.sqft" +timestamp = "timestamp" +completed_status = "applicability" +unmet_hours_cooling_hr = "" +unmet_hours_heating_hr = "" +upgrade_name = "in.upgrade_name" +fuel_totals = ["out.electricity.total.energy_consumption..kwh", + "out.natural_gas.total.energy_consumption..kwh", + "out.other_fuel.total.energy_consumption..kwh", + "out.district_cooling.total.energy_consumption..kwh", + "out.district_heating.total.energy_consumption..kwh" + ] + +[completion_values] +success = "true" +fail = "" +inapplicable = "false" + +[structure] +inapplicables_have_ts = "true" + +[unique_keys] +metadata = ["bldg_id", "in.nhgis_tract_gisjoin", "state"] +timeseries = ["bldg_id", "state"] diff --git a/buildstock_query/db_schema/comstock_oedi_unified.toml b/buildstock_query/db_schema/comstock_oedi_unified.toml new file mode 100644 index 00000000..6aa0e66c --- /dev/null +++ b/buildstock_query/db_schema/comstock_oedi_unified.toml @@ -0,0 +1,37 @@ +[table_suffix] +annual_and_metadata = "_md_by_state_and_county_parquet" +annual_and_metadata_state_agg = "_md_agg_national_parquet" +timeseries = "_ts_by_state" + + +[column_prefix] +characteristics = "in." +output = "out." 
+ +[column_names] +building_id = "bldg_id" +sample_weight = "weight" +sqft = "in.sqft" +timestamp = "timestamp" +completed_status = "applicability" +unmet_hours_cooling_hr = "" +unmet_hours_heating_hr = "" +upgrade_name = "in.upgrade_name" +fuel_totals = ["out.electricity.total.energy_consumption..kwh", + "out.natural_gas.total.energy_consumption..kwh", + "out.other_fuel.total.energy_consumption..kwh", + "out.district_cooling.total.energy_consumption..kwh", + "out.district_heating.total.energy_consumption..kwh" + ] + +[completion_values] +success = "true" +fail = "" +inapplicable = "false" + +[structure] +inapplicables_have_ts = "true" + +[unique_keys] +metadata = ["bldg_id", "in.nhgis_tract_gisjoin", "state"] +timeseries = ["bldg_id", "state"] diff --git a/buildstock_query/db_schema/db_schema_model.py b/buildstock_query/db_schema/db_schema_model.py index 361b3793..0cd78077 100644 --- a/buildstock_query/db_schema/db_schema_model.py +++ b/buildstock_query/db_schema/db_schema_model.py @@ -1,11 +1,31 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field, model_validator from typing import Optional class TableSuffix(BaseModel): - baseline: str + """Suffixes for the underlying physical tables. + + `annual_and_metadata` is one parquet that holds every upgrade's annual + results plus the building characteristics, with `upgrade=0` rows being + the baseline. `timeseries` is the per-timestamp parquet, also covering + every upgrade. Upgrade selection is a WHERE clause on the relevant + table at query time — there is no separate baseline parquet. + + `annual_and_metadata_state_agg` (optional) names a *coarser-grained* + alternative metadata table that the framework can route eligible + queries to. It must contain the same logical content as + `annual_and_metadata` but at state-or-coarser grain (e.g. tract + columns absent, weights pre-summed across tract slices per state). 
+ Routing is automatic: when a query's `group_by` and `restrict` + reference only columns present on the alt table, + `_pick_metadata_table` picks `state_agg` and the query scans the + smaller table. Schemas without this field always use + `annual_and_metadata`. + """ + + annual_and_metadata: str timeseries: str - upgrades: str + annual_and_metadata_state_agg: Optional[str] = None class ColumnPrefix(BaseModel): @@ -22,6 +42,11 @@ class ColumnNames(BaseModel): unmet_hours_cooling_hr: str unmet_hours_heating_hr: str map_eiaid_column: Optional[str] = None # Only for ResStock utility queries + # Column on the upgrade table that carries the human-readable upgrade name. + # Defaults to the classic ResStock/ComStock convention. OEDI ComStock + # overrides this to "in.upgrade_name". OEDI ResStock has no name column; + # `get_upgrade_names` then degrades to NULL upgrade_name values. + upgrade_name: str = "apply_upgrade.upgrade_name" fuel_totals: list[str] @@ -32,8 +57,31 @@ class CompletionValues(BaseModel): class Structure(BaseModel): - # whether the baseline timeseries is copied for unapplicable buildings in an upgrade - inapplicables_have_ts: bool + # Vestigial: the codebase now assumes inapplicables_have_ts=True universally. + # Kept here so existing TOMLs still validate; the value is ignored. + inapplicables_have_ts: bool = True + + +class UniqueKeys(BaseModel): + metadata: Optional[list[str]] = None + timeseries: Optional[list[str]] = None + # Unique-keys for the optional alt metadata table. Defaults to + # [bldg_id] when omitted (one row per (bldg, state) is fine — the + # alt table is *coarser* than primary, so its key is typically a + # subset of primary's). No subset check vs metadata: the alt table + # is allowed to have a narrower key. 
+ metadata_state_agg: Optional[list[str]] = None + + @model_validator(mode="after") + def _timeseries_subset_of_metadata(self) -> "UniqueKeys": + if self.metadata is not None and self.timeseries is not None: + extra = set(self.timeseries) - set(self.metadata) + if extra: + raise ValueError( + "unique_keys.timeseries must be a subset of unique_keys.metadata; " + f"unexpected key(s): {sorted(extra)}" + ) + return self class DBSchema(BaseModel): @@ -41,4 +89,5 @@ class DBSchema(BaseModel): column_prefix: ColumnPrefix column_names: ColumnNames completion_values: CompletionValues - structure: Structure + structure: Structure = Field(default_factory=Structure) + unique_keys: UniqueKeys = Field(default_factory=UniqueKeys) diff --git a/buildstock_query/db_schema/resstock_default.toml b/buildstock_query/db_schema/resstock_default.toml deleted file mode 100644 index bd3214bb..00000000 --- a/buildstock_query/db_schema/resstock_default.toml +++ /dev/null @@ -1,37 +0,0 @@ -[table_suffix] -baseline = "_baseline" -timeseries = "_timeseries" -upgrades = "_upgrades" - -[column_prefix] -characteristics = "build_existing_model." -output = "report_simulation_output." 
- - -[column_names] -building_id = "building_id" -sample_weight = "build_existing_model.sample_weight" -sqft = "build_existing_model.geometry_floor_area" -timestamp = "time" -completed_status = "completed_status" -unmet_hours_cooling_hr = "report_simulation_output.unmet_hours_cooling_hr" -unmet_hours_heating_hr = "report_simulation_output.unmet_hours_heating_hr" -map_eiaid_column = "county" -fuel_totals = [ - 'report_simulation_output.energy_use_total_m_btu', - 'report_simulation_output.fuel_use_coal_total_m_btu', - 'report_simulation_output.fuel_use_electricity_total_m_btu', - 'report_simulation_output.fuel_use_fuel_oil_total_m_btu', - 'report_simulation_output.fuel_use_natural_gas_total_m_btu', - 'report_simulation_output.fuel_use_propane_total_m_btu', - 'report_simulation_output.fuel_use_wood_cord_total_m_btu', - 'report_simulation_output.fuel_use_wood_pellets_total_m_btu' - ] - -[completion_values] -success = "Success" -fail = "Fail" -inapplicable = "Invalid" - -[structure] -inapplicables_have_ts = "false" diff --git a/buildstock_query/db_schema/resstock_oedi.toml b/buildstock_query/db_schema/resstock_oedi.toml index dd330810..0685ea5d 100644 --- a/buildstock_query/db_schema/resstock_oedi.toml +++ b/buildstock_query/db_schema/resstock_oedi.toml @@ -1,7 +1,6 @@ [table_suffix] -baseline = "_metadata" +annual_and_metadata = "_metadata" timeseries = "_by_state" -upgrades = "_metadata" [column_prefix] characteristics = "in." diff --git a/buildstock_query/db_schema/resstock_oedi_new.toml b/buildstock_query/db_schema/resstock_oedi_new.toml new file mode 100644 index 00000000..90a490f3 --- /dev/null +++ b/buildstock_query/db_schema/resstock_oedi_new.toml @@ -0,0 +1,30 @@ +[table_suffix] +annual_and_metadata = "_md_national_parquet" +timeseries = "_ts_by_state" + +[column_prefix] +characteristics = "in." +output = "out." 
+ +[column_names] +building_id = "bldg_id" +sample_weight = "weight" +sqft = "in.sqft..ft2" +timestamp = "timestamp" +completed_status = "applicability" +unmet_hours_cooling_hr = "" +unmet_hours_heating_hr = "" +map_eiaid_column = "gisjoin" +fuel_totals = ["out.electricity.total.energy_consumption..kwh", + "out.natural_gas.total.energy_consumption..kwh", + "out.fuel_oil.total.energy_consumption..kwh", + "out.propane.total.energy_consumption..kwh", + ] + +[completion_values] +success = "true" +fail = "" +inapplicable = "false" + +[structure] +inapplicables_have_ts = "true" diff --git a/buildstock_query/db_schema/comstock_oedi.toml b/buildstock_query/db_schema/resstock_oedi_vu.toml similarity index 58% rename from buildstock_query/db_schema/comstock_oedi.toml rename to buildstock_query/db_schema/resstock_oedi_vu.toml index 1190be6c..260990d5 100644 --- a/buildstock_query/db_schema/comstock_oedi.toml +++ b/buildstock_query/db_schema/resstock_oedi_vu.toml @@ -1,25 +1,24 @@ [table_suffix] -baseline = "_metadata" -timeseries = "_by_state" -upgrades = "_upgrades" +annual_and_metadata = "_metadata" +timeseries = "_by_state_vu" [column_prefix] characteristics = "in." output = "out." 
-[column_names] +[column_names] building_id = "bldg_id" sample_weight = "weight" -sqft = "in.sqft" +sqft = "in.sqft" timestamp = "timestamp" completed_status = "applicability" unmet_hours_cooling_hr = "" unmet_hours_heating_hr = "" +map_eiaid_column = "gisjoin" fuel_totals = ["out.electricity.total.energy_consumption", "out.natural_gas.total.energy_consumption", - "out.other_fuel.total.energy_consumption", - "out.district_cooling.total.energy_consumption", - "out.district_heating.total.energy_consumption" + "out.fuel_oil.total.energy_consumption", + "out.propane.total.energy_consumption", ] [completion_values] @@ -28,4 +27,4 @@ fail = "" inapplicable = "false" [structure] -inapplicables_have_ts = "true" \ No newline at end of file +inapplicables_have_ts = "true" diff --git a/buildstock_query/helpers.py b/buildstock_query/helpers.py index 82e40939..b65cdbaf 100644 --- a/buildstock_query/helpers.py +++ b/buildstock_query/helpers.py @@ -2,13 +2,10 @@ from pyathena.sqlalchemy.base import AthenaDialect from pyathena.pandas.result_set import AthenaPandasResultSet import datetime -import pickle -import os import pandas as pd from pathlib import Path import json from typing import Literal, TYPE_CHECKING -from filelock import FileLock if TYPE_CHECKING: from buildstock_query.schema.utilities import MappedColumn # noqa: F401 @@ -141,22 +138,6 @@ def __init__(self, message, existing_data=None): self.existing_data = existing_data -def save_pickle(path, obj): - lock_path = str(path) + ".lock" - with FileLock(lock_path): - with open(path, "wb") as f: - pickle.dump(obj, f) - - -def load_pickle(path): - if not os.path.exists(path): - raise FileNotFoundError(f"File {path} not found for loading table") - lock_path = str(path) + ".lock" - with FileLock(lock_path): - with open(path, "rb") as f: - return pickle.load(f) - - def read_csv(csv_file_path, **kwargs) -> pd.DataFrame: default_na_values = pd._libs.parsers.STR_NA_VALUES df = pd.read_csv(csv_file_path, 
na_values=list(default_na_values - {"None"}), keep_default_na=False, **kwargs) diff --git a/buildstock_query/main.py b/buildstock_query/main.py index ada5b4f4..9e3e49d6 100644 --- a/buildstock_query/main.py +++ b/buildstock_query/main.py @@ -1,6 +1,5 @@ import sqlalchemy as sa from sqlalchemy.sql import func as safunc -from sqlalchemy.sql import sqltypes from typing import Union from collections.abc import Sequence import logging @@ -15,12 +14,14 @@ from datetime import datetime from buildstock_query.schema.run_params import BSQParams from buildstock_query.schema.utilities import DBColType, SALabel, SACol, AnyColType, AnyTableType, RestrictTuple -from buildstock_query.schema.utilities import validate_arguments +from buildstock_query.schema.utilities import validate_arguments, typed_literal from buildstock_query.schema.utilities import MappedColumn from buildstock_query.schema.query_params import Query -import os +import pathlib +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass +from tqdm.auto import tqdm logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -41,7 +42,7 @@ def __init__( self, workgroup: str, db_name: str, - table_name: Union[str, tuple[str, Optional[str], Optional[str]]], + table_name: Union[str, tuple[str, Optional[str]]], db_schema: Optional[str | dict] = None, buildstock_type: Literal["resstock", "comstock"] = "resstock", sample_weight_override: Optional[Union[int, float]] = None, @@ -58,10 +59,11 @@ def __init__( workgroup (str): The workgroup for athena. The cost will be charged based on workgroup. db_name (str): The athena database name buildstock_type (str, optional): 'resstock' or 'comstock' runs. Defaults to 'resstock' - table_name (str or Union[str, tuple[str, Optional[str], Optional[str]]]): If a single string is provided, - say, 'mfm_run', then it must correspond to tables in athena named mfm_run_baseline and optionally - mfm_run_timeseries and mf_run_upgrades. 
Or, tuple of three elements can be provided for the table names - for baseline, timeseries and upgrade. Timeseries and upgrade can be None if no such table exist. + table_name (str or tuple[str, Optional[str]]): If a single string is provided, say, 'mfm_run', it must + correspond to tables in athena whose names are formed by appending the schema's + `[table_suffix].annual_and_metadata` and `.timeseries` to it. Or, a tuple `(annual_and_metadata_name, + timeseries_name)` can be provided to override that derivation. The timeseries entry may be None when no + timeseries table exists. db_schema (str | dict, optional): The database structure in Athena is different between ResStock and ComStock run. It is also different between the version in OEDI and default version from BuildStockBatch. This argument controls the assumed schema. Allowed values are whatever files exist @@ -99,14 +101,11 @@ def __init__( super(BuildStockQuery, self).__init__(params=self._run_params) from buildstock_query.report_query import BuildStockReport from buildstock_query.aggregate_query import BuildStockAggregate - from buildstock_query.savings_query import BuildStockSavings from buildstock_query.utility_query import BuildStockUtility #: `buildstock_query.report_query.BuildStockReport` object to perform report queries self.report: BuildStockReport = BuildStockReport(self) #: `buildstock_query.aggregate_query.BuildStockAggregate` object to perform aggregate queries self.agg: BuildStockAggregate = BuildStockAggregate(self) - #: `buildstock_query.savings_query.BuildStockSavings` object to perform savings queries - self.savings = BuildStockSavings(self) #: `buildstock_query.utility_query.BuildStockUtility` object to perform utility queries self.utility = BuildStockUtility(self) @@ -118,7 +117,6 @@ def __init__( print(self.report.get_success_report()) if self.ts_table is not None: self.report.check_ts_bs_integrity() - self.save_cache() def get_buildstock_df(self) -> pd.DataFrame: """Returns the 
building characteristics data by querying Athena tables using the same format as that produced @@ -180,16 +178,45 @@ def get_upgrade_names(self, get_query_only: Literal[True]) -> str: ... @validate_arguments def get_upgrade_names(self, get_query_only: bool = False) -> Union[str, dict]: - if self.up_table is None: - raise ValueError("This run has no upgrades") - upgrade_table = self.up_table - query = f""" - Select cast(upgrade as integer) as upgrade, arbitrary("apply_upgrade.upgrade_name") as upgrade_name - from {upgrade_table} - group by 1 order by 1 + """Return a dict of {upgrade_id: upgrade_name} for all upgrades in the run. + + The column carrying the human-readable upgrade name is configured per + schema via `column_names.upgrade_name` in the TOML. Classic schemas + default to `apply_upgrade.upgrade_name`; OEDI ComStock overrides to + `in.upgrade_name`. If the configured column doesn't actually exist on + the upgrade table (e.g. OEDI ResStock, where the names live in run + config rather than the Athena tables), the name field degrades to NULL + for every upgrade — the returned dict still has one entry per upgrade + so downstream iteration keeps working regardless of schema. """ + upgrade_col = self.md_table.c["upgrade"] + upgrade_name_col_name = self.db_schema.column_names.upgrade_name + has_name_col = upgrade_name_col_name in self.md_table.c + if has_name_col: + upgrade_name_col = self.md_table.c[upgrade_name_col_name] + name_select = safunc.arbitrary(upgrade_name_col).label("upgrade_name") + else: + # Schema configures a name column but the upgrade table doesn't + # actually have it (e.g. OEDI ResStock). Project a literal NULL + # labeled `upgrade_name` so the result shape stays the same as + # the classic-schema path. 
+ name_select = sa.cast(sa.null(), sa.String).label("upgrade_name") + query = ( + sa.select( + sa.cast(upgrade_col, sa.Integer).label("upgrade"), + name_select, + ) + .select_from(self.md_table) # explicit FROM matches the column binds + # Exclude baseline rows from the upgrade names listing. The unified + # annual_and_metadata table has upgrade=0 baseline rows that pre-2-table- + # pivot lived on a separate parquet — historical callers expect this + # method to return upgrades only (1+). + .where(upgrade_col != typed_literal(upgrade_col, "0")) + .group_by(sa.literal_column("1")) + .order_by(sa.literal_column("1")) + ) if get_query_only: - return query + return self._compile(query) up_name_dict = self.execute(query).set_index("upgrade").to_dict()["upgrade_name"] return up_name_dict @@ -201,20 +228,19 @@ def _get_rows_per_building(self, get_query_only: Literal[True]) -> str: ... @validate_arguments def _get_rows_per_building(self, get_query_only: bool = False) -> Union[int, str]: - select_cols = [] - if self.up_table is not None and self.ts_table is not None: - select_cols.append(self.ts_table.c["upgrade"]) - select_cols.extend((self.ts_bldgid_column, safunc.count().label("row_count"))) + if self.ts_table is None: + raise ValueError("No timeseries table is available.") + ts_join_keys = self._get_unique_keys("timeseries") + group_cols: list = [self.ts_table.c["upgrade"]] + group_cols.extend(self.ts_table.c[key] for key in ts_join_keys) + select_cols = [*group_cols, safunc.count().label("row_count")] ts_query = sa.select(*select_cols) - if self.up_table is not None: - ts_query = ts_query.group_by(sa.text("1"), sa.text("2")) - else: - ts_query = ts_query.group_by(sa.text("1")) + ts_query = ts_query.group_by(*(sa.text(str(i + 1)) for i in range(len(group_cols)))) if get_query_only: return self._compile(ts_query) df = self.execute(ts_query) - if (df["row_count"] == df["row_count"][0]).all(): # verify all buildings got same number of rows + if (df["row_count"] == 
df["row_count"][0]).all(): return df["row_count"][0] else: raise ValueError("Not all buildings have same number of rows.") @@ -233,9 +259,14 @@ def get_distinct_vals( Returns: pd.Series: The distinct vals. """ - table_name = self.bs_table.name if table_name is None else table_name - tbl = self._get_table(table_name) - query = sa.select(tbl.c[column]).distinct() + # Default to the unified metadata table when table_name is None. + defaulted = table_name is None + tbl = self.md_table if defaulted else self._get_table(table_name) + query = sa.select(tbl.c[column]).select_from(tbl).distinct() + if defaulted: + # Restrict to baseline rows so the result matches the legacy + # baseline-only contract. + query = query.where(tbl.c["upgrade"] == typed_literal(tbl.c["upgrade"], "0")) if get_query_only: return self._compile(query) @@ -256,10 +287,33 @@ def get_distinct_count( Returns: pd.Series: The distinct counts. """ - tbl = self.bs_table if table_name is None else self._get_table(table_name) + # When table_name is None, use the canonical bs_table alias so column + # references in the SELECT (e.g. self.sample_wt → bs_table.weight) + # bind to the same table that's in the FROM. Selecting from md_table + # directly would cause SA to auto-add bs_table as a comma-join (and + # potentially produce a duplicate `upgrade` column on SELECT *). + # When the user passes an explicit table_name that's the unified + # metadata table, route it through the bs_table alias for the same + # reason — users naturally pass the real Athena table name. + if table_name is None or self._get_table(table_name) is self.md_table: + tbl = self.bs_table + else: + tbl = self._get_table(table_name) + # Rebind sample_wt to whichever table is actually in scope. The cached + # `self.sample_wt` was bound to bs_table at init; if the user passed an + # auxiliary table that also has a "weight" column, use that one to + # avoid pulling bs_table into the FROM. 
+ if isinstance(self.sample_wt, sa.Column) and self.sample_wt.name in tbl.c: + sample_wt = tbl.c[self.sample_wt.name] + else: + sample_wt = self.sample_wt query = sa.select( - tbl.c[column], safunc.sum(1).label("sample_count"), safunc.sum(self.sample_wt).label("weighted_count") - ) + tbl.c[column], safunc.sum(1).label("metadata_rows_count"), safunc.sum(sample_wt).label("weighted_count") + ).select_from(tbl) + if table_name is None or tbl is self.bs_table: + # Default-table case (or user-passed-md): restrict to baseline rows + # so the count matches the legacy baseline-only contract. + query = query.where(self._md_baseline_filter(tbl)) query = query.group_by(tbl.c[column]).order_by(tbl.c[column]) if get_query_only: return self._compile(query) @@ -309,70 +363,191 @@ def get_results_csv( Pandas dataframe that is a subset of the results csv, that belongs to provided list of utilities """ restrict = list(restrict) if restrict else [] - query = sa.select("*").select_from(self.bs_table) + # Select through the canonical bs_table alias so any restrict column + # references (resolved via _get_column → bs_table.c[...]) bind to the + # alias that's in the FROM. Selecting from md_table directly would + # produce a comma-join + duplicate `upgrade` column in `SELECT *`. 
+ query = sa.select("*").select_from(self.bs_table).where(self._md_baseline_filter()) query = self._add_restrict(query, restrict, annual_only=True) compiled_query = self._compile(query) if get_query_only: return compiled_query - self._session_queries.add(compiled_query) - if compiled_query in self._query_cache: - return self._query_cache[compiled_query].copy().set_index(self.bs_bldgid_column.name) logger.info("Making results_csv query ...") - result = self.execute(query) - return result.set_index(self.bs_bldgid_column.name) + return self.execute(query).set_index(list(self.md_key)) + + def _s3_list_all(self, bucket: str, prefix: str) -> list[dict]: + """Return all S3 objects under `prefix` by paginating list_objects_v2.""" + paginator = self._aws_s3.get_paginator("list_objects_v2") + contents: list[dict] = [] + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + contents.extend(page.get("Contents", [])) + return contents + + @staticmethod + def _upgrade_file_variants(upgrade_id: Union[str, int]) -> list[str]: + """Return the set of filename tokens that would mark a parquet as belonging to + the given upgrade. + + For baseline (upgrade 0): baseline.parquet, up00.parquet, up0.parquet, upgrade00.parquet, + upgrade0.parquet. + For upgrade N: up{N}.parquet, up{N:02}.parquet, upgrade{N}.parquet, upgrade{N:02}.parquet. 
+ """ + try: + num = int(upgrade_id) + except (TypeError, ValueError): + num = None + tokens: list[str] = [] + if num is not None: + short = str(num) + padded = f"{num:02d}" + for p in ("up", "upgrade"): + tokens.append(f"{p}{short}.parquet") + if padded != short: + tokens.append(f"{p}{padded}.parquet") + if num == 0: + tokens.append("baseline.parquet") + else: + s = str(upgrade_id) + for p in ("up", "upgrade"): + tokens.append(f"{p}{s}.parquet") + # preserve order while dedup + return list(dict.fromkeys(tokens)) + + def download_metadata_and_annual_results( + self, + upgrade_id: Union[str, int] = "0", + folder: Optional[Union[str, pathlib.Path]] = None, + ) -> pathlib.Path: + """Download all annual-results parquet files for a given upgrade from S3. + + The Glue-registered table for metadata lives at `s3:////...`. Many runs + store their parquet inside Hive-style partition subfolders (e.g. `state=AK/county=.../`), + each partition holding one parquet per upgrade. This method recursively walks the glue + location and downloads every parquet whose filename ends with one of the known + upgrade-specific tokens (see `_upgrade_file_variants`). + + Files already present locally are skipped. Downloads are done via a thread pool + (size = min(10, N files)). Local layout mirrors the S3 layout under `folder`. + + Args: + upgrade_id: 0/"0" for baseline, else the upgrade number. + folder: Destination root; defaults to `cache_folder/metadata_and_annual_results/`. - def _download_results_csv(self) -> str: - """Downloads the results csv from s3 and returns the path to the downloaded file. Returns: - str: The path to the downloaded file. + The local destination folder (pathlib.Path). 
""" - local_copy_path = self.cache_folder / f"{self.db_name}_{self.bs_table.name}.parquet" - if os.path.exists(local_copy_path): - return local_copy_path - + upgrade_id_str = str(upgrade_id) + if folder is None: + folder = self.cache_folder / "metadata_and_annual_results" + folder = pathlib.Path(folder) + # nest per-upgrade so baseline (upgrade_id="0") and upgrade-N live in separate subdirs. + # Callers want to pd.read_parquet(folder) and get only that upgrade's rows. + upgrade_root = folder / f"upgrade={upgrade_id_str}" + + # The unified annual_and_metadata parquet holds rows for every upgrade, + # baseline included; the per-upgrade selection happens via the file-name + # token filter in `_upgrade_file_variants` below. if isinstance(self.table_name, str): - db_table_name = f"{self.table_name}{self.db_schema.table_suffix.baseline}" + db_table_name = f"{self.table_name}{self.db_schema.table_suffix.annual_and_metadata}" else: db_table_name = self.table_name[0] - baseline_path = self._aws_glue.get_table(DatabaseName=self.db_name, Name=db_table_name)["Table"][ + + table_loc = self._aws_glue.get_table(DatabaseName=self.db_name, Name=db_table_name)["Table"][ "StorageDescriptor" ]["Location"] - bucket = baseline_path.split("/")[2] - key = "/".join(baseline_path.split("/")[3:]) - s3_data = self._aws_s3.list_objects(Bucket=bucket, Prefix=key) - - if "Contents" not in s3_data: - raise ValueError(f"Results parquet not found in s3 at {baseline_path}") - matching_files = [ - path["Key"] - for path in s3_data["Contents"] - if "up00.parquet" in path["Key"] or "baseline.parquet" in path["Key"] - ] - - if len(matching_files) > 1: + bucket = table_loc.split("/")[2] + key_prefix = "/".join(table_loc.split("/")[3:]) + if not key_prefix.endswith("/"): + key_prefix += "/" + + tokens = self._upgrade_file_variants(upgrade_id_str) + contents = self._s3_list_all(bucket, key_prefix) + if not contents: + raise ValueError(f"No parquet files found in s3://{bucket}/{key_prefix}") + + def 
matches(path_key: str) -> bool: + basename = path_key.rsplit("/", 1)[-1] + return any(basename.endswith(tok) for tok in tokens) + + matching_keys = [obj["Key"] for obj in contents if matches(obj["Key"])] + if not matching_keys: + sample = [obj["Key"] for obj in contents[:10]] raise ValueError( - f"Multiple results parquet found in s3 at {baseline_path} for baseline." - f"These files matched: {matching_files}" + f"No results parquet matching upgrade={upgrade_id_str} found in s3://{bucket}/{key_prefix}. " + f"Looked for filenames ending in {tokens}. Example files: {sample}" ) - if len(matching_files) == 0: + + # group-by-directory uniqueness guard: in any single S3 "folder", we should have at + # most one file per upgrade. Multiple matches in the same folder means ambiguity. + by_dir: dict[str, list[str]] = {} + for k in matching_keys: + d = k.rsplit("/", 1)[0] + by_dir.setdefault(d, []).append(k) + ambiguous = {d: ks for d, ks in by_dir.items() if len(ks) > 1} + if ambiguous: raise ValueError( - f"No results parquet found in s3 at {baseline_path} for baseline." - f"Here are the files: {[content[0]['Key'] for content in s3_data['Contents']]}" + f"Multiple parquet files match upgrade={upgrade_id_str} in the same S3 folder: " + f"{ambiguous}" + ) + + tasks: list[tuple[str, pathlib.Path]] = [] + for k in matching_keys: + rel = k[len(key_prefix):] if k.startswith(key_prefix) else k + local_path = upgrade_root / rel + if local_path.exists(): + continue + tasks.append((k, local_path)) + + total_matches = len(matching_keys) + already_cached = total_matches - len(tasks) + if not tasks: + logger.info( + f"All {total_matches} parquet file(s) for upgrade={upgrade_id_str} already present locally " + f"at {upgrade_root}; skipping download." ) + else: + if already_cached: + logger.info( + f"{already_cached}/{total_matches} parquet file(s) for upgrade={upgrade_id_str} already " + f"present locally; downloading the remaining {len(tasks)}." 
+ ) + else: + logger.info( + f"Downloading {len(tasks)} parquet file(s) for upgrade={upgrade_id_str} to {upgrade_root}." + ) + max_workers = min(10, len(tasks)) + + def _download(k_and_path): + k, local_path = k_and_path + local_path.parent.mkdir(parents=True, exist_ok=True) + self._aws_s3.download_file(bucket, k, str(local_path)) + return local_path + + desc = f"Downloading parquet for upgrade={upgrade_id_str}" + with ThreadPoolExecutor(max_workers=max_workers) as pool: + futures = [pool.submit(_download, t) for t in tasks] + for fut in tqdm(as_completed(futures), total=len(futures), desc=desc, unit="file"): + fut.result() + + return upgrade_root - self._aws_s3.download_file(bucket, matching_files[0], local_copy_path) - return local_copy_path + def _download_results_csv(self) -> pathlib.Path: + """Download the baseline results parquet(s). See `download_metadata_and_annual_results`.""" + return self.download_metadata_and_annual_results(upgrade_id="0") def get_results_csv_full(self) -> pd.DataFrame: """Returns the full results csv table. This is the same as get_results_csv without any restrictions. It uses the stored parquet files in s3 to download the results which is faster than querying athena. Returns: - pd.DataFrame: The full results csv. + pd.DataFrame: The full results csv, indexed by md_key. 
""" local_copy_path = self._download_results_csv() df = pd.read_parquet(local_copy_path) - if df.index.name != self.bs_bldgid_column.name: - df = df.set_index(self.bs_bldgid_column.name) + index_keys = list(self.md_key) + if list(df.index.names) != index_keys: + if df.index.name is not None or any(n is not None for n in df.index.names): + df = df.reset_index() + df = df.set_index(index_keys) return df @typing.overload @@ -421,90 +596,51 @@ def get_upgrades_csv( Pandas dataframe that is a subset of the results csv, that belongs to provided list of utilities """ restrict = list(restrict) if restrict else [] - query = sa.select("*").select_from(self.up_table) - if upgrade_id: - if self.up_table is None: - raise ValueError("This run has no upgrades") - query = query.where(self.up_table.c["upgrade"] == str(upgrade_id)) + # Select through the canonical bs_table alias so restrict columns + # resolved via _get_column bind to the alias that's in the FROM (not + # md_table directly, which would produce a comma-join). 
+ up_col = self.bs_table.c["upgrade"] + query = sa.select("*").select_from(self.bs_table).where( + up_col == typed_literal(up_col, upgrade_id) + ) - query = self._add_restrict(query, restrict, annual_only=True) + rewritten_restrict = [] + for col, vals in restrict: + if isinstance(col, str) and col in self.bs_table.c: + rewritten_restrict.append((self.bs_table.c[col], vals)) + else: + rewritten_restrict.append((col, vals)) + query = self._add_restrict(query, rewritten_restrict, annual_only=True) compiled_query = self._compile(query) if get_query_only: return compiled_query - self._session_queries.add(compiled_query) - if compiled_query in self._query_cache: - return self._query_cache[compiled_query].copy().set_index(self.bs_bldgid_column.name) logger.info("Making results_csv query for upgrade ...") - return self.execute(query).set_index(self.bs_bldgid_column.name) + return self.execute(query).set_index(list(self.md_key)) - def _download_upgrades_csv(self, upgrade_id: Union[int, str]) -> str: - """Downloads the upgrades csv from s3 and returns the path to the downloaded file.""" - if self.up_table is None: - raise ValueError("This run has no upgrades") - - available_upgrades = list(self.get_available_upgrades()) - available_upgrades.remove("0") + def _download_upgrades_csv(self, upgrade_id: Union[int, str]) -> pathlib.Path: + """Download the upgrade-N results parquet(s). 
See `download_metadata_and_annual_results`.""" if isinstance(upgrade_id, int): upgrade_id = f"{upgrade_id:02}" - + available_upgrades = list(self.get_available_upgrades()) + if "0" in available_upgrades: + available_upgrades.remove("0") if str(upgrade_id) not in available_upgrades: raise ValueError(f"Upgrade {upgrade_id} not found") - local_copy_path = self.cache_folder / f"{self.db_name}_{self.up_table.name}_{upgrade_id}.parquet" - if os.path.exists(local_copy_path): - return local_copy_path - - if isinstance(self.table_name, str): - db_table_name = f"{self.table_name}{self.db_schema.table_suffix.upgrades}" - else: - db_table_name = self.table_name[2] - upgrades_path = self._aws_glue.get_table(DatabaseName=self.db_name, Name=db_table_name)["Table"][ - "StorageDescriptor" - ]["Location"] - bucket = upgrades_path.split("/")[2] - key = "/".join(upgrades_path.split("/")[3:]) - s3_data = self._aws_s3.list_objects(Bucket=bucket, Prefix=key) - - if "Contents" not in s3_data: - raise ValueError(f"Results parquet not found in s3 at {upgrades_path}") - - # out of the contents find the key with name matching the pattern results_up{upgrade_id}.parquet - def is_match(upgrade_id, key): - try: - upgrade_id = int(upgrade_id) - alternative_id = f"{upgrade_id:02}" - except ValueError: - alternative_id = str(upgrade_id) - for prefix in ["up", "upgrade"]: - if f"{prefix}{upgrade_id}.parquet" in key or f"{prefix}{alternative_id}.parquet" in key: - return True - return False - - matching_files = [path["Key"] for path in s3_data["Contents"] if is_match(upgrade_id, path["Key"])] - - if len(matching_files) > 1: - raise ValueError( - f"Multiple results parquet found in s3 at {upgrades_path} for upgrade {upgrade_id}." - f"These files matched: {matching_files}" - ) - if len(matching_files) == 0: - raise ValueError( - f"No results parquet found in s3 at {upgrades_path} for upgrade {upgrade_id}." 
- f"Here are the files: {[content[0]['Key'] for content in s3_data['Contents']]}" - ) - - self._aws_s3.download_file(bucket, matching_files[0], local_copy_path) - return local_copy_path + return self.download_metadata_and_annual_results(upgrade_id=str(upgrade_id)) def get_upgrades_csv_full(self, upgrade_id: Union[int, str]) -> pd.DataFrame: """Returns the full results csv table for upgrades. This is the same as get_upgrades_csv without any restrictions. It uses the stored parquet files in s3 to download the results which is faster than querying - athena. + athena. Indexed by md_key. """ local_copy_path = self._download_upgrades_csv(upgrade_id) df = pd.read_parquet(local_copy_path) - if df.index.name != self.up_bldgid_column.name: - df = df.set_index(self.up_bldgid_column.name) + index_keys = list(self.md_key) + if list(df.index.names) != index_keys: + if df.index.name is not None or any(n is not None for n in df.index.names): + df = df.reset_index() + df = df.set_index(index_keys) if "upgrade" not in df.columns: df.insert(0, "upgrade", upgrade_id) return df @@ -513,7 +649,8 @@ def get_upgrades_csv_full(self, upgrade_id: Union[int, str]) -> pd.DataFrame: def get_building_ids( self, *, - restrict: Sequence[tuple[AnyColType, Union[str, int, Sequence[Union[int, str]]]]] = Field(default_factory=list), + restrict: Sequence[RestrictTuple] = Field(default_factory=list), + avoid: Sequence[RestrictTuple] = Field(default_factory=list), get_query_only: Literal[False] = False, ) -> pd.DataFrame: ... @@ -521,7 +658,8 @@ def get_building_ids( def get_building_ids( self, *, - restrict: Sequence[tuple[AnyColType, Union[str, int, Sequence[Union[int, str]]]]] = Field(default_factory=list), + restrict: Sequence[RestrictTuple] = Field(default_factory=list), + avoid: Sequence[RestrictTuple] = Field(default_factory=list), get_query_only: Literal[True], ) -> str: ... 
@@ -529,35 +667,131 @@ def get_building_ids( def get_building_ids( self, *, - restrict: Sequence[tuple[AnyColType, Union[str, int, Sequence[Union[int, str]]]]] = Field(default_factory=list), + restrict: Sequence[RestrictTuple] = Field(default_factory=list), + avoid: Sequence[RestrictTuple] = Field(default_factory=list), get_query_only: bool, ) -> Union[pd.DataFrame, str]: ... @validate_arguments def get_building_ids( self, - restrict: Sequence[tuple[AnyColType, Union[str, int, Sequence[Union[int, str]]]]] = Field(default_factory=list), + restrict: Sequence[RestrictTuple] = Field(default_factory=list), + avoid: Sequence[RestrictTuple] = Field(default_factory=list), get_query_only: bool = False, ) -> Union[str, pd.DataFrame]: - """ - Returns the list of buildings based on the restrict list + """Return the list of building keys. + + For applied-buildings filtering, compose with `get_applied_buildings_filter`: + f = bsq.get_applied_buildings_filter(all_of=[1, 2]) + ids = bsq.get_building_ids(restrict=[f] if f else []) + # Or to get the complement (universe \\ applied set): + ids = bsq.get_building_ids(avoid=[f] if f else []) + Args: - restrict (list[Tuple[str, List]], optional): The list of where condition to restrict the results to. It - should be specified as a list of tuple. - Example: `[('state',['VA','AZ']), ("build_existing_model.lighting",['60% CFL']), ...]` - get_query_only (bool): If set to true, returns the query string instead of the result. Default is False. + restrict: Standard restrict list. Each entry is either a `(column, value)` + scalar/list comparison, a `(column, subquery)` IN-subquery, or a + `(tuple-of-columns, tuple-subquery)` composite-key membership. + avoid: Same shape as `restrict`, but each entry becomes a NOT-IN / + inequality predicate. Use to select buildings outside a given + set (e.g. `avoid=[applied_filter]` returns buildings the + upgrade did NOT apply to). + get_query_only: If True, return the SQL string instead of executing. 
Returns: - Pandas dataframe consisting of the building ids belonging to the provided list of locations. - + DataFrame of building keys (`md_key_cols`). """ restrict = list(restrict) if restrict else [] - query = sa.select(self.bs_bldgid_column) + avoid = list(avoid) if avoid else [] + # md_table holds rows for every upgrade — filter to baseline so the + # result is one row per (building × keys), not (building × upgrade × keys). + query = sa.select(*self.md_key_cols).select_from(self.bs_table).where(self._md_baseline_filter()) query = self._add_restrict(query, restrict, annual_only=True) + query = self._add_avoid(query, avoid, annual_only=True) if get_query_only: return self._compile(query) return self.execute(query) + @typing.overload + def get_applied_buildings( + self, + *, + any_of: Optional[Sequence[Union[str, int]]] = None, + all_of: Optional[Sequence[Union[str, int]]] = None, + get_query_only: Literal[False] = False, + ) -> pd.DataFrame: ... + + @typing.overload + def get_applied_buildings( + self, + *, + any_of: Optional[Sequence[Union[str, int]]] = None, + all_of: Optional[Sequence[Union[str, int]]] = None, + get_query_only: Literal[True], + ) -> str: ... + + @typing.overload + def get_applied_buildings( + self, + *, + any_of: Optional[Sequence[Union[str, int]]] = None, + all_of: Optional[Sequence[Union[str, int]]] = None, + get_query_only: bool, + ) -> Union[pd.DataFrame, str]: ... + + @validate_arguments + def get_applied_buildings( + self, + *, + any_of: Optional[Sequence[Union[str, int]]] = None, + all_of: Optional[Sequence[Union[str, int]]] = None, + get_query_only: bool = False, + ) -> Union[pd.DataFrame, str]: + """Return building keys for buildings matching an applied-upgrade predicate. + + - `all_of`: must have applicability=true rows for every listed upgrade. + - `any_of`: must have applicability=true rows for at least one listed upgrade. + - Both: AND of the two predicates. + - At least one of `any_of` or `all_of` must be provided. 
+ - Passing 0 (baseline) in either list raises ValueError. + + Args: + any_of: list of upgrade ids — building must have applied to at least one. + all_of: list of upgrade ids — building must have applied to all listed. + get_query_only: If True, return the SQL string instead of executing. + + Returns: + DataFrame of `md_key_cols` for matching buildings. + """ + if not any_of and not all_of: + raise ValueError("get_applied_buildings: must provide any_of or all_of") + select = self._build_applied_subquery(any_of=any_of, all_of=all_of, key_kind="metadata") + # _build_applied_subquery returns a Select; with at least one list non-empty + # it cannot be None (empty-list case returns None and is rejected above). + assert select is not None + if get_query_only: + return self._compile(select) + return self.execute(select) + + def get_applied_buildings_filter( + self, + *, + any_of: Optional[Sequence[Union[str, int]]] = None, + all_of: Optional[Sequence[Union[str, int]]] = None, + ) -> Optional[RestrictTuple]: + """Return a `(cols_or_col, subquery)` tuple to drop into `restrict=[...]` or + `avoid=[...]`. Returns None when both lists are empty/None. + + Typical use: + f = bsq.get_applied_buildings_filter(all_of=[1, 2]) + df = bsq.query(..., restrict=[f, ("state", ["CO"])] if f else [("state", ["CO"])]) + + See `get_applied_buildings` for predicate semantics. + """ + select = self._build_applied_subquery(any_of=any_of, all_of=all_of, key_kind="metadata") + if select is None: + return None + return self._make_applied_filter_tuple(select, key_kind="metadata") + @typing.overload def _get_simulation_info(self, get_query_only: Literal[False] = False) -> SimInfo: ... 
@@ -571,12 +805,14 @@ def _get_simulation_info(self, get_query_only: bool = False) -> Union[str, SimIn bldg_df = self.execute(query0) bldg_id = bldg_df.values[0][0] upgrade_id = bldg_df.values[0][1] - query1 = sa.select(self.timestamp_column.distinct().label(self.timestamp_column_name)).where( - self.ts_bldgid_column == bldg_id + ucol = self._ts_upgrade_col + query1 = ( + sa.select(self.timestamp_column.distinct().label(self.timestamp_column_name)) + .where(self.ts_bldgid_column == bldg_id) + .where(ucol == typed_literal(ucol, upgrade_id)) + .order_by(self.timestamp_column) + .limit(2) ) - if self.up_table is not None: - query1 = query1.where(self._ts_upgrade_col == str(upgrade_id)) - query1 = query1.order_by(self.timestamp_column).limit(2) if get_query_only: return self._compile(query1) @@ -626,11 +862,9 @@ def _get_special_column( def _get_gcol( self, column: AnyColType, annual_only: bool = False ) -> DBColType: # gcol => group by col - """Get a DB column for the purpose of grouping. If the provided column doesn't exist as is, - tries to get the column by prepending self._char_prefix.""" - + """Get a DB column for the purpose of grouping.""" if isinstance(column, sa.Column): - return column.label(self._simple_label(column.name)) # already a col + return column.label(self._simple_label(column.name)) if isinstance(column, SALabel): return column @@ -639,15 +873,7 @@ def _get_gcol( return sa.literal(column).label(self._simple_label(column.name)) if isinstance(column, str): - try: - return self._get_column(column, annual_only=annual_only).label(self._simple_label(column)) - except (ValueError, KeyError): - if column.startswith(self._char_prefix): - new_name = column.removeprefix(self._char_prefix) - return self._get_column(new_name, annual_only=annual_only).label(column) - else: - new_name = f"{self._char_prefix}{column}" - return self._get_column(new_name, annual_only=annual_only).label(column) + return self._get_column(column, 
annual_only=annual_only).label(self._simple_label(column)) raise ValueError(f"Invalid column name type {column}: {type(column)}") @@ -686,7 +912,12 @@ def get_calculated_column(self, column_name: str, column_expr: str, table="basel return resolved_col.label(self._simple_label(column_name)) def _get_enduse_cols(self, enduses: Sequence[AnyColType], table="baseline") -> Sequence[DBColType]: - tbls_dict = {"baseline": self.bs_table, "upgrade": self.up_table, "timeseries": self.ts_table} + # "baseline" and "upgrade" both resolve to the unified metadata table — + # the columns are the same; the per-upgrade selection happens via WHERE + # at the call site. "timeseries" stays distinct. We bind to bs_table + # (the canonical alias of md_table) so column references in outer + # aggregation queries pick up the alias that's actually in the FROM. + tbls_dict = {"baseline": self.bs_table, "upgrade": self.bs_table, "timeseries": self.ts_table} tbl = tbls_dict[table] enduse_cols: list[DBColType] = [] for enduse in enduses: @@ -712,7 +943,7 @@ def get_groupby_cols(self) -> list[str]: Returns: list[str]: List of building characteristics. 
""" - cols = {y.removeprefix(self._char_prefix) for y in self.bs_table.c.keys() if y.startswith(self._char_prefix)} + cols = {y.removeprefix(self._char_prefix) for y in self.md_table.c.keys() if y.startswith(self._char_prefix)} return list(cols) def _validate_group_by(self, group_by: Sequence[Union[str, tuple[str, str]]]): @@ -730,7 +961,9 @@ def get_available_upgrades(self) -> Sequence[str]: Returns: list: List of upgrades """ - return list([str(u) for u in self.report.get_success_report().index]) + query = sa.select(self.md_table.c["upgrade"]).select_from(self.md_table).distinct().order_by(sa.text("1")) + upgrades = self.execute(query)["upgrade"].dropna().map(str).to_list() + return list(dict.fromkeys(["0", *upgrades])) def _validate_upgrade(self, upgrade_id: Union[int, str]) -> str: upgrade_id = "0" if upgrade_id in (None, "0") else str(upgrade_id) @@ -742,16 +975,35 @@ def _validate_upgrade(self, upgrade_id: Union[int, str]) -> str: return str(upgrade_id) def _split_restrict(self, restrict): - # Some cols like "state" might be available in both ts and bs table - bs_restrict = [] # restrict to apply to baseline table - ts_restrict = [] # restrict to apply to timeseries table + # Some cols (e.g. comstock `state`, `upgrade`) live on both md and ts tables. + # When that happens, restrict BOTH sides — Athena's planner often can't push + # a ts-side filter back through the bldg_id join to the md scan, so a single- + # sided filter leaves the metadata subquery scanning the full table. + # + # `extra_restrict` holds clauses whose column targets neither md nor ts — + # typically a join_list table (e.g. `eiaid_weights.eiaid` from the utility + # methods). These can't ride the inner ts/md join ON-clause because the + # referenced table isn't in scope yet; they must go to the outer WHERE. 
+ md_restrict = [] + ts_restrict = [] + extra_restrict = [] for col, restrict_vals in restrict: - if self._restrict_targets_ts(col): # prioritize ts table - col_name = col if isinstance(col, str) else col.name - ts_restrict.append([self.ts_table.c[col_name], restrict_vals]) - else: - bs_restrict.append([self._get_gcol(col, annual_only=True), restrict_vals]) - return bs_restrict, ts_restrict + targets_ts = self._restrict_targets_ts(col) + targets_md = self._restrict_targets_md(col) + if targets_ts: + if isinstance(col, tuple): + ts_restrict.append([col, restrict_vals]) + else: + col_name = col if isinstance(col, str) else col.name + ts_restrict.append([self.ts_table.c[col_name], restrict_vals]) + if targets_md: + if isinstance(col, tuple): + md_restrict.append([col, restrict_vals]) + else: + md_restrict.append([self._get_gcol(col, annual_only=True), restrict_vals]) + if not targets_ts and not targets_md: + extra_restrict.append([col, restrict_vals]) + return md_restrict, ts_restrict, extra_restrict def _restrict_targets_ts(self, col: AnyColType) -> bool: if self.ts_table is None: @@ -763,6 +1015,38 @@ def _restrict_targets_ts(self, col: AnyColType) -> bool: if isinstance(col, SALabel): source_col = getattr(col, "element", None) return isinstance(source_col, SACol) and getattr(source_col, "table", None) is self.ts_table + if isinstance(col, tuple) and col: + return all( + isinstance(c, SACol) and getattr(c, "table", None) is self.ts_table for c in col + ) + return False + + def _restrict_targets_md(self, col: AnyColType) -> bool: + # md_table and bs_table share columns (bs is an alias of md), so check + # both — restrict columns may be bound to either depending on call site. + md_handles = (self.md_table, self.bs_table) + if isinstance(col, str): + # Try both bare name and prefixed form. Char/output columns on + # md often carry a prefix (e.g. 
`in.`, `out.`); a + # user-supplied bare `` should still classify as md so + # _split_restrict can route the clause into the inner JOIN / + # bs_per_bldg WHERE rather than the outer WHERE (which would + # produce a comma-join against the canonical bs alias). + if col in self.bs_table.columns: + return True + for prefix in (self._char_prefix, self._out_prefix): + if f"{prefix}{col}" in self.bs_table.columns: + return True + return False + if isinstance(col, SACol): + return getattr(col, "table", None) in md_handles + if isinstance(col, SALabel): + source_col = getattr(col, "element", None) + return isinstance(source_col, SACol) and getattr(source_col, "table", None) in md_handles + if isinstance(col, tuple) and col: + return all( + isinstance(c, SACol) and getattr(c, "table", None) in md_handles for c in col + ) return False def _is_timeseries_upgrade_restrict(self, col: AnyColType) -> bool: @@ -840,21 +1124,6 @@ def _process_groupby_cols(self, group_by, annual_only=False) -> list[DBColType]: return [] return [self._get_gcol(entry, annual_only=annual_only) for entry in group_by] - def _get_simulation_timesteps_count(self): - # find the simulation time interval - query = sa.select(self.ts_bldgid_column, safunc.sum(1).label("count")) - query = query.group_by(self.ts_bldgid_column) - sim_timesteps_count = self.execute(query) - bld0_step_count = sim_timesteps_count["count"].iloc[0] - n_buildings_with_same_count = sum(sim_timesteps_count["count"] == bld0_step_count) - if n_buildings_with_same_count != len(sim_timesteps_count): - logger.warning( - "Not all buildings have the same number of timestamps. This can cause wrong" - "scaled_units_count and other problems." 
- ) - - return bld0_step_count - @typing.overload def get_buildings_by_locations( self, location_col: str, locations: list[str], get_query_only: Literal[False] = False @@ -885,102 +1154,263 @@ def get_buildings_by_locations( Pandas dataframe consisting of the building ids belonging to the provided list of locations. """ - query = sa.select(self.bs_bldgid_column) + md_key_cols = self.md_key_cols + # md_table holds every upgrade — restrict to baseline rows so each + # (key) appears once, not once per upgrade. + query = sa.select(*md_key_cols).where(self._md_baseline_filter()) query = query.where(self._get_column(location_col, [self.bs_table]).in_(locations)) - query = self._add_order_by(query, [self.bs_bldgid_column]) + query = self._add_order_by(query, md_key_cols) if get_query_only: return self._compile(query) res = self.execute(query) return res @property - def _bs_completed_status_col(self): - if not isinstance(self.bs_table.c[self.db_schema.column_names.completed_status].type, sqltypes.String): - return sa.cast(self.bs_table.c[self.db_schema.column_names.completed_status], sa.String).label( - "completed_status" - ) - else: - return self.bs_table.c[self.db_schema.column_names.completed_status] + def _md_completed_status_col(self): + return self.bs_table.c[self.db_schema.column_names.completed_status] @property - def _up_completed_status_col(self): - if self.up_table is None: - raise ValueError("No upgrades table") - if not isinstance(self.up_table.c[self.db_schema.column_names.completed_status].type, sqltypes.String): - return sa.cast(self.up_table.c[self.db_schema.column_names.completed_status], sa.String).label( - "completed_status" - ) - else: - return self.up_table.c[self.db_schema.column_names.completed_status] + def _md_successful_condition(self): + """`md.applicability=true`. 
No upgrade filter — callers pin + `md.upgrade=N` explicitly when they want a specific upgrade.""" + col = self._md_completed_status_col + return col == typed_literal(col, self.db_schema.completion_values.success) @property - def _bs_successful_condition(self): - return self._bs_completed_status_col == self.db_schema.completion_values.success - - @property - def _up_successful_condition(self): - return self._up_completed_status_col == self.db_schema.completion_values.success + def _md_baseline_successful_condition(self): + """`md.applicability=true AND md.upgrade=0` — combined helper for the + common case "successful baseline rows", matching the legacy + `_bs_successful_condition` semantics on the unified table.""" + return sa.and_(self._md_successful_condition, self._md_baseline_filter()) @property def _ts_upgrade_col(self): - if not isinstance(self.ts_table.c["upgrade"].type, sqltypes.String): - return sa.cast(self.ts_table.c["upgrade"], sa.String).label("upgrade") - else: - return self.ts_table.c["upgrade"] + return self.ts_table.c["upgrade"] @property - def _up_upgrade_col(self): - if self.up_table is None: - raise ValueError("No upgrades table") - if not isinstance(self.up_table.c["upgrade"].type, sqltypes.String): - return sa.cast(self.up_table.c["upgrade"], sa.String).label("upgrade") - else: - return self.up_table.c["upgrade"] + def _md_upgrade_col(self): + return self.bs_table.c["upgrade"] def _get_completed_status_col(self, table: AnyTableType): - if not isinstance(table.c[self.db_schema.column_names.completed_status].type, sqltypes.String): - return sa.cast(table.c[self.db_schema.column_names.completed_status], sa.String).label("completed_status") - else: - return table.c[self.db_schema.column_names.completed_status] + return table.c[self.db_schema.column_names.completed_status] def _get_success_condition(self, table: AnyTableType): - return self._get_completed_status_col(table) == self.db_schema.completion_values.success + col = 
self._get_completed_status_col(table) + return col == typed_literal(col, self.db_schema.completion_values.success) + + @property + def _state_agg_columns(self) -> set[str]: + """Names of columns physically present on the alt metadata table. + Empty set when the schema declares no alt table. Used by + `_pick_metadata_table` to decide whether routing is safe — if any + group_by or restrict column is absent here, we must scan the + primary table instead. + """ + if self.bs_table_state_agg is None: + return set() + return set(self.bs_table_state_agg.c.keys()) + + def _column_name_or_none(self, col_ref) -> Optional[str]: + """Resolve a user-facing column reference to its physical column + name on `bs_table`, or return None if it can't be resolved + (calculated columns, MappedColumns, etc., are not propagatable). + Mirrors `_get_column`'s `in.` prefix logic so that user-supplied + `state` resolves to ResStock's `in.state` and ComStock's bare + `state` consistently. + """ + try: + resolved = self._get_column(col_ref, annual_only=True) + except (ValueError, AttributeError): + return None + if not isinstance(resolved, sa.Column): + return None + if resolved.table is not self.bs_table: + return None + return resolved.name + + def _pick_metadata_table( + self, + group_by: Sequence, + restrict: Sequence, + ) -> Literal["primary", "state_agg"]: + """Decide whether to scan the primary `annual_and_metadata` table + (today's default) or the smaller `annual_and_metadata_state_agg` + alt table for this query. + + Routing rule: pick `state_agg` iff the schema declares an alt + table AND every column referenced by `group_by` or `restrict` + physically exists on the alt table. The alt table omits + finer-grain columns (county, tract gisjoin, tract demographics) + — any reference to them disqualifies routing. + + Calculated/MappedColumn group-by entries can't be resolved to a + single physical column; they conservatively disqualify routing. 
+ + Returns: + "primary" — use today's bs_table (always safe). + "state_agg" — use bs_table_state_agg (smaller, faster when + eligible; see INVESTIGATION_partition_overhead.md + for measurements). + """ + if self.bs_table_state_agg is None: + return "primary" + alt_cols = self._state_agg_columns + # Check group_by: each entry must resolve to a column present + # on the alt table. + for g in group_by or (): + if isinstance(g, str): + # Try both the user-supplied name and the in.-prefixed + # form, since the alt table uses the prefixed convention + # for some schemas (e.g. ResStock: in.state). + char_prefix = self.db_schema.column_prefix.characteristics + if g in alt_cols: + continue + if g.startswith(char_prefix) and g.removeprefix(char_prefix) in alt_cols: + continue + if not g.startswith(char_prefix) and f"{char_prefix}{g}" in alt_cols: + continue + return "primary" + elif isinstance(g, sa.Column): + if g.name not in alt_cols: + return "primary" + elif isinstance(g, SALabel): + # Calculated columns: conservatively unroutable. They + # may reference columns from outside `bs_table` (e.g. + # ts-side enduses) which the alt table also lacks. + return "primary" + elif isinstance(g, MappedColumn): + # MappedColumns are user-supplied literal mappings — + # they don't depend on table schema, so they're safe. + continue + else: + # Unknown entry shape — be conservative. + return "primary" + # Check restrict: each col_ref must resolve to a bs_table column + # that's also on the alt table. + for entry in restrict or (): + col_ref = entry[0] if isinstance(entry, (tuple, list)) else None + if col_ref is None: + continue + # Multi-column tuple LHS (e.g. applied-buildings filter): + # safe to route iff every component column is on the alt. + if isinstance(col_ref, tuple): + for c in col_ref: + name = c.name if isinstance(c, sa.Column) else None + if name is None or name not in alt_cols: + return "primary" + continue + # Single column ref: resolve through the same prefix logic. 
+ if isinstance(col_ref, str): + char_prefix = self.db_schema.column_prefix.characteristics + if col_ref in alt_cols: + continue + if col_ref.startswith(char_prefix) and col_ref.removeprefix(char_prefix) in alt_cols: + continue + if not col_ref.startswith(char_prefix) and f"{char_prefix}{col_ref}" in alt_cols: + continue + return "primary" + if isinstance(col_ref, sa.Column): + if col_ref.name not in alt_cols: + return "primary" + continue + # Unknown shape — be conservative. + return "primary" + return "state_agg" - def _get_applied_in_subquery(self, applied_in: Optional[Sequence[str | int]]): - """Return a building-id subquery for buildings where all listed upgrades applied successfully.""" - if not applied_in: + def _build_applied_subquery( + self, + *, + any_of: Optional[Sequence[str | int]] = None, + all_of: Optional[Sequence[str | int]] = None, + key_kind: Literal["metadata", "timeseries"] = "metadata", + ): + """Return a unique-key subquery for buildings matching the applied predicate. + + - `all_of`: must have applicability=true rows for every listed upgrade. + - `any_of`: must have applicability=true rows for at least one listed upgrade. + - Both: AND of the two predicates. + - Neither: returns None. + - 0 in either list: ValueError. Baseline isn't an "applied" upgrade. + + `key_kind` selects which unique-key columns to project — use "timeseries" when + the subquery will filter the timeseries table (whose key may be narrower). 
+ """ + all_ids = self._normalize_applied_list(all_of) if all_of else [] + any_ids = self._normalize_applied_list(any_of) if any_of else [] + if not all_ids and not any_ids: return None - if self.up_table is None: - raise ValueError("No upgrades table found.") - upgrade_ids = list(dict.fromkeys(self._validate_upgrade(upgrade_id) for upgrade_id in applied_in)) - bldg_id_col = self.up_table.c[self.building_id_column_name] - return ( - sa.select(bldg_id_col) + up_col = self._md_upgrade_col + union_ids = list(dict.fromkeys(all_ids + any_ids)) + typed_union = [typed_literal(up_col, uid) for uid in union_ids] + key_names = self._get_unique_keys(key_kind) + md_key_cols = [self.bs_table.c[name] for name in key_names] + + select = ( + sa.select(*md_key_cols) .where( - self._up_upgrade_col.in_(upgrade_ids), - self._up_successful_condition, + up_col.in_(typed_union), + self._md_successful_condition, ) - .group_by(bldg_id_col) - .having(sa.func.count(sa.func.distinct(self._up_upgrade_col)) == len(upgrade_ids)) + .group_by(*md_key_cols) ) - def _add_applied_in_restrict( + if all_ids and not any_ids: + # all_of-only: identical SQL shape to the prior `applied_in` form. + select = select.having(sa.func.count(sa.func.distinct(up_col)) == len(all_ids)) + elif any_ids and not all_ids: + # any_of-only: GROUP BY + WHERE upgrade IN (...) is sufficient. A + # surviving row means at least one matching applicable upgrade + # existed; GROUP BY collapses to one row per key. + pass + else: + # Both lists: AND the two predicates via CASE-filtered counts. 
+ typed_all = [typed_literal(up_col, uid) for uid in all_ids] + typed_any = [typed_literal(up_col, uid) for uid in any_ids] + all_case = sa.case((up_col.in_(typed_all), up_col), else_=None) + any_case = sa.case((up_col.in_(typed_any), up_col), else_=None) + select = select.having( + sa.and_( + sa.func.count(sa.func.distinct(all_case)) == len(all_ids), + sa.func.count(sa.func.distinct(any_case)) >= 1, + ) + ) + return select + + def _normalize_applied_list(self, upgrades: Sequence[str | int]) -> list[str]: + """Validate and normalize an upgrade-id list. Rejects 0 (baseline).""" + normalized: list[str] = [] + for raw in upgrades: + uid = self._validate_upgrade(raw) + if str(uid) == "0": + raise ValueError( + "0 (baseline) is not a valid applied upgrade — applicability is " + "meaningful only for upgrades" + ) + if uid not in normalized: + normalized.append(uid) + return normalized + + def _make_applied_filter_tuple( self, - restrict: Sequence[RestrictTuple], + select, *, - applied_in: Optional[Sequence[str | int]], - annual_only: bool, - ) -> list[RestrictTuple]: - """Append the applied-in building filter to the existing restrict list when requested.""" - updated_restrict = list(restrict) if restrict else [] - applied_subquery = self._get_applied_in_subquery(applied_in) - if applied_subquery is None: - return updated_restrict + key_kind: Literal["metadata", "timeseries"] = "metadata", + ) -> RestrictTuple: + """Wrap a Select into the `(cols_or_col, subquery)` shape used by restrict/avoid. - bldg_col = self.bs_bldgid_column if annual_only or self.ts_table is None else self.ts_bldgid_column - updated_restrict.append((bldg_col, applied_subquery)) - return updated_restrict + When `key_kind="timeseries"`, the LHS columns are bound to ts_table so + that `_split_restrict` routes the filter to the TS-side WHERE (where + `inapplicables_have_ts` rows would otherwise inflate totals). 
+ """ + key_names = self._get_unique_keys(key_kind) + if key_kind == "timeseries" and self.ts_table is not None: + cols = [self.ts_table.c[name] for name in key_names] + else: + cols = [self.bs_table.c[name] for name in key_names] + if len(cols) == 1: + return (cols[0], select) + return (tuple(cols), select) @typing.overload def query( @@ -1000,7 +1430,6 @@ def query( restrict: Sequence[RestrictTuple] = Field(default_factory=list), avoid: Sequence[RestrictTuple] = Field(default_factory=list), applied_only: bool = False, - applied_in: Sequence[str | int] | None = None, get_quartiles: bool = False, get_nonzero_count: bool = False, unload_to: str = "", @@ -1028,7 +1457,6 @@ def query( restrict: Sequence[RestrictTuple] = Field(default_factory=list), avoid: Sequence[RestrictTuple] = Field(default_factory=list), applied_only: bool = False, - applied_in: Sequence[str | int] | None = None, get_quartiles: bool = False, get_nonzero_count: bool = False, unload_to: str = "", @@ -1056,7 +1484,6 @@ def query( restrict: Sequence[RestrictTuple] = Field(default_factory=list), avoid: Sequence[RestrictTuple] = Field(default_factory=list), applied_only: bool = False, - applied_in: Sequence[str | int] | None = None, get_quartiles: bool = False, get_nonzero_count: bool = False, unload_to: str = "", @@ -1084,9 +1511,6 @@ def query(self, *args, **kwargs) -> str | pd.DataFrame: where baseline_column_name and new_column_name are the columns on which the new_table should be joined to baseline table. applied_only: Calculate savings shape based on only buildings to which the upgrade applied - applied_in: Optional list of upgrade ids. When set alongside `applied_only=True`, the query is further - restricted to buildings for which all listed upgrades satisfy the run's success/applicability - condition. weights: The additional columns to use as weight. The "build_existing_model.sample_weight" is already used. It is specified as either list of string or list of tuples. 
When only string is used, the string is the column name, when tuple is passed, the second element is the table name. @@ -1109,6 +1533,4 @@ def query(self, *args, **kwargs) -> str | pd.DataFrame: Returns: if get_query_only is True, returns the query_string, otherwise returns a pandas dataframe """ - # TODO: Replace with contents of agg._query(*args, **kwargs) when aggregate_query module is deprecated - # or implement via a Mixin return self.agg._query(*args, **kwargs) diff --git a/buildstock_query/query_core.py b/buildstock_query/query_core.py index ad03f240..5c42c7af 100644 --- a/buildstock_query/query_core.py +++ b/buildstock_query/query_core.py @@ -21,7 +21,8 @@ from collections import OrderedDict import types from buildstock_query.helpers import CachedFutureDf, AthenaFutureDf, DataExistsException, CustomCompiler -from buildstock_query.helpers import save_pickle, load_pickle, read_csv +from buildstock_query.helpers import read_csv +from buildstock_query.sql_cache import SqlCache, hash_sql, normalize_sql from typing import TypedDict, NewType from botocore.config import Config import urllib3 @@ -35,9 +36,11 @@ MappedColumn, SALabel, DBTableType, - validate_arguments + typed_literal, + validate_arguments, ) -import hashlib +import json as _json_module +import re import toml import uuid from sqlalchemy.sql.selectable import SelectBase, Subquery @@ -84,14 +87,14 @@ def __init__(self, *, params: RunParams) -> None: workgroup (str): The workgroup for athena. The cost will be charged based on workgroup. db_name (str): The athena database name buildstock_type (str, optional): 'resstock' or 'comstock' runs. Defaults to 'resstock' - table_name (str or Union[str, tuple[str, Optional[str], Optional[str]]]): If a single string is provided, - say, 'mfm_run', then it must correspond to tables in athena named mfm_run_baseline and optionally - mfm_run_timeseries and mf_run_upgrades. 
Or, tuple of three elements can be provided for the table names - for baseline, timeseries and upgrade. Timeseries and upgrade can be None if no such table exist. - db_schema (str, optional): The database structure in Athena is different between ResStock and ComStock run. - It is also different between the version in OEDI and default version from BuildStockBatch. This argument - controls the assumed schema. Allowed values are 'resstock_default', 'resstock_oedi', 'comstock_default' - and 'comstock_oedi'. Defaults to 'resstock_default' for resstock and 'comstock_default' for comstock. + table_name (str or tuple[str, Optional[str]]): If a single string is provided, say, 'mfm_run', then it + must correspond to tables in athena formed by appending the schema's + `[table_suffix].annual_and_metadata` and `.timeseries` to it. Or, a tuple `(annual_and_metadata, + timeseries)` can be provided directly. Timeseries may be None if no such table exists. + db_schema (str, optional): The database structure in Athena is different between ResStock and ComStock + run. It is also different between the version in OEDI and default version from BuildStockBatch. This + argument controls the assumed schema. Allowed values are TOML files in the db_schema/ folder + (e.g. 'resstock_oedi', 'comstock_oedi_state_and_county'). sample_weight (str, optional): Specify a custom sample_weight. Otherwise, the default is 1 for ComStock and uses sample_weight in the run for ResStock. region_name (str, optional): the AWS region where the database exists. Defaults to 'us-west-2'. @@ -106,10 +109,11 @@ def __init__(self, *, params: RunParams) -> None: self.run_params = params self.workgroup = params.workgroup self.buildstock_type = params.buildstock_type - self._query_cache: dict[str, pd.DataFrame] = {} # {"query": query_result_df} to cache queries - self._session_queries: set[str] = set() # Set of all queries that is run in current session. 
- self._aws_s3 = boto3.client("s3") + # pool matches the download_metadata_and_annual_results thread pool (10 workers) with + # headroom for incidental per-request metadata calls, so parallel downloads don't + # churn the HTTPS pool and spam "Connection pool is full" warnings. + self._aws_s3 = boto3.client("s3", config=Config(max_pool_connections=32)) self._aws_athena = boto3.client("athena", region_name=params.region_name) self._aws_glue = boto3.client("glue", region_name=params.region_name) self._async_conn = Connection( @@ -145,80 +149,197 @@ def __init__(self, *, params: RunParams) -> None: self.table_name = params.table_name self.cache_folder = pathlib.Path(params.cache_folder) self.athena_query_reuse = params.athena_query_reuse - os.makedirs(self.cache_folder, exist_ok=True) + self._cache = SqlCache(self.cache_folder) self._initialize_tables() self._initialize_book_keeping(params.execution_history) - with contextlib.suppress(FileNotFoundError): - self.load_cache() - - @staticmethod - def _get_compact_cache_name(table_name: str) -> str: - table_name = str(table_name) - if len(table_name) > 64: - return hashlib.sha256(table_name.encode()).hexdigest() + def _initialize_tables(self): + self.md_table, self.ts_table = self._get_tables(self.table_name) + # `bs_table` is a stable alias of md_table that callers use as the + # canonical metadata-side handle in outer queries. Keeping a single + # named alias (not constructing fresh `md.alias(...)` per call) lets + # things like `self.sample_wt = bs_table.c["weight"]` and + # `self.md_key_cols = [bs_table.c[k] for k in md_key]` bind once at + # init time and remain valid in any query that selects through the + # bs alias. Self-join sites construct an additional `md.alias("up")` + # locally for the upgrade-side row set. + self.bs_table = self.md_table.alias("bs") + # Alt metadata table for the state-aggregate routing path (set + # by `_get_tables` when the schema declares + # `table_suffix.annual_and_metadata_state_agg`). 
Most callers + # ignore this; only `_pick_metadata_table` and the `_query` + # path that consumes its output reference it. The shared alias + # name "bs" matches the primary table so generated SQL is + # interchangeable when routed. + self.md_table_state_agg = getattr(self, "_md_table_state_agg_raw", None) + if self.md_table_state_agg is not None: + self.bs_table_state_agg = self.md_table_state_agg.alias("bs") + self.md_state_agg_key: tuple[str, ...] = tuple( + self._get_unique_keys("metadata_state_agg") + ) else: - return table_name + self.bs_table_state_agg = None + self.md_state_agg_key = () - def _get_cache_file_path(self) -> pathlib.Path: - return self.cache_folder / f"{self._get_compact_cache_name(self.table_name)}_query_cache.pkl" + self.md_bldgid_column = self.bs_table.c[self.building_id_column_name] + if self.ts_table is not None: + self.timestamp_column = self.ts_table.c[self.timestamp_column_name] + self.ts_bldgid_column = self.ts_table.c[self.building_id_column_name] - @validate_arguments - def load_cache(self, path: Optional[str] = None): - """Read and update query cache from pickle file. + self.md_key: tuple[str, ...] = tuple(self._get_unique_keys("metadata")) + self.ts_key: tuple[str, ...] = tuple(self._get_unique_keys("timeseries")) - Args: - path (str, optional): The path to the pickle file. If not provided, reads from current directory. 
- """ - pickle_path = pathlib.Path(path) if path else self._get_cache_file_path() - before_count = len(self._query_cache) - saved_cache = load_pickle(pickle_path) - logger.info(f"{len(saved_cache)} queries cache read from {pickle_path}.") - self._query_cache.update(saved_cache) - self.last_saved_queries = set(saved_cache) - after_count = len(self._query_cache) - if diff := after_count - before_count: - logger.info(f"{diff} queries cache is updated.") - else: - logger.info("Cache already upto date.") + self.sample_wt = self._get_sample_weight(self.sample_weight) - @validate_arguments - def save_cache(self, path: Optional[str] = None, trim_excess: bool = False): - """Saves queries cache to a pickle file. It is good idea to run this after making queries so that on the next - session these queries won't have to be run on Athena and can be directly loaded from the file. + @property + def md_key_cols(self) -> list[sa.Column]: + return [self.bs_table.c[k] for k in self.md_key] - Args: - path (str, optional): The path to the pickle file. If not provided, the file will be saved on the current - directory. - trim_excess (bool, optional): If true, any queries in the cache that is not run in current session will be - removed before saving it to file. This is useful if the cache has accumulated a bunch of stray queries over - several sessions that are no longer used. Defaults to False. 
+ @property + def ts_key_cols(self) -> list[sa.Column]: + if self.ts_table is None: + raise ValueError("No timeseries table is available.") + return [self.ts_table.c[k] for k in self.ts_key] + + @staticmethod + def _unique_columns_by_name(columns: Sequence[DBColType]) -> list[DBColType]: + unique_columns: list[DBColType] = [] + seen_names = set() + for column in columns: + if column.name in seen_names: + continue + seen_names.add(column.name) + unique_columns.append(column) + return unique_columns + + def _get_unique_keys( + self, kind: Literal["metadata", "timeseries", "metadata_state_agg"] + ) -> list[str]: + # When routing is active (`_routing_context` swapped self.md_table + # to the alt), `kind="metadata"` should return the alt-table's + # narrower key. Detect by table identity rather than a separate + # flag — this keeps the routing visibility consistent with how + # other helpers detect it (via `self.bs_table` / `self.md_table`). + if ( + kind == "metadata" + and getattr(self, "md_table_state_agg", None) is not None + and self.md_table is self.md_table_state_agg + ): + kind = "metadata_state_agg" + configured_keys = getattr(self.db_schema.unique_keys, kind, None) + return configured_keys or [self.building_id_column_name] + + @contextlib.contextmanager + def _routing_context(self, md_choice: Literal["primary", "state_agg"]): + """Temporarily swap `self.bs_table` / `self.md_table` / `self.md_key` + / `self.sample_wt` / `self.md_bldgid_column` to the routed alternates + for the duration of one `_query` call. Restores originals on exit + (including on exception). + + This sweeps the routing decision across every helper that reads + these attributes — e.g. `_get_column`, `_get_weight`, + `_get_enduse_cols`, `_md_baseline_filter` — without threading + `bs_table` arguments through dozens of call sites. + + **Caveat:** not thread-safe within a single BSQ instance. 
Concurrent + `_query` calls on the same instance would race; the framework is + single-threaded per BSQ today (the ThreadPoolExecutor in + `_download_results_csv` doesn't run queries concurrently on the + same instance). If multi-threaded use ever materializes, swap to + a contextvars-based override instead. """ - cached_queries = set(self._query_cache) - if self.last_saved_queries == cached_queries: - logger.info("No new queries to save.") + if md_choice == "primary" or self.bs_table_state_agg is None: + yield return + # Save originals. + prev_md_table = self.md_table + prev_bs_table = self.bs_table + prev_md_key = self.md_key + prev_sample_wt = self.sample_wt + prev_md_bldgid = self.md_bldgid_column + # Swap to alt. + self.md_table = self.md_table_state_agg + self.bs_table = self.bs_table_state_agg + self.md_key = self.md_state_agg_key + # Re-derive bound expressions on the alt alias. + self.sample_wt = self._get_sample_weight(self.sample_weight) + self.md_bldgid_column = self.bs_table.c[self.building_id_column_name] + try: + yield + finally: + self.md_table = prev_md_table + self.bs_table = prev_bs_table + self.md_key = prev_md_key + self.sample_wt = prev_sample_wt + self.md_bldgid_column = prev_md_bldgid + + def _join_condition( + self, + left_table: AnyTableType, + right_table: AnyTableType, + kind: Literal["metadata", "timeseries"], + extra_keys: Sequence[str] = (), + ) -> sa.ColumnElement: + keys = list(dict.fromkeys([*self._get_unique_keys(kind), *extra_keys])) + return sa.and_(*(left_table.c[key] == right_table.c[key] for key in keys)) + + def _baseline_timeseries_join_condition( + self, + baseline_table: AnyTableType, + timeseries_table: AnyTableType, + ) -> sa.ColumnElement: + """JOIN ON for md-baseline-alias ⋈ ts. 
Bakes in `baseline.upgrade=0`.""" + return sa.and_( + self._join_condition(baseline_table, timeseries_table, "timeseries"), + self._upgrade_zero_filter(baseline_table), + ) - pickle_path = pathlib.Path(path) if path else self._get_cache_file_path() - if trim_excess: - if excess_queries := [key for key in self._query_cache if key not in self._session_queries]: - for query in excess_queries: - del self._query_cache[query] - logger.info(f"{len(excess_queries)} excess queries removed from cache.") - self.last_saved_queries = cached_queries - save_pickle(pickle_path, self._query_cache) - logger.info(f"{len(self._query_cache)} queries cache saved to {pickle_path}") + def _baseline_upgrade_join_condition( + self, + baseline_table: AnyTableType, + upgrade_table: AnyTableType, + ) -> sa.ColumnElement: + """JOIN ON for md-baseline-alias ⋈ md-upgrade-alias. Bakes in `baseline.upgrade=0`.""" + return sa.and_( + self._join_condition(baseline_table, upgrade_table, "metadata"), + self._upgrade_zero_filter(baseline_table), + ) - def _initialize_tables(self): - self.bs_table, self.ts_table, self.up_table = self._get_tables(self.table_name) + def _upgrade_zero_filter(self, table: AnyTableType) -> sa.ColumnElement: + """`table.upgrade = 0` — the WHERE/ON predicate that selects baseline rows + on the unified annual_and_metadata table (or any alias of it).""" + upgrade_col = table.c["upgrade"] + return upgrade_col == typed_literal(upgrade_col, "0") + + def _md_baseline_filter(self, table: AnyTableType | None = None) -> sa.ColumnElement: + """`.upgrade = 0` — convenience helper for callers that want + baseline rows on the metadata side. 
`table` defaults to `bs_table` + (the canonical alias used in joined queries); pass `md_table` when + the outer FROM is the raw md_table directly, so SQLAlchemy doesn't + auto-add bs_table to the FROM as a stray comma-join.""" + return self._upgrade_zero_filter(table if table is not None else self.bs_table) + + def _timeseries_pair_join_condition( + self, + left_timeseries_table: AnyTableType, + right_timeseries_table: AnyTableType, + ) -> sa.ColumnElement: + return self._join_condition( + left_timeseries_table, + right_timeseries_table, + "timeseries", + [self.timestamp_column_name], + ) - self.bs_bldgid_column = self.bs_table.c[self.building_id_column_name] - if self.ts_table is not None: - self.timestamp_column = self.ts_table.c[self.timestamp_column_name] - self.ts_bldgid_column = self.ts_table.c[self.building_id_column_name] - if self.up_table is not None: - self.up_bldgid_column = self.up_table.c[self.building_id_column_name] - self.sample_wt = self._get_sample_weight(self.sample_weight) + @staticmethod + def _count_distinct(columns: Sequence[sa.Column]) -> sa.ColumnElement: + if len(columns) == 1: + return safunc.count(safunc.distinct(columns[0])) + return safunc.count(sa.distinct(sa.tuple_(*columns))) + + @staticmethod + def _scalar_or_tuple(row: Sequence): + return row[0] if len(row) == 1 else tuple(row) def _get_sample_weight(self, sample_weight): if not sample_weight: @@ -270,24 +391,38 @@ def _get_column( return sa.literal(column_name).label(self._simple_label(column_name.name)) if not candidate_tables: + # bs_table and up_table are aliases over the same md_table — searching + # bs_table (alias of md_table) holds annual + characteristics + # columns; ts_table holds timeseries values. Annual-only column + # resolution looks at bs alone; otherwise both. Using bs_table + # rather than md_table makes the resolved column references bind + # to the alias that's in the FROM of any aggregate query. 
if annual_only: - candidate_tables = (self.bs_table, self.up_table) + candidate_tables = (self.bs_table,) else: - candidate_tables = (self.bs_table, self.up_table, self.ts_table) + candidate_tables = (self.bs_table, self.ts_table) search_tables = [self._get_table(table) for table in candidate_tables if table is not None] - valid_tables = [] - for tbl in search_tables: - if column_name in tbl.columns: - valid_tables.append(tbl) - if not valid_tables: - raise ValueError(f"Column {column_name} not found in any tables {[t.name for t in search_tables]}") - if len(valid_tables) > 1: - logger.warning( - f"Column {column_name} found in multiple tables {[t.name for t in valid_tables]}. " - f"Using {valid_tables[0].name}" - ) - return valid_tables[0].c[column_name] + char_prefix = self.db_schema.column_prefix.characteristics + names_to_try = [column_name] + if column_name.startswith(char_prefix): + names_to_try.append(column_name.removeprefix(char_prefix)) + else: + names_to_try.append(f"{char_prefix}{column_name}") + + for attempt_name in names_to_try: + valid_tables = [tbl for tbl in search_tables if attempt_name in tbl.columns] + if valid_tables: + if len(valid_tables) > 1: + logger.warning( + f"Column {attempt_name} found in multiple tables {[t.name for t in valid_tables]}. 
" + f"Using {valid_tables[0].name}" + ) + return valid_tables[0].c[attempt_name] + raise ValueError( + f"Column {column_name} not found in any tables {[t.name for t in search_tables]} " + f"(also tried {names_to_try[1]!r})" + ) def _get_subquery_table( self, source_table: DBTableType, where_clause: sa.ColumnElement, alias_name: str @@ -295,34 +430,51 @@ def _get_subquery_table( raw_subquery = sa.select("*").select_from(source_table).where(where_clause) return sa.text(self._compile(raw_subquery)).columns(*source_table.c).subquery(alias_name) - def _get_tables(self, table_name: Union[str, tuple[str, Optional[str], Optional[str]]]): + def _get_tables(self, table_name: Union[str, tuple[str, Optional[str]]]): + """Resolve the underlying physical tables for this run. + + Always returns `(md_table, ts_table)`. When the schema declares + `table_suffix.annual_and_metadata_state_agg`, the alt metadata + table is also loaded and stored at `self.md_table_state_agg` + for the routing-aware path; otherwise that attribute is None. + + For tuple `table_name`, the entries are + `(annual_and_metadata, timeseries)`. The alt table can't be + named via the tuple form; only the suffix path supports it. 
+ """ self._engine = self._create_athena_engine( region_name=self.region_name, database=self.db_name, workgroup=self.workgroup ) + + suffix = self.db_schema.table_suffix + if isinstance(table_name, str): - baseline_table_name = f"{table_name}{self.db_schema.table_suffix.baseline}" - ts_table_name = f"{table_name}{self.db_schema.table_suffix.timeseries}" - upgrade_table_name = f"{table_name}{self.db_schema.table_suffix.upgrades}" + md_table_name = f"{table_name}{suffix.annual_and_metadata}" + ts_table_name = f"{table_name}{suffix.timeseries}" + md_state_agg_table_name = ( + f"{table_name}{suffix.annual_and_metadata_state_agg}" + if suffix.annual_and_metadata_state_agg else None + ) else: - baseline_table_name = table_name[0] - ts_table_name = table_name[1] - upgrade_table_name = table_name[2] + md_table_name = table_name[0] + ts_table_name = table_name[1] if len(table_name) > 1 else None + md_state_agg_table_name = None - baseline_table = self._get_table(baseline_table_name) + md_table = self._get_table(md_table_name) ts_table = self._get_table(ts_table_name, missing_ok=True) if ts_table_name else None - if baseline_table_name == upgrade_table_name: - upgrade_col = sa.cast(baseline_table.c["upgrade"], sa.String) - upgrade_table = self._get_subquery_table(baseline_table, upgrade_col != "0", "upgrade") - baseline_table = self._get_subquery_table(baseline_table, upgrade_col == "0", "baseline") - else: - upgrade_table = self._get_table(upgrade_table_name, missing_ok=True) if upgrade_table_name else None - return baseline_table, ts_table, upgrade_table + # Stash the alt table on the instance — _initialize_tables will + # turn it into a stable alias. Done here so the network round- + # trip is in the same place as the other autoload calls. 
+ self._md_table_state_agg_raw = ( + self._get_table(md_state_agg_table_name) if md_state_agg_table_name else None + ) + + return md_table, ts_table def _initialize_book_keeping(self, execution_history): self._execution_history_file = execution_history or self.cache_folder / ".execution_history" self.execution_cost = {"GB": 0, "Dollars": 0} # Tracks the cost of current session. Only used for Athena query self.seen_execution_ids = set() # set to prevent double counting same execution id - self.last_saved_queries = set() if os.path.exists(self._execution_history_file): with open(self._execution_history_file) as f: existing_entries = f.readlines() @@ -479,26 +631,138 @@ def _save_execution_id(self, execution_id): with open(self._execution_history_file, "a") as f: f.write(f"{time.time()},{execution_id}\n") - def _log_execution_cost(self, execution_id: ExeId): + def _log_execution_cost(self, execution_id: ExeId, sql: Optional[str] = None): + """Pull GetQueryExecution metadata, log session cost, and (when `sql` + is supplied) write the full Athena response as a `.json` sidecar + in the cache directory. Future runs read that sidecar to compare cost + across runs without re-querying Athena. + """ if execution_id == "CACHED": # Can't log cost for cached query return res = self._aws_athena.get_query_execution(QueryExecutionId=execution_id) - scanned_GB = res["QueryExecution"]["Statistics"]["DataScannedInBytes"] / 1e9 + qe = res["QueryExecution"] + stats = qe["Statistics"] + scanned_GB = stats["DataScannedInBytes"] / 1e9 cost = scanned_GB * 5 / 1e3 # 5$ per TB scanned if execution_id not in self.seen_execution_ids: self.execution_cost["Dollars"] += cost self.execution_cost["GB"] += scanned_GB self.seen_execution_ids.add(execution_id) + # Persist the full QueryExecution dict alongside the query result so + # future analyses can pull whatever Athena reports without re-fetching. + # Keyed by the same hash as the .sql / .parquet sidecars. 
+ if sql is not None: + self._cache.put_metadata(sql, qe) + logger.info( - f"{execution_id} cost {scanned_GB:.1f} GB (${cost:.1f}). Session total:" - f" {self.execution_cost['GB']:.1f} GB (${self.execution_cost['Dollars']:.1f})" + f"{execution_id} cost {scanned_GB:.2f} GB (${cost:.2f}). Session total:" + f" {self.execution_cost['GB']:.2f} GB (${self.execution_cost['Dollars']:.2f})" ) + _UNLOAD_RE = re.compile(r"^\s*UNLOAD\s*\((.*)\)\s*TO\s*'", re.DOTALL | re.IGNORECASE) + + def build_query_metadata_index(self) -> dict[str, dict]: + """Walk this workgroup's Athena history once and return + `{hash_sql(inner_select): full_QueryExecution_dict}`. + + For each historical query that's an UNLOAD wrapping a SELECT, the + index records the EARLIEST non-cache-hit successful execution + (DataScannedInBytes > 0). The full QueryExecution dict is preserved + so callers can pull whatever Athena reports — Statistics, + EngineVersion, ResultReuseInformation, etc. Athena history retains + ~45 days; older snapshots have no entry. + """ + # hash → (submitted_ts, full_qe_dict). On a duplicate hash, keep the + # earliest submitted_ts so we capture the cold-cache cost. 
+ index: dict[str, tuple[float, dict]] = {} + paginator = self._aws_athena.get_paginator("list_query_executions") + for page in paginator.paginate(WorkGroup=self.workgroup): + ids = page.get("QueryExecutionIds", []) + if not ids: + continue + for chunk_start in range(0, len(ids), 50): + chunk = ids[chunk_start:chunk_start + 50] + resp = self._aws_athena.batch_get_query_execution(QueryExecutionIds=chunk) + for qe in resp.get("QueryExecutions", []): + query_text = qe.get("Query", "") + if "bsq_athena_unload_results" not in query_text: + continue + m = self._UNLOAD_RE.match(query_text) + if not m: + continue + status = qe.get("Status", {}) + if status.get("State") != "SUCCEEDED": + continue + stats = qe.get("Statistics", {}) + scanned_bytes = stats.get("DataScannedInBytes") or 0 + if scanned_bytes <= 0: + continue + submitted = status.get("SubmissionDateTime") + if submitted is None: + continue + submitted_ts = submitted.timestamp() + key = hash_sql(m.group(1)) + existing = index.get(key) + if existing is None or submitted_ts < existing[0]: + index[key] = (submitted_ts, qe) + return {k: v[1] for k, v in index.items()} + + def backfill_cache_metadata(self, cache_dir: Optional[pathlib.Path | str] = None) -> tuple[int, int]: + """Walk Athena history once and write `.json` sidecars into the + cache directory for any cached SQL that doesn't have one yet. + + `cache_dir` defaults to this BSQ's cache folder. The lookup matches + a cached SQL by its hash against the inner-SELECT hash of every + historical UNLOAD execution — so it works whether the cached SQL + was originally executed in this session or weeks ago. + + Returns (filled, skipped) — count of metadata files written and + cached entries whose execution wasn't found in history. 
+ """ + cache_root = pathlib.Path(cache_dir) if cache_dir else self._cache.root + # Find cached entries lacking metadata + missing_hashes: list[str] = [] + for parquet in cache_root.glob("*.parquet"): + h = parquet.stem + if not (cache_root / f"{h}.json").exists(): + missing_hashes.append(h) + if not missing_hashes: + return 0, 0 + index = self.build_query_metadata_index() + filled = 0 + skipped = 0 + for h in missing_hashes: + qe = index.get(h) + if qe is None: + skipped += 1 + continue + (cache_root / f"{h}.json").write_text(_json_module.dumps(qe, indent=2, default=str)) + filled += 1 + return filled, skipped + + def get_query_cost_from_history(self, sql: str) -> Optional[dict]: + """Look up Athena execution metadata for a single SQL by walking + history. Returns the full QueryExecution dict (or None if not found). + + Convenience wrapper for one-shot lookups. To backfill many snapshots, + prefer `backfill_cache_metadata()` or `build_query_metadata_index()` + — those walk history once instead of N times. + """ + target_hash = hash_sql(sql) + index = self.build_query_metadata_index() + return index.get(target_hash) + def _compile(self, query) -> str: compiled_query = CustomCompiler(AthenaDialect(), query).process(query, literal_binds=True) - return compiled_query + # Normalize whitespace at compile time so every consumer sees the same + # canonical form: cache filename hash, S3 unload-path hash, snapshot + # `.sql` content, and Athena query history all match for the + # same logical SQL. Without this, cache lookups by literal SQL string + # would miss across whitespace variations, and history-search helpers + # would need to re-normalize on the way in. 
+        return normalize_sql(compiled_query)

     def _get_unload_result(self, execution_id, result_location: str) -> pd.DataFrame:
         t = time.time()
@@ -514,6 +778,10 @@ def _get_unload_result(self, execution_id, result_location: str) -> pd.DataFrame
             try:
                 df = pd.read_parquet(result_location)
             except FileNotFoundError:  # empty result
+                # SUCCEEDED + empty destination = UNLOAD wrote zero files, result is genuinely empty.
+                # Drop an _EMPTY sentinel so future runs can recognize this as a cache hit instead of
+                # re-executing the (possibly expensive) query.
+                self._write_empty_marker(result_location)
                 df = pd.DataFrame()
             return df
         elif stat.upper() in ["FAILED", "CANCELLED"]:
@@ -527,19 +795,36 @@ def _get_unload_result(self, execution_id, result_location: str) -> pd.DataFrame
             time.sleep(1)
         raise TimeoutError("Query failed to complete within 30 mins.")

+    _EMPTY_MARKER_KEY = "_EMPTY"
+
+    def _write_empty_marker(self, result_location: str) -> None:
+        """Write a 0-byte _EMPTY sentinel inside `result_location` to cache a zero-row UNLOAD."""
+        if not result_location.startswith("s3://"):
+            return
+        bucket_name, prefix = result_location.replace("s3://", "").split("/", 1)
+        marker_key = prefix.rstrip("/") + "/" + self._EMPTY_MARKER_KEY
+        try:
+            self._aws_s3.put_object(Bucket=bucket_name, Key=marker_key, Body=b"")
+        except ClientError as e:
+            logger.warning("Could not write _EMPTY marker to %s: %s", result_location, e)
+
     def _get_query_result_location(self, result_path: str) -> Optional[str]:
         """Check if the UNLOAD result already exists in S3.

         Args:
             result_path (str): The S3 path where the UNLOAD result would be stored.

         Returns:
-            Optional[str]: The S3 path to the result if it exists, otherwise None.
+            Optional[str]: The S3 path to the result if it exists, otherwise None. When the
+                cached result is a zero-row UNLOAD, returns a path ending in "/_EMPTY"
+                (caller must recognize this sentinel and return an empty DataFrame without
+                calling read_parquet). 
""" bucket_name, prefix = result_path.replace("s3://", "").split("/", 1) normalized_prefix = prefix.rstrip("/") + "/" try: paginator = self._aws_s3.get_paginator("list_objects_v2") folders: dict[str, datetime.datetime] = {} + empty_folders: set[str] = set() for page in paginator.paginate(Bucket=bucket_name, Prefix=normalized_prefix): for obj in page.get("Contents", []): key = obj.get("Key", "") @@ -549,26 +834,32 @@ def _get_query_result_location(self, result_path: str) -> Optional[str]: if not remainder or "/" not in remainder: continue - folder = remainder.split("/", 1)[0] + folder, _, basename = remainder.partition("/") + if not folder: + continue + if basename == self._EMPTY_MARKER_KEY: + empty_folders.add(folder) + continue last_modified = obj.get("LastModified") - if not folder or last_modified is None: + if last_modified is None: continue - current = folders.get(folder) if current is None or last_modified > current: folders[folder] = last_modified - if not folders: - return None - - chosen_folder = max(folders.items(), key=lambda item: (item[1], item[0]))[0] - if len(folders) > 1: - logger.warning( - "Multiple cached UNLOAD result folders found for prefix %s; using newest folder %s.", - normalized_prefix, - chosen_folder, - ) - return f"s3://{bucket_name}/{normalized_prefix}{chosen_folder}/" + if folders: + chosen_folder = max(folders.items(), key=lambda item: (item[1], item[0]))[0] + if len(folders) > 1: + logger.warning( + "Multiple cached UNLOAD result folders found for prefix %s; using newest folder %s.", + normalized_prefix, + chosen_folder, + ) + return f"s3://{bucket_name}/{normalized_prefix}{chosen_folder}/" + if empty_folders: + chosen_folder = sorted(empty_folders)[0] + return f"s3://{bucket_name}/{normalized_prefix}{chosen_folder}/{self._EMPTY_MARKER_KEY}" + return None except ClientError as e: logger.error(f"Error accessing S3: {e}") return None @@ -602,20 +893,29 @@ def execute( if not isinstance(query, str): query = self._compile(query) - 
self._session_queries.add(query) - if query in self._query_cache: + cached = self._cache.get(query) + if cached is not None: if run_async: - return "CACHED", CachedFutureDf(self._query_cache[query].copy()) - return self._query_cache[query].copy() - - query_hash = hashlib.sha256(query.encode()).hexdigest() + return "CACHED", CachedFutureDf(cached) + return cached + + # `query` here is already whitespace-normalized (see `_compile`), so + # `hash_sql` and `sha256(query.encode())` are equivalent. The S3 unload + # path embeds this hash, which is the same as the snapshot cache + # `.sql` filename — letting the cost-history helper find a query's + # past Athena execution by substring-searching history for `//`. + query_hash = hash_sql(query) result_path = f"s3://{self.run_params.query_unload_s3_bucket}/bsq_athena_unload_results/{query_hash}" # check if result already exists in s3 if (result_location := self._get_query_result_location(result_path)): - self._query_cache[query] = pd.read_parquet(result_location) + if result_location.endswith("/" + self._EMPTY_MARKER_KEY): + df = pd.DataFrame() + else: + df = pd.read_parquet(result_location) + self._cache.put(query, df) if run_async: - return "CACHED", CachedFutureDf(self._query_cache[query].copy()) - return self._query_cache[query].copy() + return "CACHED", CachedFutureDf(df.copy()) + return df.copy() else: result_location = f"{result_path}/{uuid.uuid4()}/" # unique path to avoid collision @@ -632,18 +932,23 @@ def execute( exe_id = ExeId(exe_id) def get_df(future): - if query in self._query_cache: - return self._query_cache[query].copy() - self._query_cache[query] = self._get_unload_result(exe_id, result_location) - return self._query_cache[query].copy() + cached_inner = self._cache.get(query) + if cached_inner is not None: + return cached_inner + df_inner = self._get_unload_result(exe_id, result_location) + self._cache.put(query, df_inner) + self._log_execution_cost(exe_id, sql=query) + return df_inner.copy() if 
run_async: result_future.as_df = types.MethodType(get_df, result_future) self._save_execution_id(exe_id) return exe_id, AthenaFutureDf(result_future) - self._query_cache[query] = self._get_unload_result(exe_id, result_location) - return self._query_cache[query].copy() + df = self._get_unload_result(exe_id, result_location) + self._cache.put(query, df) + self._log_execution_cost(exe_id, sql=query) + return df.copy() def print_all_batch_query_status(self) -> None: """Prints the status of all batch queries.""" @@ -681,6 +986,8 @@ def get_failed_queries(self, batch_id: int) -> tuple[Sequence[ExeId], Sequence[s failed_queries: list[str] = [] if stats: for i, exe_id in enumerate(stats["submitted_execution_ids"]): + if exe_id == "CACHED": + continue completion_stat = self.get_query_status(exe_id) if completion_stat in ["FAILED", "CANCELLED"]: failed_query_ids.append(exe_id) @@ -713,6 +1020,8 @@ def get_ids_for_failed_queries(self, batch_id: int) -> Sequence[str]: """ failed_ids = [] for i, exe_id in enumerate(self._batch_query_status_map[batch_id]["submitted_execution_ids"]): + if exe_id == "CACHED": + continue completion_stat = self.get_query_status(exe_id) if completion_stat in ["FAILED", "CANCELLED"]: failed_ids.append(exe_id) @@ -855,6 +1164,7 @@ def get_batch_query_result(self, batch_id: int, *, combine: bool = True, no_bloc if len(query_exe_ids) == 0: raise ValueError("No query was submitted successfully") + submitted_queries = self._batch_query_status_map[batch_id]["submitted_queries"] res_df_array: list[pd.DataFrame] = [] for index, exe_id in enumerate(query_exe_ids): df = query_futures[index].as_pandas() @@ -862,7 +1172,7 @@ def get_batch_query_result(self, batch_id: int, *, combine: bool = True, no_bloc if len(df) > 0: df["query_id"] = index logger.info(f"Got result from Query [{index}] ({exe_id})") - self._log_execution_cost(exe_id) + self._log_execution_cost(exe_id, sql=submitted_queries[index]) res_df_array.append(df) if not combine: return res_df_array @@ 
-1100,8 +1410,8 @@ def get_cols(self, table: AnyTableType, fuel_type=None) -> Sequence[DBColType]: cols = [c for c in cols if c.name not in [self.ts_bldgid_column.name, self.timestamp_column.name]] cols = [c for c in cols if fuel_type in c.name] return cols - elif table in ["baseline", "bs"]: - cols = [c for c in self.bs_table.columns] + elif table in ["baseline", "bs", "metadata", "md"]: + cols = [c for c in self.md_table.columns] if fuel_type: cols = [c for c in cols if "simulation_output_report" in c.name] cols = [c for c in cols if fuel_type in c.name] @@ -1119,64 +1429,235 @@ def _simple_label(self, label: str, agg_func: Optional[str] = None): return label @staticmethod - def _normalize_restrict_subquery(criteria): + def _normalize_restrict_subquery(criteria, expected_width: int = 1): if isinstance(criteria, SelectBase): - if len(criteria.selected_columns) != 1: - raise ValueError("Subquery restrictions must select exactly one column.") + if len(criteria.selected_columns) != expected_width: + raise ValueError( + f"Subquery restrictions must select exactly {expected_width} column(s)." + ) return criteria if isinstance(criteria, Subquery): - if len(criteria.c) != 1: - raise ValueError("Subquery restrictions must select exactly one column.") - return sa.select(next(iter(criteria.c))) + if len(criteria.c) != expected_width: + raise ValueError( + f"Subquery restrictions must select exactly {expected_width} column(s)." + ) + return sa.select(*criteria.c) return None - def _get_restrict_clauses(self, restrict, annual_only=False): + @staticmethod + def _is_column_tuple(col_ref) -> bool: + if not isinstance(col_ref, tuple) or len(col_ref) == 0: + return False + return all(isinstance(c, (sa.Column, SALabel)) for c in col_ref) + + def _multi_column_membership(self, col_ref, criteria): + """Build a (cols) IN (...) expression. 
`criteria` may be a subquery or a sequence of row-tuples.""" + subquery = self._normalize_restrict_subquery(criteria, expected_width=len(col_ref)) + if subquery is not None: + return sa.tuple_(*col_ref).in_(subquery) + + if isinstance(criteria, Sequence) and not isinstance(criteria, str): + if not criteria: + raise ValueError("Multi-column membership criteria cannot be empty.") + for row in criteria: + if not isinstance(row, tuple) or len(row) != len(col_ref): + raise ValueError( + f"Each row in multi-column criteria must be a tuple of length {len(col_ref)}." + ) + return sa.tuple_(*col_ref).in_([sa.tuple_(*row) for row in criteria]) + + raise ValueError( + "Multi-column restrict keys must be paired with a subquery or a sequence of row-tuples." + ) + + def _get_restrict_clauses(self, restrict, annual_only=False, *, bs_table=None): + # Pre-compute single-column equality/IN predicates that target bs_table + # columns. These are pushed into any subquery-valued restrict entry + # whose projection includes the same column — Athena does not propagate + # the outer WHERE into IN-subqueries automatically, so a `state='CO'` + # filter on the outer query was leaving the inner side unconstrained + # (forcing an enumeration of all 3,133 (state,county) partitions on + # ComStock metadata). See INVESTIGATION_partition_overhead.md for the + # 11.6× speedup measurement on shape C. + # + # `bs_table` lets routing-aware callers (`_query` after Piece A) bind + # column references to the alt metadata table. Defaults to None, in + # which case `_get_column` uses `self.bs_table` (today's behavior). + # When set, restrict to bs_table only (no TS fallback) — otherwise + # `state` would resolve against the TS table partition column instead + # of the alt-md `in.state`, defeating routing. 
+ candidate_tables = (bs_table,) if bs_table is not None else None + propagatable = self._collect_propagatable_predicates(restrict, bs_table=bs_table) + clauses = [] - for col_str, criteria in restrict: - col = self._get_column(col_str, annual_only=annual_only) + for col_ref, criteria in restrict: + if self._is_column_tuple(col_ref): + # Tuple LHS — the RHS may be a Select carrying an applied-buildings + # subquery. Inject propagatable predicates into that subquery's + # WHERE before wrapping it in `tuple_(...).in_(subquery)`. + col_names = {c.name for c in col_ref if isinstance(c, sa.Column)} + if isinstance(criteria, SelectBase): + criteria = self._inject_propagated(criteria, propagatable, col_names, bs_table=bs_table) + elif isinstance(criteria, Subquery): + base = sa.select(*criteria.c) + base = self._inject_propagated(base, propagatable, col_names, bs_table=bs_table) + criteria = base + clauses.append(self._multi_column_membership(col_ref, criteria)) + continue + + col = self._get_column(col_ref, candidate_tables=candidate_tables, annual_only=annual_only) subquery = self._normalize_restrict_subquery(criteria) if subquery is not None: + # Single-column subquery: same propagation idea, scoped to the + # column projected by the subquery (typically just bldg_id). 
+ subquery = self._inject_propagated(subquery, propagatable, {col.name}, bs_table=bs_table) clauses.append(col.in_(subquery)) elif isinstance(criteria, Sequence) and not isinstance(criteria, str): - if len(criteria) > 1: - clauses.append(col.in_(criteria)) - elif len(criteria) == 1: - clauses.append(col == criteria[0]) + typed = [typed_literal(col, v) for v in criteria] + if len(typed) > 1: + clauses.append(col.in_(typed)) + elif len(typed) == 1: + clauses.append(col == typed[0]) else: raise ValueError(f"Invalid criteria {criteria}") else: - clauses.append(col == criteria) + clauses.append(col == typed_literal(col, criteria)) return clauses - def _add_restrict(self, query, restrict, *, annual_only=False): + def _collect_propagatable_predicates(self, restrict, *, bs_table=None): + """Return a list of (column_name, sqla_clause_factory) for restrict + entries that are safe to push into a sibling subquery's WHERE. + + A clause is propagatable iff: + - LHS is a single column (not a tuple). + - That column resolves to a bs_table column (after `in.` prefix + resolution — schemas like ResStock expose state as `in.state` + but accept `state` in user-facing restrict entries). + - RHS is a literal scalar or a sequence of literals (not a subquery). + + The returned `clause_factory` is a callable that takes the target + Column inside the subquery and emits the equivalent equality / IN + clause. We can't reuse the outer-scope clause directly because the + Column object would still bind to the outer FROM in the compiled SQL. + + `bs_table` lets routing-aware callers point resolution at the alt + metadata table; defaults to `self.bs_table`. 
+ """ + target_bs = bs_table if bs_table is not None else self.bs_table + candidate_tables = (target_bs,) if bs_table is not None else None + out: list[tuple[str, typing.Callable]] = [] + if not restrict: + return out + for col_ref, criteria in restrict: + if self._is_column_tuple(col_ref): + continue + if isinstance(criteria, (SelectBase, Subquery)): + continue + # Resolve to a bs_table column. _get_column already handles the + # `in.` prefix logic, so user-supplied "state" maps to + # bs_table.c["in.state"] on ResStock and bs_table.c["state"] on + # ComStock. Restrict to annual_only=True since we're propagating + # into metadata-side subqueries. + try: + resolved_col = self._get_column(col_ref, candidate_tables=candidate_tables, annual_only=True) + except (ValueError, AttributeError): + continue + # Only propagate when the resolved column lives on the bs alias + # (skips MappedColumns, calculated labels, ts-only columns). + if not isinstance(resolved_col, sa.Column): + continue + if resolved_col.table is not target_bs: + continue + name = resolved_col.name + + # Capture criteria by closure (`crit=criteria` to avoid loop + # rebind). + def _factory(col, crit=criteria): + if isinstance(crit, Sequence) and not isinstance(crit, str): + typed = [typed_literal(col, v) for v in crit] + if len(typed) > 1: + return col.in_(typed) + if len(typed) == 1: + return col == typed[0] + return None + return col == typed_literal(col, crit) + + out.append((name, _factory)) + return out + + def _inject_propagated(self, select, propagatable, scope_col_names, *, bs_table=None): + """Inject propagatable predicates into `select`'s WHERE for any + column in `scope_col_names` that's also in `select`'s output columns. + + `select` is a SQLA Select; `scope_col_names` is the set of column + names the outer query is matching against. We only propagate + predicates on those columns — i.e. columns that the outer IN clause + is about to compare against the subquery's projection. 
+ + `bs_table` lets routing-aware callers bind the inner-column reference + to the alt metadata table; defaults to `self.bs_table`. + """ + if not propagatable: + return select + target_bs = bs_table if bs_table is not None else self.bs_table + # Names of columns this Select projects. `select.selected_columns` is + # the SA-1.4+ public accessor. + try: + proj_names = {c.name for c in select.selected_columns} + except AttributeError: + return select + # Only push predicates that (a) target a column the subquery projects, + # and (b) target a column the outer is matching on. + target_names = proj_names & set(scope_col_names) + for name, factory in propagatable: + if name not in target_names: + continue + inner_col = target_bs.c.get(name) + if inner_col is None: + continue + clause = factory(inner_col) + if clause is not None: + select = select.where(clause) + return select + + def _add_restrict(self, query, restrict, *, annual_only=False, bs_table=None): if not restrict: return query - restrict_clauses = self._get_restrict_clauses(restrict, annual_only=annual_only) + restrict_clauses = self._get_restrict_clauses( + restrict, annual_only=annual_only, bs_table=bs_table, + ) query = query.where(*restrict_clauses) return query - def _add_avoid(self, query, avoid, *, annual_only=False): - if not avoid: - return query - where_clauses = [] - for col_str, criteria in avoid: - col = self._get_column(col_str, annual_only=annual_only) + def _get_avoid_clauses(self, avoid, *, annual_only=False): + clauses = [] + for col_ref, criteria in avoid: + if self._is_column_tuple(col_ref): + clauses.append(sa.not_(self._multi_column_membership(col_ref, criteria))) + continue + + col = self._get_column(col_ref, annual_only=annual_only) subquery = self._normalize_restrict_subquery(criteria) if subquery is not None: - where_clauses.append(col.not_in(subquery)) + clauses.append(col.not_in(subquery)) elif isinstance(criteria, Sequence) and not isinstance(criteria, str): if len(criteria) > 1: - 
where_clauses.append(col.not_in(criteria)) + clauses.append(col.not_in(criteria)) elif len(criteria) == 1: - where_clauses.append(col != criteria[0]) + clauses.append(col != criteria[0]) else: raise ValueError(f"Invalid criteria {criteria}") else: - where_clauses.append(col != criteria) - query = query.where(*where_clauses) - return query + clauses.append(col != criteria) + return clauses + + def _add_avoid(self, query, avoid, *, annual_only=False): + if not avoid: + return query + clauses = self._get_avoid_clauses(avoid, annual_only=annual_only) + return query.where(*clauses) def _get_name(self, col): if isinstance(col, tuple): @@ -1187,10 +1668,28 @@ def _get_name(self, col): return col.name raise ValueError(f"Can't get name for {col} of type {type(col)}") - def _add_join(self, query, join_list): + def _add_join(self, query, join_list, bs_alias=None): + # `bs_alias` overrides which "bs side" the join's left key resolves + # against. Defaults to the canonical self.bs_table. TS queries pass + # `bs_per_bldg` (the per-bldg pre-aggregated subquery that replaces + # bs in the outer FROM) so the JOIN ON references resolve to the + # subquery's projected columns rather than the original bs alias + # (which isn't in the outer FROM after the bs_per_bldg refactor). + bs_for_join = bs_alias if bs_alias is not None else self.bs_table for new_table_name, baseline_column_name, new_column_name in join_list: new_tbl = self._get_table(new_table_name) - baseline_column = self._get_column(baseline_column_name, candidate_tables=[self.bs_table]) + # Resolve the bs-side column. baseline_column_name can be a + # string (column name) or an SA Column. For both we look it up + # by name on bs_for_join when possible — this lets the + # bs_per_bldg subquery substitute for the canonical bs alias. 
+ ref_name = ( + baseline_column_name if isinstance(baseline_column_name, str) + else getattr(baseline_column_name, "name", None) + ) + if ref_name and ref_name in bs_for_join.c: + baseline_column = bs_for_join.c[ref_name] + else: + baseline_column = self._get_column(baseline_column_name, candidate_tables=[self.bs_table]) new_column = self._get_column(new_column_name, candidate_tables=[new_tbl]) query = query.join(new_tbl, baseline_column == new_column) return query @@ -1249,13 +1748,13 @@ def _get_agg_func_and_weight(self, weights, agg_func=None): def delete_everything(self): """Deletes the athena tables and data in s3 for the run.""" - info = self._aws_glue.get_table(DatabaseName=self.db_name, Name=self.bs_table.name) + # bs_table/up_table are SA aliases over md_table — ".name" yields "bs"/"up", + # not the real Athena table name. Use md_table directly. + info = self._aws_glue.get_table(DatabaseName=self.db_name, Name=self.md_table.name) self.pth = pathlib.Path(info["Table"]["StorageDescriptor"]["Location"]).parent - tables_to_delete = [self.bs_table.name] + tables_to_delete = [self.md_table.name] if self.ts_table is not None: tables_to_delete.append(self.ts_table.name) - if self.up_table is not None: - tables_to_delete.append(self.up_table.name) print(f"Will delete the following tables {tables_to_delete} and the {self.pth} folder") while True: curtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") diff --git a/buildstock_query/report_query.py b/buildstock_query/report_query.py index d60e7e86..fe483a4c 100644 --- a/buildstock_query/report_query.py +++ b/buildstock_query/report_query.py @@ -9,9 +9,9 @@ from functools import reduce from buildstock_query import main import typing -from typing import Optional, Union, Literal +from typing import Any, Optional, Union, Literal from collections.abc import Hashable, Sequence -from buildstock_query.schema.utilities import AnyColType, validate_arguments +from buildstock_query.schema.utilities import AnyColType, 
typed_literal, validate_arguments from pydantic import Field from typing_extensions import assert_never @@ -29,9 +29,22 @@ def __init__(self, bsq: "main.BuildStockQuery") -> None: def _rename_completed_status_column(self, df: DataFrame) -> DataFrame: df = df.rename(columns={self._bsq.db_schema.column_names.completed_status: "completed_status"}) rev_value_map = {db_val: normal_val for normal_val, db_val in self._bsq.db_schema.completion_values} + # OEDI schemas return booleans (True/False) for `applicability`; classic + # ResStock returns strings ("Success"/"Fail"). Normalize to lowercase + # strings so the rev_value_map (whose keys come from the TOML, always + # lowercase strings) matches both. Without this, OEDI rows produce all- + # NaN completed_status and the downstream pivot collapses with a + # "duplicate index" reshape error. + df["completed_status"] = df["completed_status"].astype(str).str.lower() df["completed_status"] = df["completed_status"].map(rev_value_map) return df + @staticmethod + def _rows_as_keys(df: DataFrame, key: tuple[str, ...]): + if len(key) == 1: + return df[key[0]].to_numpy(dtype="int32").tolist() + return list(df[list(key)].itertuples(index=False, name=None)) + @typing.overload def _get_bs_success_report(self, get_query_only: Literal[False] = False) -> DataFrame: ... @@ -42,7 +55,8 @@ def _get_bs_success_report(self, get_query_only: Literal[True]) -> str: ... def _get_bs_success_report(self, get_query_only: bool) -> Union[DataFrame, str]: ... 
def _get_bs_success_report(self, get_query_only: bool = False): - bs_query = sa.select(*[self._bsq._bs_completed_status_col, safunc.count().label("count")]) + bs_query = sa.select(*[self._bsq._md_completed_status_col, safunc.count().label("count")]) + bs_query = bs_query.where(self._bsq._md_baseline_filter()) bs_query = bs_query.group_by(sa.text("1")) if get_query_only: return self._bsq._compile(bs_query) @@ -66,17 +80,23 @@ def _get_change_report(self, get_query_only: bool = False): Args: get_query_only (bool, optional): _description_. Defaults to False. """ - if self._bsq.up_table is None: - raise ValueError("No upgrade table is available .") - queries: list[str] = [] chng_types = ["no-chng", "bad-chng", "ok-chng", "true-bad-chng", "true-ok-chng", "null", "any"] + bs = self._bsq.bs_table + up = self._bsq.md_table.alias("up") + up_col = up.c["upgrade"] + up_not_baseline = up_col != typed_literal(up_col, "0") for ch_type in chng_types: - up_query = sa.select(*[self._bsq.up_table.c["upgrade"], safunc.count().label("change")]) - up_query = up_query.join(self._bsq.bs_table, self._bsq.bs_bldgid_column == self._bsq.up_bldgid_column) - conditions = self._get_change_conditions(change_type=ch_type) + up_query = sa.select(*[up.c["upgrade"], safunc.count().label("change")]) + up_query = up_query.join(bs, self._bsq._baseline_upgrade_join_condition(bs, up)) + conditions = self._get_change_conditions(change_type=ch_type, bs_alias=bs, up_alias=up) up_query = up_query.where( - sa.and_(self._bsq._bs_successful_condition, self._bsq._up_successful_condition, conditions) + sa.and_( + self._bsq._get_success_condition(bs), + self._bsq._get_success_condition(up), + up_not_baseline, + conditions, + ) ) # type: ignore up_query = up_query.group_by(sa.text("1")) up_query = up_query.order_by(sa.text("1")) @@ -89,11 +109,10 @@ def _get_change_report(self, get_query_only: bool = False): if df.empty: df = pd.DataFrame(columns=["upgrade", "change"]) df.rename(columns={"change": chng_type}, 
inplace=True) - # df['upgrade'] = df['upgrade'].map(int) + df["upgrade"] = df["upgrade"].map(str) df = df.set_index("upgrade").sort_index() change_df = change_df.join(df, how="outer") if len(change_df) > 0 else df - with pd.option_context("future.no_silent_downcasting", True): - change_df = change_df.fillna(0).infer_objects(copy=False) + change_df = change_df.fillna(0).astype(int) for chng_type in chng_types: if chng_type not in change_df.columns: change_df[chng_type] = 0 @@ -117,7 +136,7 @@ def print_change_details( @typing.overload def _get_upgrade_buildings( self, *, upgrade_id: Union[int, str], trim_missing_bs: bool = True, get_query_only: Literal[False] = False - ) -> list[int]: ... + ) -> Union[list[int], list[tuple]]: ... @typing.overload def _get_upgrade_buildings( @@ -127,39 +146,39 @@ def _get_upgrade_buildings( @typing.overload def _get_upgrade_buildings( self, *, upgrade_id: Union[int, str], get_query_only: bool, trim_missing_bs: bool = True - ) -> Union[list[int], str]: ... + ) -> Union[list[int], list[tuple], str]: ... 
def _get_upgrade_buildings( self, *, upgrade_id: Union[int, str], trim_missing_bs: bool = True, get_query_only: bool = False ): - if self._bsq.up_table is None: - raise ValueError("No upgrade table is available .") - up_query = sa.select(*[self._bsq.up_bldgid_column]) + up = self._bsq.md_table.alias("up") + up_key_cols = [up.c[k] for k in self._bsq.md_key] + up_col = up.c["upgrade"] + up_id = typed_literal(up_col, upgrade_id) + up_query = sa.select(*up_key_cols) if trim_missing_bs: - up_query = up_query.join(self._bsq.bs_table, self._bsq.bs_bldgid_column == self._bsq.up_bldgid_column) + bs = self._bsq.bs_table + up_query = up_query.join(bs, self._bsq._baseline_upgrade_join_condition(bs, up)) up_query = up_query.where( sa.and_( - self._bsq._bs_successful_condition, - self._bsq._up_successful_condition, - self._bsq.up_table.c["upgrade"] == str(upgrade_id), + self._bsq._get_success_condition(bs), + self._bsq._get_success_condition(up), + up_col == up_id, ) ) else: up_query = up_query.where( - sa.and_(self._bsq.up_table.c["upgrade"] == str(upgrade_id), self._bsq._up_successful_condition) + sa.and_(up_col == up_id, self._bsq._get_success_condition(up)) ) if get_query_only: return self._bsq._compile(up_query) df = self._bsq.execute(up_query) - return df[self._bsq.bs_bldgid_column.name].to_numpy(dtype="int32").tolist() - - def _get_change_conditions(self, change_type: str): - if self._bsq.up_table is None: - raise ValueError("No upgrade table is available .") + return self._rows_as_keys(df, self._bsq.md_key) + def _get_change_conditions(self, change_type: str, *, bs_alias, up_alias): threshold = 1e-3 fuel_cols = list( - c for c in self._bsq.db_schema.column_names.fuel_totals if c in self._bsq.up_table.columns + c for c in self._bsq.db_schema.column_names.fuel_totals if c in up_alias.c ) # Look at all fuel type totals all_cols = list(fuel_cols) if self._bsq.db_schema.column_names.unmet_hours_cooling_hr: @@ -168,33 +187,33 @@ def _get_change_conditions(self, change_type: 
str): all_cols += [self._bsq.db_schema.column_names.unmet_hours_heating_hr] null_chng_conditions = sa.and_( *[ - sa.or_(self._bsq.up_table.c[col] == sa.null(), self._bsq.bs_table.c[col] == sa.null()) + sa.or_(up_alias.c[col] == sa.null(), bs_alias.c[col] == sa.null()) for col in fuel_cols ] ) no_chng_conditions = sa.and_( *[ - safunc.coalesce(safunc.abs(self._bsq.up_table.c[col] - self._bsq.bs_table.c[col]), 0) < threshold + safunc.coalesce(safunc.abs(up_alias.c[col] - bs_alias.c[col]), 0) < threshold for col in fuel_cols ] ) good_chng_conditions = sa.or_( - *[self._bsq.bs_table.c[col] - self._bsq.up_table.c[col] >= threshold for col in fuel_cols] + *[bs_alias.c[col] - up_alias.c[col] >= threshold for col in fuel_cols] ) opp_chng_conditions = sa.and_( *[ - safunc.coalesce(self._bsq.bs_table.c[col] - self._bsq.up_table.c[col], -1) < threshold + safunc.coalesce(bs_alias.c[col] - up_alias.c[col], -1) < threshold for col in fuel_cols ], sa.not_(no_chng_conditions), ) true_good_chng_conditions = sa.or_( - *[self._bsq.bs_table.c[col] - self._bsq.up_table.c[col] >= threshold for col in all_cols] + *[bs_alias.c[col] - up_alias.c[col] >= threshold for col in all_cols] ) true_opp_chng_conditions = sa.and_( *[ - safunc.coalesce(self._bsq.bs_table.c[col] - self._bsq.up_table.c[col], -1) < threshold + safunc.coalesce(bs_alias.c[col] - up_alias.c[col], -1) < threshold for col in all_cols ], sa.not_(no_chng_conditions), @@ -260,26 +279,31 @@ def get_buildings_by_change( ] = "no-chng", get_query_only: bool = False, ): - if self._bsq.up_table is None: - raise ValueError("No upgrade table is available .") + bs = self._bsq.bs_table + up = self._bsq.md_table.alias("up") + bs_key_cols = [bs.c[k] for k in self._bsq.md_key] + completed_status = self._bsq.db_schema.column_names.completed_status up_query = sa.select( - *[self._bsq.bs_bldgid_column, self._bsq._bs_completed_status_col, self._bsq._up_completed_status_col] + *bs_key_cols, + bs.c[completed_status], + up.c[completed_status], ) - 
up_query = up_query.join(self._bsq.up_table, self._bsq.bs_bldgid_column == self._bsq.up_bldgid_column) + up_query = up_query.join(up, self._bsq._baseline_upgrade_join_condition(bs, up)) - conditions = self._get_change_conditions(change_type) + conditions = self._get_change_conditions(change_type, bs_alias=bs, up_alias=up) + up_col = up.c["upgrade"] up_query = up_query.where( sa.and_( - self._bsq._bs_successful_condition, - self._bsq._up_successful_condition, - self._bsq.up_table.c["upgrade"] == str(upgrade_id), + self._bsq._get_success_condition(bs), + self._bsq._get_success_condition(up), + up_col == typed_literal(up_col, upgrade_id), conditions, ) ) # type: ignore if get_query_only: return self._bsq._compile(up_query) df = self._bsq.execute(up_query) - return df[self._bsq.bs_bldgid_column.name].to_numpy(dtype="int32").tolist() + return self._rows_as_keys(df, self._bsq.md_key) @typing.overload def _get_up_success_report(self, *, get_query_only: Literal[True], trim_missing_bs: bool = True) -> str: ... @@ -305,14 +329,19 @@ def _get_up_success_report(self, *, trim_missing_bs: bool = True, get_query_only Returns: Union[str, pd.DataFrame]: If get_query_only then returns the query string. Otherwise returns the dataframe. """ - if self._bsq.up_table is None: - raise ValueError("No upgrade table is available .") + up = self._bsq.md_table.alias("up") + up_col = up.c["upgrade"] + completed_status = self._bsq.db_schema.column_names.completed_status up_query = sa.select( - *[self._bsq.up_table.c["upgrade"], self._bsq._up_completed_status_col, safunc.count().label("count")] + *[up_col, up.c[completed_status], safunc.count().label("count")] ) + # Exclude baseline rows — they're "the baseline" not "an upgrade + # success", so they don't belong in the per-upgrade report. 
+ up_query = up_query.select_from(up).where(up_col != typed_literal(up_col, "0")) if trim_missing_bs: - up_query = up_query.join(self._bsq.bs_table, self._bsq.bs_bldgid_column == self._bsq.up_bldgid_column) - up_query = up_query.where(self._bsq._bs_successful_condition) + bs = self._bsq.bs_table + up_query = up_query.join(bs, self._bsq._baseline_upgrade_join_condition(bs, up)) + up_query = up_query.where(self._bsq._get_success_condition(bs)) up_query = up_query.group_by(sa.text("1"), sa.text("2")) up_query = up_query.order_by(sa.text("1"), sa.text("2")) @@ -345,22 +374,30 @@ def _get_full_options_report( def _get_full_options_report(self, *, trim_missing_bs: bool, get_query_only: bool) -> Union[pd.DataFrame, str]: ... def _get_full_options_report(self, trim_missing_bs: bool = True, get_query_only: bool = False): - if self._bsq.up_table is None: - raise ValueError("No upgrade table is available .") + up = self._bsq.md_table.alias("up") opt_name_cols = [ c - for c in self._bsq.up_table.columns + for c in up.c if c.name.startswith("upgrade_costs.option_") and c.name.endswith("name") ] + up_key_cols = [up.c[k] for k in self._bsq.md_key] + if len(up_key_cols) == 1: + applied_agg = safunc.array_agg(up_key_cols[0]) + else: + applied_agg = safunc.array_agg(sa.func.row(*up_key_cols)) + up_col = up.c["upgrade"] query = sa.select( - *[self._bsq.up_table.c["upgrade"], + *[up_col, *opt_name_cols, safunc.count().label("success"), - safunc.array_agg(self._bsq.up_bldgid_column)] + applied_agg] ) + # Exclude baseline rows — option names only make sense for actual upgrades. 
+ query = query.select_from(up).where(up_col != typed_literal(up_col, "0")) if trim_missing_bs: - query = query.join(self._bsq.bs_table, self._bsq.bs_bldgid_column == self._bsq.up_bldgid_column) - query = query.where(self._bsq._bs_successful_condition) + bs = self._bsq.bs_table + query = query.join(bs, self._bsq._baseline_upgrade_join_condition(bs, up)) + query = query.where(self._bsq._get_success_condition(bs)) grouping_texts = [sa.text(str(i + 1)) for i in range(1 + len(opt_name_cols))] query = query.group_by(*grouping_texts) query = query.order_by(*grouping_texts) @@ -384,9 +421,6 @@ def get_options_report(self, trim_missing_bs: bool = True) -> pd.DataFrame: Returns: pd.DataFrame: The list of options the corresponding set of building ids the option applied to. """ - if self._bsq.up_table is None: - raise ValueError("No upgrade table is available .") - full_report = self._get_full_options_report(trim_missing_bs=trim_missing_bs) option_cols = [c for c in full_report.columns if c.startswith("option")] total_counts: Counter = Counter() @@ -575,9 +609,6 @@ def get_success_report(self, trim_missing_bs: Union[Literal["auto"], bool] = "au baseline_result = self._get_bs_success_report() - if self._bsq.up_table is None: - return baseline_result - if trim_missing_bs == "auto": if "success" in baseline_result: trim = True @@ -633,15 +664,16 @@ def _get_ts_report(self, get_query_only: bool = False): dict.fromkeys(str(upg) for upg in self._bsq.get_available_upgrades() if upg is not None) ) ts_queries: list[str] = [] - ts_upgrade_col = sa.cast(self._bsq.ts_table.c["upgrade"], sa.String) + ts_upgrade_col = self._bsq.ts_table.c["upgrade"] + distinct_ts_keys = self._bsq._count_distinct(self._bsq.ts_key_cols) for upgrade in available_upgrades: query = ( sa.select( sa.literal(upgrade).label("upgrade"), - safunc.count(self._bsq.ts_bldgid_column.distinct()).label("count"), + distinct_ts_keys.label("count"), ) .select_from(self._bsq.ts_table) - .where(ts_upgrade_col == upgrade) + 
.where(ts_upgrade_col == typed_literal(ts_upgrade_col, upgrade)) ) ts_queries.append(self._bsq._compile(query)) @@ -662,36 +694,166 @@ def _get_ts_report(self, get_query_only: bool = False): result_df = result_df.rename(columns={"count": "success"}) return result_df - def check_ts_bs_integrity(self) -> bool: + def _build_metadata_distinct_bldg_count_queries(self): + """Builds the two SA queries used by `_get_metadata_distinct_bldg_count`. + Factored out so snapshot tests can compile them via `_compile` without + executing. See `_get_metadata_distinct_bldg_count` for semantics.""" + bsq = self._bsq + # Bind every column reference to the canonical bs_table alias so the + # WHERE-side filters (which use bs_table) and the SELECT/GROUP BY (which + # need columns from the *same* table that's in the FROM) all share one + # handle. Mixing md_table column refs with bs_table predicates causes + # SA to comma-join md_table and bs_table — a Cartesian that Athena + # rejects with "mismatched input 'GROUP'". + bs = bsq.bs_table + ts_key_cols = [bs.c[c] for c in bsq.ts_key] + distinct_expr = bsq._count_distinct(ts_key_cols) + upgrade_col = bs.c["upgrade"] + + # The unified annual_and_metadata table carries TS-eligible rows for + # both successful and inapplicable buildings (inapplicables_have_ts is + # assumed True for every supported schema). Count both. 
+ status_col = bsq._md_completed_status_col + success_or_inapplicable = sa.or_( + status_col == typed_literal(status_col, bsq.db_schema.completion_values.success), + status_col == typed_literal(status_col, bsq.db_schema.completion_values.inapplicable), + ) + + baseline_query = sa.select( + sa.literal("0").label("upgrade"), distinct_expr.label("count"), + ).select_from(bs).where(sa.and_(bsq._md_baseline_filter(), success_or_inapplicable)) + + up_query = ( + sa.select(upgrade_col.label("upgrade"), distinct_expr.label("count")) + .select_from(bs) + .where(sa.and_( + upgrade_col != typed_literal(upgrade_col, "0"), + success_or_inapplicable, + )) + .group_by(upgrade_col) + ) + return baseline_query, up_query + + def _get_metadata_distinct_bldg_count(self) -> dict[str, int]: + """Returns `{upgrade: distinct_bldg_count}` over ts_key columns. See + `_build_metadata_distinct_bldg_count_queries` for SQL construction + details (factored out for snapshot tests).""" + bsq = self._bsq + baseline_query, up_query = self._build_metadata_distinct_bldg_count_queries() + counts: dict[str, int] = {"0": int(bsq.execute(baseline_query)["count"].iloc[0])} + up_df = bsq.execute(up_query) + for _, row in up_df.iterrows(): + counts[str(row["upgrade"])] = int(row["count"]) + return counts + + def _build_duplicate_rows_query(self, table, key_columns: Sequence[Any], where=None, limit: int = 5): + """Builds the SA query used by `_find_duplicate_rows`. 
Factored out so + snapshot tests can compile without executing.""" + query = ( + sa.select(*key_columns, safunc.count().label("n")) + .select_from(table) + .group_by(*key_columns) + .having(safunc.count() > 1) + .limit(limit) + ) + if where is not None: + query = query.where(where) + return query + + def _find_duplicate_rows(self, table, key_columns: Sequence[Any], where=None, limit: int = 5): + """Return up to `limit` rows of key-tuples that appear more than once in `table`.""" + query = self._build_duplicate_rows_query(table, key_columns, where=where, limit=limit) + return self._bsq.execute(query) + + def check_ts_bs_integrity(self, get_query_only: bool = False): """Checks the integrity between the timeseries and baseline (metadata) tables. + Runs these checks: + 1. Distinct-building parity per upgrade between timeseries and baseline/upgrade tables. + 2. No duplicate rows in the baseline table (grouped by bs_key). + 3. No duplicate rows in the upgrade table (grouped by up_key + upgrade). + 4. Uniform timeseries row count per (upgrade, ts_key) — a dup in the ts table would + show up as a non-uniform count, and this is cheap relative to `count(DISTINCT ...)` + which can exhaust Athena's memory on large ts tables. + + Args: + get_query_only: If True, return the list of compiled SQL strings + this method would execute instead of running them. Used by + snapshot tests to pin the SQL shape — these queries fire on + every `BuildStockQuery(..., skip_reports=False)` init, and have + historically been a source of comma-join bugs (see commit 9cd148d). + Returns: - bool: Whether or not the integrity check passed. + bool: Whether or not the integrity check passed (default mode). + list[str]: Compiled SQL strings (when get_query_only=True), in + execution order: ts-report queries, metadata-distinct counts, + duplicate-row checks, rows-per-building check. 
""" + bsq = self._bsq + + if get_query_only: + queries: list[str] = [] + queries.extend(self._get_ts_report(get_query_only=True)) + baseline_q, up_q = self._build_metadata_distinct_bldg_count_queries() + queries.append(bsq._compile(baseline_q)) + queries.append(bsq._compile(up_q)) + bs = bsq.bs_table + queries.append(bsq._compile(self._build_duplicate_rows_query( + bs, list(bsq.md_key_cols), where=bsq._md_baseline_filter(), + ))) + queries.append(bsq._compile(self._build_duplicate_rows_query( + bs, list(bsq.md_key_cols) + [bs.c["upgrade"]], + ))) + queries.append(bsq._get_rows_per_building(get_query_only=True)) + return queries + logger.info("Checking integrity with ts_tables ...") + check_pass = True + raw_ts_report = self._get_ts_report() - raw_success_report = self.get_success_report(trim_missing_bs=False) - if self._bsq.db_schema.structure.inapplicables_have_ts: - bs_dict = raw_success_report[["inapplicable", "success"]].sum(axis=1).to_dict() - else: - bs_dict = raw_success_report["success"].to_dict() + bs_dict = self._get_metadata_distinct_bldg_count() ts_dict = raw_ts_report.to_dict()["success"] - check_pass = True for upgrade, count in ts_dict.items(): if count != bs_dict.get(upgrade, 0): print_r( - f"Upgrade {upgrade} has {count} samples in timeseries table, but {bs_dict.get(upgrade, 0)}" - " samples in baseline/upgrade table." + f"Upgrade {upgrade} has {count} distinct buildings in timeseries table," + f" but {bs_dict.get(upgrade, 0)} in baseline/upgrade table." ) check_pass = False if check_pass: - print_g("Annual and timeseries tables are verified to have the same number of buildings.") + print_g("Baseline/upgrade and timeseries tables have matching distinct building counts.") + + # duplicate checks on the unified annual_and_metadata table. Baseline-side + # check filters to upgrade=0. Upgrade-side check adds the upgrade column + # to the unique key so each upgrade's rows are validated separately. 
+ # Use the bs_table alias for everything: md_key_cols and + # _md_baseline_filter() are both bs-bound, so passing md_table here + # would force a comma-join with bs in the FROM. + bs = bsq.bs_table + small_dup_specs: list[tuple[str, Any, list[Any], Any]] = [ + ("baseline", bs, list(bsq.md_key_cols), bsq._md_baseline_filter()), + ("upgrade", bs, list(bsq.md_key_cols) + [bs.c["upgrade"]], None), + ] + + for label, tbl, key_cols, where in small_dup_specs: + dup_df = self._find_duplicate_rows(tbl, key_cols, where=where) + if len(dup_df) > 0: + key_names = [c.name for c in key_cols] + print_r(f"Duplicate rows detected in {label} table on keys {key_names}:") + print_r(dup_df.to_string(index=False)) + check_pass = False + else: + print_g(f"No duplicate rows in {label} table on {[c.name for c in key_cols]}.") + + # ts-table duplicate detection: if any (upgrade, ts_key) has an off-cadence row count, + # that's a duplicate (or missing-row) signal. Cheaper and more memory-safe than + # count(distinct (keys)) on the full ts table. try: - rowcount = self._bsq._get_rows_per_building() - print_g(f"All buildings are verified to have the same number of ({rowcount}) timeseries rows.") + rowcount = bsq._get_rows_per_building() + print_g(f"All building partitions have the same number of ({rowcount}) timeseries rows.") except ValueError: check_pass = False - print_r("Different buildings have different number of timeseries rows.") + print_r("Different building partitions have different numbers of timeseries rows.") return check_pass @validate_arguments @@ -711,13 +873,17 @@ def get_successful_simulation_count( Returns: Pandas integer counting the number of successful simulation """ - query = sa.select(safunc.count().label("count")) + # Restrict to baseline rows on the unified annual_and_metadata table, + # else the count would multiply by num_upgrades. 
+ query = sa.select(safunc.count().label("count")).where(self._bsq._md_baseline_filter()) restrict = list(restrict) if restrict else [] restrict.insert( 0, (self._bsq.db_schema.column_names.completed_status, [self._bsq.db_schema.completion_values.success]) ) - query = self._bsq._add_restrict(query, restrict, bs_only=True) + # `annual_only=True` restricts column resolution to md_table only, + # skipping the TS table — appropriate for a metadata-only count. + query = self._bsq._add_restrict(query, restrict, annual_only=True) if get_query_only: return self._bsq._compile(query) diff --git a/buildstock_query/savings_query.py b/buildstock_query/savings_query.py deleted file mode 100644 index 1f2bae4d..00000000 --- a/buildstock_query/savings_query.py +++ /dev/null @@ -1,237 +0,0 @@ -from buildstock_query import main -from buildstock_query.schema.query_params import SavingsQuery -from buildstock_query.schema.helpers import gather_params -from buildstock_query.schema.utilities import validate_arguments -import pandas as pd -from sqlalchemy import func as safunc -import sqlalchemy as sa -from typing import Union -from collections.abc import Sequence -from buildstock_query.schema.utilities import AnyColType, RestrictTuple -from pydantic import Field -from typing_extensions import deprecated - - -class BuildStockSavings: - """Class for doing savings query (both timeseries and annual).""" - - _bsq: "main.BuildStockQuery" - - def __init__(self, buildstock_query: "main.BuildStockQuery") -> None: - self._bsq = buildstock_query - - def _validate_partition_by(self, partition_by: Sequence[str]): - if not partition_by: - return [] - [self._bsq._get_gcol(col) for col in partition_by] # making sure all entries are valid - return partition_by - - @validate_arguments - def __get_timeseries_bs_up_table( - self, - enduses: Sequence[AnyColType], - upgrade_id: str, - applied_only: bool, - restrict: Sequence[RestrictTuple] = Field(default_factory=list), - ts_group_by: 
Sequence[Union[AnyColType, tuple[str, str]]] = Field(default_factory=list), - ): - if self._bsq.ts_table is None: - raise ValueError("No timeseries table found in database.") - - ts = self._bsq.ts_table - base = self._bsq.bs_table - sa_ts_cols = [ts.c[self._bsq.building_id_column_name], ts.c[self._bsq.timestamp_column_name], *ts_group_by] - sa_ts_cols.extend(enduses) - ucol = self._bsq._ts_upgrade_col - ts_b = self._bsq._add_restrict(sa.select(*sa_ts_cols), [[ucol, ("0")], *restrict]).alias("ts_b") - ts_u = self._bsq._add_restrict(sa.select(*sa_ts_cols), [[ucol, (upgrade_id)], *restrict]).alias("ts_u") - - if applied_only: - tbljoin = ts_b.join( - ts_u, - sa.and_( - ts_b.c[self._bsq.building_id_column_name] == ts_u.c[self._bsq.building_id_column_name], - ts_b.c[self._bsq.timestamp_column_name] == ts_u.c[self._bsq.timestamp_column_name], - ), - ).join(base, ts_b.c[self._bsq.building_id_column_name] == base.c[self._bsq.building_id_column_name]) - else: - tbljoin = ts_b.outerjoin( - ts_u, - sa.and_( - ts_b.c[self._bsq.building_id_column_name] == ts_u.c[self._bsq.building_id_column_name], - ts_b.c[self._bsq.timestamp_column_name] == ts_u.c[self._bsq.timestamp_column_name], - ), - ).join(base, ts_b.c[self._bsq.building_id_column_name] == base.c[self._bsq.building_id_column_name]) - return ts_b, ts_u, tbljoin - - @validate_arguments - def __get_annual_bs_up_table(self, upgrade_id: str, applied_only: bool): - if self._bsq.up_table is None: - raise ValueError("No upgrades table found in database.") - if applied_only: - tbljoin = self._bsq.bs_table.join( - self._bsq.up_table, - sa.and_( - self._bsq.bs_table.c[self._bsq.building_id_column_name] - == self._bsq.up_table.c[self._bsq.building_id_column_name], - self._bsq._up_upgrade_col == upgrade_id, - self._bsq._up_successful_condition, - ), - ) - else: - tbljoin = self._bsq.bs_table.outerjoin( - self._bsq.up_table, - sa.and_( - self._bsq.bs_table.c[self._bsq.building_id_column_name] - == 
self._bsq.up_table.c[self._bsq.building_id_column_name], - self._bsq._up_upgrade_col == upgrade_id, - self._bsq._up_successful_condition, - ), - ) - - return self._bsq.bs_table, self._bsq.up_table, tbljoin - - @gather_params(SavingsQuery) - @deprecated("Please use my_run.query with include_savings=True.") - def savings_shape( - self, - *, - params: SavingsQuery, - ) -> pd.DataFrame | str: - [self._bsq._get_table(jl[0]) for jl in params.join_list] # ingress all tables in join list - - if params.agg_func != "sum": - raise ValueError("Only 'sum' is supported for savings_shape") - - upgrade_id = self._bsq._validate_upgrade(params.upgrade_id) - self._bsq._validate_timeseries_upgrade_restrict( - params.restrict, - annual_only=params.annual_only, - upgrade_id=upgrade_id, - ) - if params.timestamp_grouping_func and params.timestamp_grouping_func not in ["hour", "day", "month"]: - raise ValueError("timestamp_grouping_func must be one of ['hour', 'day', 'month']") - - bs_restrict = self._bsq._add_applied_in_restrict( - params.restrict, - applied_in=params.applied_in, - annual_only=params.annual_only, - ) - enduse_cols = self._bsq._get_enduse_cols( - params.enduses, table="baseline" if params.annual_only else "timeseries" - ) - partition_by = self._validate_partition_by(params.partition_by) - total_weight = self._bsq._get_weight(params.weights) - group_by_selection = self._bsq._process_groupby_cols(params.group_by, annual_only=params.annual_only) - - if params.annual_only: - ts_b, ts_u, tbljoin = self.__get_annual_bs_up_table(upgrade_id, params.applied_only) - else: - bs_restrict, ts_restrict = self._bsq._split_restrict(bs_restrict) - bs_group_by, ts_group_by = self._bsq._split_group_by(group_by_selection) - ts_b, ts_u, tbljoin = self.__get_timeseries_bs_up_table( - enduse_cols, upgrade_id, params.applied_only, ts_restrict, ts_group_by - ) - ts_group_by = [ts_b.c[c.name] for c in ts_group_by] # Refer to the columns using ts_b table - group_by_selection = bs_group_by + 
ts_group_by - query_cols = [] - for col in enduse_cols: - if params.annual_only: - savings_col = safunc.coalesce(ts_b.c[col.name], 0) - safunc.coalesce( - sa.case((self._bsq._get_success_condition(ts_u), ts_u.c[col.name]), else_=ts_b.c[col.name]), 0 - ) - else: - savings_col = safunc.coalesce(ts_b.c[col.name], 0) - safunc.coalesce( - sa.case( - (ts_u.c[self._bsq.building_id_column_name] == None, ts_b.c[col.name]), # noqa E711 - else_=ts_u.c[col.name], - ), - 0, - ) - query_cols.extend( - [ - sa.func.sum(ts_b.c[col.name] * total_weight).label( - f"{self._bsq._simple_label(col.name)}__baseline" - ), - sa.func.sum(savings_col * total_weight).label(f"{self._bsq._simple_label(col.name)}__savings"), - ] - ) - if params.get_quartiles: - query_cols.extend( - [ - sa.func.approx_percentile(savings_col, [0, 0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98, 1]).label( - f"{self._bsq._simple_label(col.name)}__savings__quartiles" - ) - ] - ) - - query_cols.extend(group_by_selection) - if params.annual_only: # Use annual tables - grouping_metrics_selection = [ - safunc.sum(1).label("sample_count"), - safunc.sum(1 * total_weight).label("units_count"), - ] - elif params.collapse_ts: # Use timeseries tables but collapse timeseries - rows_per_building = self._bsq._get_rows_per_building() - grouping_metrics_selection = [ - (safunc.sum(1) / rows_per_building).label("sample_count"), - safunc.sum(total_weight / rows_per_building).label("units_count"), - ] - elif params.timestamp_grouping_func: - colname = self._bsq.timestamp_column_name - # sa.func.dis - bldg_id_col = ts_b.c[self._bsq.building_id_column_name] - grouping_metrics_selection = [ - safunc.count(sa.func.distinct(bldg_id_col)).label("sample_count"), - ( - safunc.count(sa.func.distinct(bldg_id_col)) - * safunc.sum(total_weight) - / safunc.sum(1) - ).label("units_count"), - (safunc.sum(1) / safunc.count(sa.func.distinct(bldg_id_col))).label("rows_per_sample"), - ] - sim_info = self._bsq._get_simulation_info() - time_col = 
ts_b.c[self._bsq.timestamp_column_name] - if sim_info.offset > 0: - # If timestamps are not period begining we should make them so for timestamp_grouping_func aggregation. - new_col = sa.func.date_trunc( - params.timestamp_grouping_func, sa.func.date_add(sim_info.unit, -sim_info.offset, time_col) - ).label(colname) - else: - new_col = sa.func.date_trunc(params.timestamp_grouping_func, time_col).label(colname) - grouping_metrics_selection.insert(0, new_col) - group_by_selection.append(new_col) - else: - time_col = ts_b.c[self._bsq.timestamp_column_name].label(self._bsq.timestamp_column_name) - grouping_metrics_selection = [ - time_col, - safunc.sum(1).label("sample_count"), - safunc.sum(1 * total_weight).label("units_count"), - ] - group_by_selection.append(time_col) - - query_cols = grouping_metrics_selection + query_cols - query = sa.select(*query_cols).select_from(tbljoin) - query = self._bsq._add_join(query, params.join_list) - query = self._bsq._add_restrict(query, bs_restrict) - if params.annual_only: - query = query.where(self._bsq._bs_successful_condition) - query = self._bsq._add_group_by(query, group_by_selection) - query = self._bsq._add_order_by(query, group_by_selection if params.sort else []) - - compiled_query = self._bsq._compile(query) - if params.unload_to: - if partition_by: - compiled_query = ( - f"UNLOAD ({compiled_query}) \n TO 's3://{params.unload_to}' \n " - f"WITH (format = 'PARQUET', partitioned_by = ARRAY{partition_by})" - ) - else: - compiled_query = ( - f"UNLOAD ({compiled_query}) \n TO 's3://{params.unload_to}' \n WITH (format = 'PARQUET')" - ) - - if params.get_query_only: - return compiled_query - - return self._bsq.execute(compiled_query) diff --git a/buildstock_query/savings_query.pyi b/buildstock_query/savings_query.pyi deleted file mode 100644 index 90bf662e..00000000 --- a/buildstock_query/savings_query.pyi +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -from typing import Sequence, Union -from 
buildstock_query.schema.query_params import SavingsQuery -from buildstock_query.schema.utilities import AnyColType, AnyTableType, RestrictTuple -import buildstock_query.main as main -from typing import Optional -from pydantic import Field -from typing import Literal -import typing -from typing_extensions import deprecated - - -class BuildStockSavings: - """Class for doing savings query (both timeseries and annual). - """ - - def __init__(self, buildstock_query: 'main.BuildStockQuery') -> None: - ... - - def _validate_partition_by(self, partition_by: list[str]): - ... - - def __get_timeseries_bs_up_table(self, - enduses: Sequence[AnyColType], - upgrade_id: Union[int, str], - applied_only: bool, - restrict: Sequence[RestrictTuple] = Field(default_factory=list), - ts_group_by: Sequence[Union[AnyColType, tuple[str, str]] - ] = Field(default_factory=list)): - ... - - def __get_annual_bs_up_table(self, upgrade_id: Union[int, str], applied_only: bool) -> ...: - ... - - @typing.overload - @deprecated("Please use my_run.query with include_savings=True.") - def savings_shape( - self, *, - get_query_only: Literal[True], - upgrade_id: Union[int, str], - enduses: Sequence[AnyColType], - group_by: Sequence[Union[AnyColType, tuple[str, str]]] = Field(default_factory=list), - annual_only: bool = True, - sort: bool = True, - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = Field(default_factory=list), - weights: Sequence[Union[str, tuple]] = Field(default_factory=list), - restrict: Sequence[RestrictTuple] = Field(default_factory=list), - applied_only: bool = False, - applied_in: Optional[Sequence[Union[str, int]]] = None, - get_quartiles: bool = False, - unload_to: str = '', - partition_by: Optional[Sequence[str]] = None, - collapse_ts: bool = False, - timestamp_grouping_func: Optional[str] = None, - ) -> str: - ... 
- - @typing.overload - @deprecated("Please use my_run.query with include_savings=True.") - def savings_shape( - self, *, - upgrade_id: Union[int, str], - get_query_only: Literal[False] = False, - enduses: Sequence[AnyColType], - group_by: Sequence[Union[AnyColType, tuple[str, str]]] = Field(default_factory=list), - annual_only: bool = True, - sort: bool = True, - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = Field(default_factory=list), - weights: Sequence[Union[str, tuple]] = Field(default_factory=list), - restrict: Sequence[RestrictTuple] = Field(default_factory=list), - applied_only: bool = False, - applied_in: Optional[Sequence[Union[str, int]]] = None, - get_quartiles: bool = False, - unload_to: str = '', - partition_by: Optional[Sequence[str]] = None, - collapse_ts: bool = False, - timestamp_grouping_func: Optional[str] = None, - ) -> pd.DataFrame: - ... - - @typing.overload - @deprecated("Please use my_run.query with include_savings=True.") - def savings_shape( - self, *, - get_query_only: bool, - upgrade_id: Union[int, str], - enduses: Sequence[AnyColType], - group_by: Sequence[Union[AnyColType, tuple[str, str]]] = Field(default_factory=list), - annual_only: bool = True, - sort: bool = True, - join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = Field(default_factory=list), - weights: Sequence[Union[str, tuple]] = Field(default_factory=list), - restrict: Sequence[RestrictTuple] = Field(default_factory=list), - applied_only: bool = False, - applied_in: Optional[Sequence[Union[str, int]]] = None, - get_quartiles: bool = False, - unload_to: str = '', - partition_by: Optional[Sequence[str]] = None, - collapse_ts: bool = False, - timestamp_grouping_func: Optional[str] = None, - ) -> Union[str, pd.DataFrame]: - """Calculate savings shape for an upgrade - Args: - upgrade_id: id of the upgrade scenario from the ResStock analysis - enduses: Enduses to query, defaults to ['fuel_use__electricity__total'] - group_by: Building 
characteristics columns to group by, defaults to [] - annual_only: If true, calculates only the annual savings using baseline and upgrades table - sort: Whether the result should be sorted. Sorting takes extra time. - join_list: Additional table to join to baseline table to perform operation. All the inputs (`enduses`, - `group_by` etc) can use columns from these additional tables. It should be specified as a list of - tuples. - Example: `[(new_table_name, baseline_column_name, new_column_name), ...]` - where baseline_column_name and new_column_name are the columns on which the new_table - should be joined to baseline table. - applied_only: Calculate savings shape based on only buildings to which the upgrade applied - applied_in: Optional list of upgrade ids. When set alongside `applied_only=True`, the query is further - restricted to buildings for which all listed upgrades satisfy the run's success/applicability - condition. - weights: The additional columns to use as weight. The "build_existing_model.sample_weight" is already used. - It is specified as either list of string or list of tuples. When only string is used, the string - is the column name, when tuple is passed, the second element is the table name. - - restrict: The list of where condition to restrict the results to. It should be specified as a list of tuple. - Example: `[('state',['VA','AZ']), ("build_existing_model.lighting",['60% CFL']), ...]` - - get_query_only: Skips submitting the query to Athena and just returns the query string. Useful for batch - submitting multiple queries or debugging - get_quartiles: If true, return the following quartiles in addition to the sum for each enduses: - [0, 0.02, .25, .5, .75, .98, 1]. The 0% quartile is the minimum and the 100% quartile - is the maximum. - unload_to: Writes the ouput of the query to this location in s3. 
Consider using run_async = True with this - to unload multiple queries simulataneuosly - partition_by: List of columns to partition when writing to s3. To be used with unload_to. - collapse_ts: Only used when annual_only=False. When collapse_ts=True, the timeseries values are summed into - a single annual value. Useful for quality checking and comparing with annual values. - timestamp_grouping_func: One of 'hour', 'day' or 'month' or None. If provided, perform timeseries - aggregation of specified granularity. - Returns: - if get_query_only is True, returns the query_string, otherwise returns a pandas dataframe - """ - ... - - @typing.overload - @deprecated("Please use my_run.query with include_savings=True.") - def savings_shape(self, *, params: SavingsQuery) -> Union[str, pd.DataFrame]: - ... diff --git a/buildstock_query/schema/__init__.py b/buildstock_query/schema/__init__.py index 3e06229c..e69de29b 100644 --- a/buildstock_query/schema/__init__.py +++ b/buildstock_query/schema/__init__.py @@ -1,6 +0,0 @@ -# from .helpers import gather_params -# from .utilities import MappedColumn, AnyColType, AnyTableType, DBColType, DBTableType -# from .query_params import AnnualQuery, TSQuery, SavingsQuery, UtilityTSQuery - -# __all__ = ['AnnualQuery', 'TSQuery', 'SavingsQuery', 'UtilityTSQuery', 'gather_params', -# 'MappedColumn', 'AnyColType', 'AnyTableType', 'DBColType', 'DBTableType'] diff --git a/buildstock_query/schema/query_params.py b/buildstock_query/schema/query_params.py index e8dbd84c..e69739df 100644 --- a/buildstock_query/schema/query_params.py +++ b/buildstock_query/schema/query_params.py @@ -7,13 +7,6 @@ from typing_extensions import Self -def _normalize_applied_in(applied_in: Optional[Sequence[Union[str, int]]]) -> Optional[list[str]]: - if applied_in is None: - return None - normalized = list(dict.fromkeys(str(upgrade) for upgrade in applied_in)) - return normalized or None - - class BaseQuery(BaseModel): enduses: Sequence[AnyColType] group_by: 
Sequence[Union[AnyColType, tuple[str, str]]] = Field(default_factory=list) @@ -32,26 +25,7 @@ class BaseQuery(BaseModel): class TSQuery(BaseQuery): - split_enduses: bool = False - collapse_ts: bool = False - timestamp_grouping_func: Optional[Literal["month", "day", "hour"]] = None - - -class SavingsQuery(TSQuery): - annual_only: bool = True - applied_only: bool = False - applied_in: Optional[Sequence[Union[str, int]]] = None - unload_to: str = "" - partition_by: Sequence[str] = Field(default_factory=list) - - @model_validator(mode="after") - def validate_consistency(self) -> Self: - self.applied_in = _normalize_applied_in(self.applied_in) - if self.applied_in and not self.applied_only: - raise ValueError("applied_in cannot be set when applied_only is False") - if self.applied_only and self.upgrade_id == "0": - raise ValueError("applied_only cannot be set when upgrade_id is '0'") - return self + timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = None class UtilityTSQuery(TSQuery): @@ -67,12 +41,10 @@ class Query(BaseQuery): timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = None partition_by: Sequence[str] = Field(default_factory=list) applied_only: Optional[bool] = Field(default=None) - applied_in: Optional[Sequence[Union[str, int]]] = None unload_to: Optional[str] = None @model_validator(mode="after") def validate_consistency(self) -> Self: - self.applied_in = _normalize_applied_in(self.applied_in) effective_applied_only = self.upgrade_id != "0" if self.applied_only is None else self.applied_only if self.include_savings and self.upgrade_id == "0": raise ValueError("include_savings cannot be True when upgrade_id is '0'") @@ -82,10 +54,15 @@ def validate_consistency(self) -> Self: raise ValueError("annual_only must be False when timestamp_grouping_func is provided") if effective_applied_only and self.upgrade_id == "0": raise ValueError("applied_only cannot be set when upgrade_id is '0'") - if self.applied_in and 
not effective_applied_only: - raise ValueError("applied_in cannot be set when applied_only is False") if self.get_nonzero_count and not self.annual_only: raise ValueError("get_nonzero_count cannot be True when annual_only is False") + if self.get_quartiles and not self.annual_only: + raise ValueError( + "get_quartiles is not supported on timeseries queries (annual_only=False). " + "Quartiles over per-timestamp rows don't compose meaningfully with a " + "rollup, and quartiles over per-bucket sums are non-obvious; use " + "min/max-style aggregates instead, or run an annual query for quartiles." + ) if self.applied_only is None: self.applied_only = effective_applied_only # False for baseline, True otherwise return self diff --git a/buildstock_query/schema/run_params.py b/buildstock_query/schema/run_params.py index 3981e1a0..1bf1e4b4 100644 --- a/buildstock_query/schema/run_params.py +++ b/buildstock_query/schema/run_params.py @@ -6,7 +6,7 @@ class RunParams(BaseModel): workgroup: str db_name: str - table_name: Union[str, tuple[str, Optional[str], Optional[str]]] + table_name: Union[str, tuple[str, Optional[str]]] buildstock_type: Literal["resstock", "comstock"] = 'resstock' db_schema: Optional[str | dict] = None sample_weight_override: Optional[Union[int, float]] = None diff --git a/buildstock_query/schema/utilities.py b/buildstock_query/schema/utilities.py index b4bfa11f..27cc2bea 100644 --- a/buildstock_query/schema/utilities.py +++ b/buildstock_query/schema/utilities.py @@ -3,18 +3,63 @@ from collections.abc import Sequence from pydantic import ConfigDict, BaseModel, validate_call import sqlalchemy as sa +from sqlalchemy.sql import sqltypes from sqlalchemy.sql.elements import Label, ColumnElement -from sqlalchemy.sql.selectable import SelectBase, Subquery +from sqlalchemy.sql.selectable import SelectBase, Subquery, Alias # from buildstock_query import BuildStockQuery # can't import due to circular import SACol = sa.Column | ColumnElement SALabel = Label DBColType = 
SALabel | SACol -DBTableType = sa.Table | Subquery +# Alias is included so `bs_table` / `up_table` (SA aliases over the unified +# annual_and_metadata table) flow through the same type guards as real Tables. +DBTableType = sa.Table | Subquery | Alias AnyTableType = Union[DBTableType, str] +def typed_literal(col, value): + """Coerce a Python value to match the SQL type of `col`. + + Predicate pushdown on Athena/Trino requires comparing a column against a literal + of matching type — `CAST(col AS VARCHAR) = '1'` defeats stripe pruning on parquet + scans. Coercing the literal instead lets the column reference stay bare so the + parquet reader can use min/max statistics to skip row groups. + + Pass-through for None and SQLAlchemy expressions (subqueries, other columns). For + types we don't recognize, return the value unchanged. + """ + if value is None: + return value + if hasattr(value, "__clause_element__") or isinstance(value, sa.sql.ClauseElement): + return value + col_type = getattr(col, "type", None) + if col_type is None: + return value + if isinstance(col_type, sqltypes.Boolean): + if isinstance(value, bool): + return value + if isinstance(value, str): + lowered = value.strip().lower() + if lowered in {"true", "t", "1", "yes"}: + return True + if lowered in {"false", "f", "0", "no", ""}: + return False + if isinstance(value, (int, float)): + return bool(value) + return value + if isinstance(col_type, (sqltypes.Integer, sqltypes.BigInteger, sqltypes.SmallInteger)): + if isinstance(value, int) and not isinstance(value, bool): + return value + try: + return int(value) + except (TypeError, ValueError): + return value + if isinstance(col_type, sqltypes.String): + return str(value) if not isinstance(value, str) else value + return value + + class MappedColumn(BaseModel): bsq: Any = None # BuildStockQuery name: str @@ -25,6 +70,11 @@ class MappedColumn(BaseModel): AnyColType = DBColType | str | MappedColumn RestrictValue = str | int | bool | Sequence[int | str | 
bool] | SelectBase | Subquery -RestrictTuple = tuple[AnyColType, RestrictValue] +RestrictColTuple = tuple[DBColType, ...] +RestrictRowValue = tuple[str | int | bool, ...] +RestrictTuple = ( + tuple[AnyColType, RestrictValue] + | tuple[RestrictColTuple, SelectBase | Subquery | Sequence[RestrictRowValue]] +) validate_arguments = validate_call(config=ConfigDict(arbitrary_types_allowed=True)) diff --git a/buildstock_query/sql_cache.py b/buildstock_query/sql_cache.py new file mode 100644 index 00000000..ca6f4f8b --- /dev/null +++ b/buildstock_query/sql_cache.py @@ -0,0 +1,207 @@ +"""Content-addressed on-disk cache for SQL query results. + +Each cached query is up to three files in the cache directory: + .sql — the normalized SQL text + .parquet — the query result DataFrame + .json — Athena execution metadata (cost, runtime, + engine version, etc.) — optional; + present when the query came from a real + Athena execution (not a stale-cache hit). + +Lookups, writes, and existence checks all key on the hash. No in-memory state — +every get() reads from disk, every put() writes to disk. Concurrency-safe by +construction: same SQL → same path (idempotent), different SQLs → different +paths (no contention). +""" +from __future__ import annotations + +import hashlib +import json as _json +import re +from pathlib import Path +from typing import Any, Iterator + +import pandas as pd + + +_WHITESPACE_RE = re.compile(r"\s+") + + +def normalize_sql(sql: str) -> str: + """Collapse all whitespace runs to single spaces and strip ends. + + Two SQL strings that differ only in whitespace (indentation, trailing + newlines, line breaks inside expressions) normalize to the same text and + therefore share a cache slot. + """ + return _WHITESPACE_RE.sub(" ", sql).strip() + + +def hash_sql(sql: str) -> str: + """Return sha256 of the normalized SQL as 64 hex chars. + + Single source of truth for cache addressing. Every SqlCache method routes + through this — never recompute the hash inline. 
+ """ + return hashlib.sha256(normalize_sql(sql).encode()).hexdigest() + + +USAGE_LOG_NAME = ".cache_usage_log" + + +class SqlCache: + """Content-addressed on-disk SQL→DataFrame cache.""" + + def __init__(self, root: Path | str) -> None: + self.root = Path(root) + self.root.mkdir(parents=True, exist_ok=True) + # Cache hits and fresh writes append the hash to `.cache_usage_log` + # for `cleanup_stale_caches.py` to read. The log is APPENDED to + # across constructions on purpose: separate pytest invocations + # (snapshot run, then invariants run) need their hashes pooled. + # Use `clear_usage_log()` to reset before a tracking session. + self._usage_log_path = self.root / USAGE_LOG_NAME + # Seed in-memory dedupe set from the existing log so we don't grow + # the file by re-appending every hash on every BSQ construction in + # a long-running process. + self._usage_seen: set[str] = set() + if self._usage_log_path.exists(): + for line in self._usage_log_path.read_text().splitlines(): + line = line.strip() + if len(line) == 64 and all(c in "0123456789abcdef" for c in line): + self._usage_seen.add(line) + else: + # Create an empty log so callers that only inspect the file + # (cleanup tooling, ad-hoc shell `wc -l`) don't have to special- + # case its absence on a fresh cache. + self._usage_log_path.write_text("") + + def clear_usage_log(self) -> None: + """Truncate `.cache_usage_log` and reset the in-memory dedupe set. + + Call this once before a tracking session (the test runs you'll feed + into `cleanup_stale_caches.py`). The cleanup script's `--clear` + flag does this for both schema caches in one shot. 
+ """ + self._usage_log_path.write_text("") + self._usage_seen = set() + + def _parquet_path(self, sql: str) -> Path: + return self.root / f"{hash_sql(sql)}.parquet" + + def _sql_path(self, sql: str) -> Path: + return self.root / f"{hash_sql(sql)}.sql" + + def _meta_path(self, sql: str) -> Path: + return self.root / f"{hash_sql(sql)}.json" + + def _record_usage(self, sql: str) -> None: + """Append `hash_sql(sql)` to the usage log if not already seen. + + Called automatically from get-hits and put-writes. Each line is a + single 64-char hex hash. The set guard avoids repeated identical + appends within one session — the file just records "this hash was + touched at least once". + """ + h = hash_sql(sql) + if h in self._usage_seen: + return + self._usage_seen.add(h) + with open(self._usage_log_path, "a") as f: + f.write(f"{h}\n") + + def used_hashes(self) -> set[str]: + """Return hashes recorded in this session's usage log (in-memory).""" + return set(self._usage_seen) + + def __contains__(self, sql: str) -> bool: + return self._parquet_path(sql).exists() + + def get(self, sql: str) -> pd.DataFrame | None: + """Return the cached DataFrame for `sql`, or None if not cached. + + A torn parquet (writer crashed mid-write) raises from pd.read_parquet; + callers should treat that as a miss and re-run the query. + """ + path = self._parquet_path(sql) + if not path.exists(): + return None + df = pd.read_parquet(path) + self._record_usage(sql) + return df + + def put(self, sql: str, df: pd.DataFrame) -> None: + """Write `df` and the normalized SQL sidecar atomically per file.""" + df.to_parquet(self._parquet_path(sql), index=False) + self._sql_path(sql).write_text(normalize_sql(sql)) + self._record_usage(sql) + + def put_metadata(self, sql: str, metadata: dict[str, Any]) -> None: + """Write Athena execution metadata for `sql` as a JSON sidecar. 
+ + `metadata` is the full GetQueryExecution response (or any subset the + caller chose) — stored as-is so future analyses can pull whatever + Athena reports (DataScannedInBytes, EngineExecutionTimeInMillis, + ResultReuseInformation, EngineVersion, etc.) without needing to + re-fetch from Athena history. + """ + self._meta_path(sql).write_text(_json.dumps(metadata, indent=2, default=str)) + + def get_metadata(self, sql: str) -> dict[str, Any] | None: + """Return the metadata JSON for `sql`, or None if not present.""" + path = self._meta_path(sql) + if not path.exists(): + return None + try: + return _json.loads(path.read_text()) + except _json.JSONDecodeError: + return None + + def delete(self, sql: str) -> None: + self._sql_path(sql).unlink(missing_ok=True) + self._parquet_path(sql).unlink(missing_ok=True) + self._meta_path(sql).unlink(missing_ok=True) + + def rename(self, old_sql: str, new_sql: str) -> None: + """Move an entry from old_sql's hash to new_sql's hash. + + Used by --update-snapshot when the SQL changed cosmetically (sqlglot + considers it equivalent) but the data is unchanged — saves an Athena + re-run by reusing the existing parquet. + """ + old_parquet = self._parquet_path(old_sql) + new_parquet = self._parquet_path(new_sql) + if old_parquet != new_parquet: + old_parquet.rename(new_parquet) + self._sql_path(new_sql).write_text(normalize_sql(new_sql)) + old_sql_path = self._sql_path(old_sql) + if old_sql_path != self._sql_path(new_sql): + old_sql_path.unlink(missing_ok=True) + # Carry the metadata along too if it exists. Cost is associated with + # the data, and the data didn't change — so the cost numbers carry. + old_meta_path = self._meta_path(old_sql) + new_meta_path = self._meta_path(new_sql) + if old_meta_path != new_meta_path and old_meta_path.exists(): + old_meta_path.rename(new_meta_path) + # The new hash is now the live entry — record it so cleanup doesn't + # mistake it for stale just because no test read it back this session. 
+ self._record_usage(new_sql) + + def read_sql(self, sql_or_hash: str) -> str | None: + """Read the stored .sql sidecar text, by either SQL or raw hash. + + Used by --update-snapshot to compare the previously-stored SQL against + a freshly-generated one (sqlglot equivalence check). + """ + if len(sql_or_hash) == 64 and all(c in "0123456789abcdef" for c in sql_or_hash): + path = self.root / f"{sql_or_hash}.sql" + else: + path = self._sql_path(sql_or_hash) + if not path.exists(): + return None + return path.read_text() + + def hashes(self) -> Iterator[str]: + """Yield every cached entry's hash (one per .parquet file present).""" + for parquet in self.root.glob("*.parquet"): + yield parquet.stem diff --git a/buildstock_query/tools/upgrades_visualizer/viz_data.py b/buildstock_query/tools/upgrades_visualizer/viz_data.py index e9d3100e..03301363 100644 --- a/buildstock_query/tools/upgrades_visualizer/viz_data.py +++ b/buildstock_query/tools/upgrades_visualizer/viz_data.py @@ -4,7 +4,6 @@ from buildstock_query.tools.upgrades_visualizer.plot_utils import PlotParams from typing import Union from typing import Literal -import datetime from buildstock_query.schema.utilities import validate_arguments num2month = {1: "January", 2: "February", 3: "March", 4: "April", @@ -101,7 +100,6 @@ def _get_results_csv_clean(self, upgrade: str): res_df = res_df.with_columns(upgrade=pl.lit(upgrade)) res_df = res_df.with_columns(count=pl.lit(1)) res_df = res_df.with_columns(month=pl.lit('All')) - self.run_obj(upgrade).save_cache() return res_df def _get_metadata_df(self): @@ -135,24 +133,13 @@ def init_monthly_results(self, metadata_df): ts_cols = self._get_ts_enduse_cols(upgrade) print(f"Getting monthly results for {upgrade}") run_obj = self.run_obj(upgrade) - monthly_vals_query = run_obj.agg.aggregate_timeseries(get_query_only=True, - enduses=ts_cols, - group_by=[run_obj.bs_bldgid_column], - upgrade_id=upgrade, - timestamp_grouping_func='month', - ) - if monthly_vals_query in 
run_obj._query_cache: - monthly_vals = run_obj._query_cache[monthly_vals_query].copy() - else: - month_year = f"{datetime.datetime.now().strftime('%b%Y')}" - s3_unload_path = f"s3://{run_obj.params.query_unload_s3_bucket}/bsq_athena_unload_results/{month_year}/" - pd_cursor = run_obj._conn.cursor(unload=True, s3_staging_dir=s3_unload_path).execute( - monthly_vals_query, - result_reuse_enable=True, - result_reuse_minutes=60 * 24 * 7) - monthly_vals = pd_cursor.as_pandas() - run_obj._query_cache[monthly_vals_query] = monthly_vals - run_obj.save_cache() + monthly_vals = run_obj.query( + enduses=ts_cols, + group_by=[run_obj.md_bldgid_column], + upgrade_id=upgrade, + annual_only=False, + timestamp_grouping_func='month', + ) monthly_df = pl.from_pandas(monthly_vals, include_index=True) monthly_df = monthly_df.with_columns(pl.col('time').dt.month().alias("month")) monthly_df = monthly_df.with_columns(pl.col('month').replace_strict(num2month).alias("month")) diff --git a/buildstock_query/utility_query.py b/buildstock_query/utility_query.py index bf894412..2fa86117 100644 --- a/buildstock_query/utility_query.py +++ b/buildstock_query/utility_query.py @@ -6,7 +6,7 @@ import sqlalchemy as sa from sqlalchemy.sql import functions as safunc from collections import defaultdict -from buildstock_query.schema.query_params import UtilityTSQuery, TSQuery +from buildstock_query.schema.query_params import UtilityTSQuery, Query from buildstock_query.schema.helpers import gather_params from buildstock_query.schema.utilities import AnyColType, AnyTableType, MappedColumn, validate_arguments from buildstock_query.helpers import read_csv @@ -60,7 +60,6 @@ def __init__( eia_mapping_version: The EIA mapping version to use. 
""" self._bsq = buildstock_query - self._agg = buildstock_query.agg self._group_query_id = 0 self.eia_mapping_year = eia_mapping_year self.eia_mapping_version = eia_mapping_version @@ -77,6 +76,8 @@ def _aggregate_ts_by_map( ): new_table = self._bsq._get_table(map_table_name) new_column = self._bsq._get_column(map_column_name, candidate_tables=[new_table]) + # Bind to bs_table (not md_table directly): the outer aggregate query's + # FROM is the bs alias, so md_table-bound references can't resolve. baseline_column = self._bsq._get_column(baseline_column_name, candidate_tables=[self._bsq.bs_table]) params.group_by = [new_table.c[id_column], *params.group_by] params.weights = [*params.weights, new_table.c["weight"]] @@ -84,11 +85,11 @@ def _aggregate_ts_by_map( logger.info(f"Will submit request for {id_list}") gs = params.query_group_size id_list_batches = [id_list[i: i + gs] for i in range(0, len(id_list), gs)] - results_array = [] + queries = [] for current_ids in id_list_batches: if len(current_ids) == 1: current_ids = current_ids[0] - new_params = TSQuery( + query_params = Query( enduses=params.enduses, group_by=params.group_by, upgrade_id=params.upgrade_id, @@ -96,29 +97,20 @@ def _aggregate_ts_by_map( join_list=params.join_list, weights=params.weights, restrict=[(new_table.c[id_column], current_ids)] + list(params.restrict), - collapse_ts=params.collapse_ts, + annual_only=False, timestamp_grouping_func=params.timestamp_grouping_func, limit=params.limit, - split_enduses=params.split_enduses, get_quartiles=params.get_quartiles, - get_query_only=False if params.split_enduses else True, + get_query_only=True, ) logger.info(f"Submitting query for {current_ids}") - result = self._agg.aggregate_timeseries(params=new_params) - results_array.append(result) + queries.append(self._bsq.query(params=query_params)) if params.get_query_only: - return results_array + return queries - if params.split_enduses: - # In this case, the results_array will contain the result dataframes 
- logger.info("Concatenating the results from all IDs") - all_dfs = pd.concat(results_array) - return all_dfs - else: - # In this case, results_array will contain the queries - batch_query_id = self._bsq.submit_batch_query(results_array) - return self._bsq.get_batch_query_result(batch_id=batch_query_id) + batch_query_id = self._bsq.submit_batch_query(queries) + return self._bsq.get_batch_query_result(batch_id=batch_query_id) def get_eiaid_map(self) -> tuple[str, str, str]: if self.eia_mapping_version == 1: @@ -154,7 +146,6 @@ def aggregate_ts_by_eiaid(self, params: UtilityTSQuery): get_query_only: If set to true, returns the list of queries to run instead of the result. query_group_size: The number of eiaids to be grouped together when running athena queries. This should be used as large as possible that doesn't result in query timeout. - split_endues: Query each enduses separately to spread load on Athena Returns: Pandas dataframe with the aggregated timeseries and the requested enduses grouped by utilities @@ -267,7 +258,10 @@ def get_filtered_results_csv_by_eiaid(self, eiaids: List[str], get_query_only: b Pandas dataframe that is a subset of the results csv, that belongs to provided list of utilities """ eiaid_map_table_name, map_baseline_column, map_eiaid_column = self.get_eiaid_map() - query = sa.select("*").select_from(self._bsq.bs_table) + # Select through bs_table alias (not md_table directly) so column + # references in the WHERE — _md_baseline_filter binds to bs_table by + # default — don't trigger an SA auto-comma-join. 
+ query = sa.select("*").select_from(self._bsq.bs_table).where(self._bsq._md_baseline_filter()) query = self._bsq._add_join(query, [(eiaid_map_table_name, map_baseline_column, map_eiaid_column)]) query = self._bsq._add_restrict(query, [(self._bsq._get_column("eiaid", [eiaid_map_table_name]), eiaids)]) query = query.where(self._bsq._get_column("weight", [eiaid_map_table_name]) > 0) @@ -277,7 +271,11 @@ def get_filtered_results_csv_by_eiaid(self, eiaids: List[str], get_query_only: b return res @validate_arguments - def get_eiaids(self, restrict: Optional[List[Tuple[str, List]]] = None) -> list[str]: + def get_eiaids( + self, + restrict: Optional[List[Tuple[str, List]]] = None, + get_query_only: bool = False, + ) -> Union[list[str], str]: """ Returns the list of eiaids Args: @@ -285,20 +283,27 @@ def get_eiaids(self, restrict: Optional[List[Tuple[str, List]]] = None) -> list[ Example: `[('state',['VA','AZ']), ("build_existing_model.lighting",['60% CFL']), ...]` mapping_version: Version of eiaid mapping to use. After the spatial refactor upgrade, version two should be used + get_query_only: If True, returns the SQL string instead of executing. Bypasses the in-memory cache. Returns: Pandas dataframe consisting of the eiaids belonging to the provided list of locations. """ restrict = list(restrict) if restrict else [] eiaid_map_table_name, map_baseline_column, map_eiaid_column = self.get_eiaid_map() eiaid_col = self._bsq._get_column("eiaid", [eiaid_map_table_name]) + join_list = [(eiaid_map_table_name, map_baseline_column, map_eiaid_column)] + weight_col = ("weight", eiaid_map_table_name) + if get_query_only: + # Skip the cache short-circuit so the SQL is always generated. 
+ return self._bsq.query( + annual_only=True, enduses=[], group_by=[eiaid_col], restrict=restrict, join_list=join_list, + weights=[weight_col], sort=True, get_query_only=True, + ) if "eiaids" in self._cache: if self._bsq.db_name + "/" + eiaid_map_table_name in self._cache["eiaids"]: return self._cache["eiaids"][self._bsq.db_name + "/" + eiaid_map_table_name] else: self._cache["eiaids"] = {} - join_list = [(eiaid_map_table_name, map_baseline_column, map_eiaid_column)] - weight_col = ("weight", eiaid_map_table_name) annual_agg = self._bsq.query( annual_only=True, enduses=[], group_by=[eiaid_col], restrict=restrict, join_list=join_list, weights=[weight_col], sort=True @@ -321,7 +326,9 @@ def get_buildings_by_eiaids(self, eiaids: List[str], get_query_only: bool = Fals """ eiaid_map_table_name, map_baseline_column, map_eiaid_column = self.get_eiaid_map() - query = sa.select(*[self._bsq.bs_bldgid_column.distinct()]) + # md_bldgid_column binds to bs_table — set FROM explicitly so SA doesn't + # auto-derive a different one when joins are added below. + query = sa.select(*[self._bsq.md_bldgid_column.distinct()]).select_from(self._bsq.bs_table) query = self._bsq._add_join(query, [(eiaid_map_table_name, map_baseline_column, map_eiaid_column)]) query = self._bsq._add_restrict(query, [(self._bsq._get_column("eiaid", [eiaid_map_table_name]), eiaids)]) query = query.where(self._bsq._get_column("weight", [eiaid_map_table_name]) > 0) @@ -353,7 +360,10 @@ def get_locations_by_eiaids(self, eiaids: List[str], get_query_only: bool = Fals if get_query_only: return self._bsq._compile(query) res = self._bsq.execute(query) - return list(res[map_eiaid_column].values) + # Athena UNLOAD names the projection positionally (e.g. "value") when + # the SELECT-DISTINCT column carries no alias, so look up by position + # rather than by `map_eiaid_column`. 
+ return list(res.iloc[:, 0].values) def get_rate_map(self, weekend_csv_path: str, weekday_csv_path: str) -> dict[tuple[int, int, int], float]: def read_rate_file(file_path: str) -> pd.DataFrame: @@ -392,8 +402,7 @@ def calculate_tou_bill( join_list: Sequence[tuple[AnyTableType, AnyColType, AnyColType]] = Field(default_factory=list), weights: Sequence[Union[str, tuple]] = Field(default_factory=list), restrict: Sequence[tuple[AnyColType, Union[str, int, Sequence[Union[int, str]]]]] = Field(default_factory=list), - collapse_ts: bool = False, - timestamp_grouping_func: Optional[Literal["month", "day", "hour"]] = "month", + timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = "month", limit: Optional[int] = None, get_query_only: bool = False, ): @@ -425,7 +434,7 @@ def calculate_tou_bill( for col in TOU_enduse: enduses_list.append((TOU_enduse[col] * rate_col / 100).label(f"{col}__dollars")) - ts_query = TSQuery( + return self._bsq.query( enduses=enduses_list, group_by=group_by, upgrade_id=str(upgrade_id), @@ -433,9 +442,8 @@ def calculate_tou_bill( join_list=join_list, weights=weights, restrict=restrict, - collapse_ts=collapse_ts, + annual_only=False, timestamp_grouping_func=timestamp_grouping_func, limit=limit, get_query_only=get_query_only, ) - return self._agg.aggregate_timeseries(params=ts_query) diff --git a/buildstock_query/utility_query.pyi b/buildstock_query/utility_query.pyi index 9952ba5c..c3993188 100644 --- a/buildstock_query/utility_query.pyi +++ b/buildstock_query/utility_query.pyi @@ -40,9 +40,7 @@ class BuildStockUtility: AnyColType]] = Field(default_factory=list), weights: Sequence[Union[str, tuple]] = [], restrict: Sequence[tuple[str, Union[str, int, Sequence[int], Sequence[str]]]] = [], - split_enduses: bool = False, - collapse_ts: bool = False, - timestamp_grouping_func: Optional[Literal["month", "day", "hour"]] = None, + timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = None, query_group_size: int 
= 20, limit: Optional[int] = None, ) -> str: @@ -59,9 +57,7 @@ class BuildStockUtility: AnyColType]] = Field(default_factory=list), weights: Sequence[Union[str, tuple]] = [], restrict: Sequence[tuple[str, Union[str, int, Sequence[int], Sequence[str]]]] = [], - split_enduses: bool = False, - collapse_ts: bool = False, - timestamp_grouping_func: Optional[Literal["month", "day", "hour"]] = None, + timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = None, get_query_only: Literal[False] = False, query_group_size: int = 20, limit: Optional[int] = None, @@ -80,9 +76,7 @@ class BuildStockUtility: AnyColType]] = Field(default_factory=list), weights: Sequence[Union[str, tuple]] = [], restrict: Sequence[tuple[str, Union[str, int, Sequence[int], Sequence[str]]]] = [], - split_enduses: bool = False, - collapse_ts: bool = False, - timestamp_grouping_func: Optional[Literal["month", "day", "hour"]] = None, + timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = None, query_group_size: int = 20, limit: Optional[int] = None, ) -> Union[str, pd.DataFrame]: @@ -100,8 +94,6 @@ class BuildStockUtility: form ['weight_column' or ('weight_column', 'weight_table')] restrict: The list of restrictions to be applied to the query. Each restriction should be specified as a list of form [('column_name', restric_list / restric_value)] - split_endues: If true, query each enduses separately to spread load on Athena. - collapse_ts: If true, collapse the timeseries (i.e. sum them up) into a single row. get_query_only: If set to true, returns the list of queries to run instead of the result. timestamp_grouping_func: The function to be used to group the timeseries. If None, the timeseries are query_group_size: The number of eiaids to be grouped together when running athena queries. 
This should be @@ -271,8 +263,7 @@ class BuildStockUtility: restrict: Sequence[ tuple[AnyColType, Union[str, int, Sequence[Union[int, str]]]]] = Field(default_factory=list), - collapse_ts: bool = False, - timestamp_grouping_func: Optional[Literal["month", "day", "hour"]] = "month", + timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = "month", limit: Optional[int] = None, ) -> str: ... @@ -290,8 +281,7 @@ class BuildStockUtility: restrict: Sequence[ tuple[AnyColType, Union[str, int, Sequence[Union[int, str]]]]] = Field(default_factory=list), - collapse_ts: bool = False, - timestamp_grouping_func: Optional[Literal["month", "day", "hour"]] = "month", + timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = "month", limit: Optional[int] = None, get_query_only: Literal[False] = False ) -> pd.DataFrame: @@ -311,8 +301,7 @@ class BuildStockUtility: restrict: Sequence[ tuple[AnyColType, Union[str, int, Sequence[Union[int, str]]]]] = Field(default_factory=list), - collapse_ts: bool = False, - timestamp_grouping_func: Optional[Literal["month", "day", "hour"]] = "month", + timestamp_grouping_func: Optional[Literal["year", "month", "day", "hour"]] = "month", limit: Optional[int] = None, ) -> Union[str, pd.DataFrame]: """Calculates the dollar cost of electricity for a given time of use rate schedule. Currently, makes use of the @@ -361,11 +350,9 @@ class BuildStockUtility: In the above, my_run is a BuildStockQuery object and ts_bldgid_column is a building_id column of the timeseries table. - collapse_ts (bool, optional): Whether to converge all timestamps to get annual result. Defaults to False. - - timestamp_grouping_func (Literal["month", "day", "hour"]), optional): The function to use to + timestamp_grouping_func (Literal["year", "month", "day", "hour"]), optional): The function to use to group the timestamps. Defaults to "month" to get monthly bill. 
Can be set to None to get the bill - for each timestamp, or to 'hour' to get hourly bill. Use `collapse_ts = True` to get annual bill. + for each timestamp, to 'hour' to get hourly bill, or to 'year' to collapse into an annual bill. limit (Optional[int], optional): To limit result to certain number of rows. Defaults to None. Useful for debugging. diff --git a/example_usage/advanced_usage_oedi.ipynb b/example_usage/advanced_usage_oedi.ipynb index c2c5dcb8..3ffe3c03 100644 --- a/example_usage/advanced_usage_oedi.ipynb +++ b/example_usage/advanced_usage_oedi.ipynb @@ -112,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "afb0bdf2", "metadata": {}, "outputs": [ @@ -147,9 +147,9 @@ "bldg_col = my_run._get_column('in.geometry_building_type_recs')\n", "simple_bldg_col = MappedColumn(bsq=my_run, name='simple_bldg_type', mapping_dict=building_type_map,\n", " key=bldg_col)\n", - "mapped_agg_query = my_run.agg.aggregate_annual(enduses=['electricity.total.energy_consumption'],\n", + "mapped_agg_query = my_run.query(enduses=['electricity.total.energy_consumption'],\n", " group_by=[simple_bldg_col],\n", - " get_query_only=False)\n", + " get_query_only=True)\n", "print(mapped_agg_query)" ] }, @@ -799,7 +799,7 @@ ], "metadata": { "kernelspec": { - "display_name": "buildstock-query-applied_only (3.10.18)", + "display_name": "buildstock-query (3.12.7)", "language": "python", "name": "python3" }, @@ -813,7 +813,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.18" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/example_usage/aggregates_and_savings.ipynb b/example_usage/aggregates_and_savings.ipynb index 24090feb..c36da1d1 100644 --- a/example_usage/aggregates_and_savings.ipynb +++ b/example_usage/aggregates_and_savings.ipynb @@ -3999,7 +3999,7 @@ ], "metadata": { "kernelspec": { - "display_name": "buildstock-query-applied_only (3.10.18)", + "display_name": "buildstock-query 
(3.12.7)", "language": "python", "name": "python3" }, @@ -4013,7 +4013,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.18" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/example_usage/basic_usage_oedi.ipynb b/example_usage/basic_usage_oedi.ipynb index e08598ce..235f0fca 100644 --- a/example_usage/basic_usage_oedi.ipynb +++ b/example_usage/basic_usage_oedi.ipynb @@ -741,7 +741,7 @@ ], "metadata": { "kernelspec": { - "display_name": "buildstock-query-applied_only (3.10.18)", + "display_name": "buildstock-query (3.12.7)", "language": "python", "name": "python3" }, @@ -755,7 +755,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.18" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/example_usage/comstock_oedi_state_and_county.ipynb b/example_usage/comstock_oedi_state_and_county.ipynb new file mode 100644 index 00000000..8440d3bc --- /dev/null +++ b/example_usage/comstock_oedi_state_and_county.ipynb @@ -0,0 +1,2064 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "74b58599", + "metadata": {}, + "outputs": [], + "source": [ + "from buildstock_query import BuildStockQuery\n", + "import pandas as pd\n", + "import polars as pl\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "42087420", + "metadata": {}, + "outputs": [], + "source": [ + "# auto reload\n", + "%load_ext autoreload\n", + "%autoreload 2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cafd4037", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:buildstock_query.query_core:Loading resstock_2024_amy2018_release_2 ...\n", + "INFO:botocore.tokens:Loading cached SSO token for nrel-sso\n", + "INFO:buildstock_query.query_core:14 queries cache read from .bsq_cache/resstock_2024_amy2018_release_2_query_cache.pkl.\n", + "INFO:buildstock_query.query_core:14 queries cache is 
updated.\n", + "INFO:buildstock_query.main:Getting Success counts...\n", + "INFO:botocore.tokens:Loading cached SSO token for nrel-sso\n", + "INFO:botocore.tokens:Loading cached SSO token for nrel-sso\n", + "INFO:buildstock_query.report_query:Checking integrity with ts_tables ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " success inapplicable fail Sum Applied % no-chng bad-chng \\\n", + "upgrade \n", + "0 549718 0 0 549718 0.0 0.0 0.0 \n", + "1 528643 21075 0 549718 96.2 8.0 14690.0 \n", + "10 406046 143672 0 549718 73.9 2.0 5230.0 \n", + "11 528642 21076 0 549718 96.2 0.0 4329.0 \n", + "12 528641 21077 0 549718 96.2 1.0 3484.0 \n", + "13 528641 21077 0 549718 96.2 0.0 3032.0 \n", + "14 268775 280943 0 549718 48.9 0.0 100.0 \n", + "15 406029 143689 0 549718 73.9 0.0 1865.0 \n", + "16 445283 104435 0 549718 81.0 981.0 2258.0 \n", + "2 528642 21076 0 549718 96.2 10.0 12863.0 \n", + "3 528641 21077 0 549718 96.2 4.0 11606.0 \n", + "4 268777 280941 0 549718 48.9 1.0 1735.0 \n", + "5 406018 143700 0 549718 73.9 4.0 6068.0 \n", + "6 528640 21078 0 549718 96.2 3.0 13318.0 \n", + "7 528641 21077 0 549718 96.2 5.0 11906.0 \n", + "8 528641 21077 0 549718 96.2 2.0 10888.0 \n", + "9 268775 280943 0 549718 48.9 0.0 1659.0 \n", + "\n", + " ok-chng true-bad-chng true-ok-chng null any no-chng % \\\n", + "upgrade \n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 513945.0 14690.0 513945.0 0.0 528643.0 0.0 \n", + "10 400814.0 5230.0 400814.0 0.0 406046.0 0.0 \n", + "11 524313.0 4329.0 524313.0 0.0 528642.0 0.0 \n", + "12 525156.0 3484.0 525156.0 0.0 528641.0 0.0 \n", + "13 525609.0 3032.0 525609.0 0.0 528641.0 0.0 \n", + "14 268675.0 100.0 268675.0 0.0 268775.0 0.0 \n", + "15 404164.0 1865.0 404164.0 0.0 406029.0 0.0 \n", + "16 442044.0 2258.0 442044.0 0.0 445283.0 0.2 \n", + "2 515769.0 12863.0 515769.0 0.0 528642.0 0.0 \n", + "3 517031.0 11606.0 517031.0 0.0 528641.0 0.0 \n", + "4 267041.0 1735.0 267041.0 0.0 268777.0 0.0 \n", + "5 399946.0 6068.0 
399946.0 0.0 406018.0 0.0 \n", + "6 515319.0 13318.0 515319.0 0.0 528640.0 0.0 \n", + "7 516730.0 11906.0 516730.0 0.0 528641.0 0.0 \n", + "8 517751.0 10888.0 517751.0 0.0 528641.0 0.0 \n", + "9 267116.0 1659.0 267116.0 0.0 268775.0 0.0 \n", + "\n", + " bad-chng % ok-chng % true-ok-chng % true-bad-chng % \n", + "upgrade \n", + "0 0.0 0.0 0.0 0.0 \n", + "1 2.8 97.2 97.2 2.8 \n", + "10 1.3 98.7 98.7 1.3 \n", + "11 0.8 99.2 99.2 0.8 \n", + "12 0.7 99.3 99.3 0.7 \n", + "13 0.6 99.4 99.4 0.6 \n", + "14 0.0 100.0 100.0 0.0 \n", + "15 0.5 99.5 99.5 0.5 \n", + "16 0.5 99.3 99.3 0.5 \n", + "2 2.4 97.6 97.6 2.4 \n", + "3 2.2 97.8 97.8 2.2 \n", + "4 0.6 99.4 99.4 0.6 \n", + "5 1.5 98.5 98.5 1.5 \n", + "6 2.5 97.5 97.5 2.5 \n", + "7 2.3 97.7 97.7 2.3 \n", + "8 2.1 97.9 97.9 2.1 \n", + "9 0.6 99.4 99.4 0.6 \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:buildstock_query.query_core:{'submitted': 0, 'running': 0, 'pending': 17, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:Submitted queries[0] (27ba7c05-269d-4b18-b88f-ec05cd1d0b93)\n", + "INFO:buildstock_query.query_core:Submitted queries[1] (a8288503-f890-4f19-b792-b12cfedcbe76)\n", + "INFO:buildstock_query.query_core:{'submitted': 2, 'running': 1, 'pending': 16, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:Submitted queries[2] (e0273e91-68c2-4aa0-949c-1764ae3fecb5)\n", + "INFO:buildstock_query.query_core:Submitted queries[3] (bd860812-7da8-448e-a444-4e8978b99ee1)\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 13, 'completed': 0, 'failed': 0}\n", + 
"INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:Submitted queries[4] (3446af6b-467e-4b75-bcce-59a621fd68ee)\n", + "INFO:buildstock_query.query_core:Submitted queries[5] (c14a697b-cfe2-403f-a2cf-36243ff32799)\n", + "INFO:buildstock_query.query_core:{'submitted': 6, 'running': 3, 'pending': 11, 'completed': 0, 'failed': 3}\n", + "INFO:buildstock_query.query_core:Submitted queries[6] (9441b2e8-53c7-4a5c-a5b5-36ed2a76eaac)\n", + "INFO:buildstock_query.query_core:{'submitted': 7, 'running': 4, 'pending': 10, 'completed': 0, 'failed': 3}\n", + "INFO:buildstock_query.query_core:{'submitted': 7, 'running': 4, 'pending': 10, 'completed': 0, 'failed': 3}\n", + "INFO:buildstock_query.query_core:Submitted queries[7] (3a7aea29-9bab-4db0-a04b-ef8131c89eb4)\n", + "INFO:buildstock_query.query_core:{'submitted': 8, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 4}\n", + "INFO:buildstock_query.query_core:Submitted queries[8] (2ef39f14-04db-4c6b-9a6d-f570826048c2)\n", + "INFO:buildstock_query.query_core:{'submitted': 9, 'running': 4, 'pending': 8, 'completed': 0, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 9, 'running': 4, 'pending': 8, 'completed': 0, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 9, 'running': 4, 'pending': 8, 'completed': 0, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 9, 'running': 3, 'pending': 8, 'completed': 0, 'failed': 6}\n", + "INFO:buildstock_query.query_core:Submitted queries[9] (80d43091-7acb-401e-b316-0d9a372f2b4c)\n", + 
"INFO:buildstock_query.query_core:{'submitted': 10, 'running': 4, 'pending': 7, 'completed': 0, 'failed': 6}\n", + "INFO:buildstock_query.query_core:Submitted queries[10] (4a2191c0-8ff2-4738-8702-b6fa3fa23f11)\n", + "INFO:buildstock_query.query_core:{'submitted': 11, 'running': 4, 'pending': 6, 'completed': 0, 'failed': 7}\n", + "INFO:buildstock_query.query_core:{'submitted': 11, 'running': 4, 'pending': 6, 'completed': 0, 'failed': 7}\n", + "INFO:buildstock_query.query_core:{'submitted': 11, 'running': 4, 'pending': 6, 'completed': 0, 'failed': 7}\n", + "INFO:buildstock_query.query_core:{'submitted': 11, 'running': 4, 'pending': 6, 'completed': 0, 'failed': 7}\n", + "INFO:buildstock_query.query_core:Submitted queries[11] (4e421697-c54a-4120-8a29-3a4ca0f4b598)\n", + "INFO:buildstock_query.query_core:{'submitted': 12, 'running': 4, 'pending': 5, 'completed': 0, 'failed': 8}\n", + "INFO:buildstock_query.query_core:{'submitted': 12, 'running': 4, 'pending': 5, 'completed': 0, 'failed': 8}\n", + "INFO:buildstock_query.query_core:{'submitted': 12, 'running': 4, 'pending': 5, 'completed': 0, 'failed': 8}\n", + "INFO:buildstock_query.query_core:{'submitted': 12, 'running': 4, 'pending': 5, 'completed': 0, 'failed': 8}\n", + "INFO:buildstock_query.query_core:Submitted queries[12] (ec6dd177-3dcf-416e-abb3-c06e4ab4873a)\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 4, 'completed': 1, 'failed': 8}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 4, 'completed': 1, 'failed': 8}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 4, 'completed': 1, 'failed': 8}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 4, 'completed': 1, 'failed': 8}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 4, 'completed': 1, 'failed': 8}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 4, 
'completed': 1, 'failed': 8}\n", + "INFO:buildstock_query.query_core:Submitted queries[13] (7467c5eb-f679-4344-a8a3-3a9c0fe7bcf8)\n", + "INFO:buildstock_query.query_core:Submitted queries[14] (4716fe1c-7d1b-4bb4-8e13-f82877f4070e)\n", + "INFO:buildstock_query.query_core:{'submitted': 15, 'running': 4, 'pending': 2, 'completed': 2, 'failed': 9}\n", + "INFO:buildstock_query.query_core:Submitted queries[15] (32e70efd-7b10-4bcb-8dbc-e469738eec67)\n", + "INFO:buildstock_query.query_core:{'submitted': 16, 'running': 4, 'pending': 1, 'completed': 3, 'failed': 9}\n", + "INFO:buildstock_query.query_core:{'submitted': 16, 'running': 4, 'pending': 1, 'completed': 3, 'failed': 9}\n", + "INFO:buildstock_query.query_core:{'submitted': 16, 'running': 4, 'pending': 1, 'completed': 3, 'failed': 9}\n", + "INFO:buildstock_query.query_core:{'submitted': 16, 'running': 4, 'pending': 1, 'completed': 3, 'failed': 9}\n", + "INFO:buildstock_query.query_core:{'submitted': 16, 'running': 4, 'pending': 1, 'completed': 3, 'failed': 9}\n", + "INFO:buildstock_query.query_core:{'submitted': 16, 'running': 4, 'pending': 1, 'completed': 3, 'failed': 9}\n", + "INFO:buildstock_query.query_core:Submitted queries[16] (44793a26-97bb-41e7-9f02-b18e8e9c6772)\n", + "INFO:buildstock_query.query_core:{'submitted': 17, 'running': 4, 'pending': 0, 'completed': 3, 'failed': 10}\n", + "INFO:buildstock_query.query_core:{'submitted': 17, 'running': 4, 'pending': 0, 'completed': 3, 'failed': 10}\n", + "INFO:buildstock_query.query_core:{'submitted': 17, 'running': 3, 'pending': 0, 'completed': 3, 'failed': 11}\n", + "INFO:buildstock_query.query_core:{'submitted': 17, 'running': 3, 'pending': 0, 'completed': 3, 'failed': 11}\n", + "INFO:buildstock_query.query_core:{'submitted': 17, 'running': 1, 'pending': 0, 'completed': 3, 'failed': 13}\n", + "INFO:buildstock_query.query_core:{'submitted': 17, 'running': 1, 'pending': 0, 'completed': 3, 'failed': 13}\n", + "INFO:buildstock_query.query_core:{'submitted': 17, 
'running': 0, 'pending': 0, 'completed': 4, 'failed': 13}\n", + "INFO:buildstock_query.query_core:Batch query completed. \n", + "WARNING:buildstock_query.query_core:13 queries failed. Redoing them\n", + "INFO:buildstock_query.query_core:{'submitted': 0, 'running': 0, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:Submitted queries[0] (f1fa1721-2cdc-4d91-a139-3f274dcfcc3f)\n", + "INFO:buildstock_query.query_core:Submitted queries[1] (b677d3f8-a893-42b5-a2f5-da3545206b90)\n", + "INFO:buildstock_query.query_core:{'submitted': 2, 'running': 0, 'pending': 13, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:Submitted queries[2] (1bd9a95c-3a72-4204-8c64-9be50ab09c6d)\n", + "INFO:buildstock_query.query_core:Submitted queries[3] (1f38c017-6de1-4fa8-b94b-df6ad08f1c06)\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:{'submitted': 4, 'running': 4, 'pending': 9, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:Submitted queries[4] (5143d83d-84b0-4657-bf80-f45d78fbfad8)\n", + "INFO:buildstock_query.query_core:{'submitted': 5, 'running': 3, 'pending': 8, 'completed': 0, 'failed': 2}\n", + 
"INFO:buildstock_query.query_core:Submitted queries[5] (7a6ba12e-ffaa-40e5-93be-5eda124884ec)\n", + "INFO:buildstock_query.query_core:{'submitted': 6, 'running': 4, 'pending': 7, 'completed': 0, 'failed': 2}\n", + "INFO:buildstock_query.query_core:{'submitted': 6, 'running': 4, 'pending': 7, 'completed': 0, 'failed': 2}\n", + "INFO:buildstock_query.query_core:{'submitted': 6, 'running': 4, 'pending': 7, 'completed': 0, 'failed': 2}\n", + "INFO:buildstock_query.query_core:Submitted queries[6] (0b829964-0aba-402e-9f77-908e69789c12)\n", + "INFO:buildstock_query.query_core:{'submitted': 7, 'running': 4, 'pending': 6, 'completed': 0, 'failed': 3}\n", + "INFO:buildstock_query.query_core:Submitted queries[7] (3719a3a4-1f33-4028-be4b-77743be1095b)\n", + "INFO:buildstock_query.query_core:{'submitted': 8, 'running': 4, 'pending': 5, 'completed': 0, 'failed': 4}\n", + "INFO:buildstock_query.query_core:{'submitted': 8, 'running': 4, 'pending': 5, 'completed': 0, 'failed': 4}\n", + "INFO:buildstock_query.query_core:{'submitted': 8, 'running': 4, 'pending': 5, 'completed': 0, 'failed': 4}\n", + "INFO:buildstock_query.query_core:Submitted queries[8] (443fb3a2-0d8b-4d39-a26e-fc555d01d355)\n", + "INFO:buildstock_query.query_core:{'submitted': 9, 'running': 4, 'pending': 4, 'completed': 0, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 9, 'running': 4, 'pending': 4, 'completed': 0, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 9, 'running': 4, 'pending': 4, 'completed': 0, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 9, 'running': 4, 'pending': 4, 'completed': 0, 'failed': 5}\n", + "INFO:buildstock_query.query_core:Submitted queries[9] (c00f5562-678d-4fd4-bf24-419fce6826ce)\n", + "INFO:buildstock_query.query_core:{'submitted': 10, 'running': 4, 'pending': 3, 'completed': 1, 'failed': 5}\n", + "INFO:buildstock_query.query_core:Submitted queries[10] (9f6b1547-da1f-45b6-9d18-c19d6e53c71d)\n", + 
"INFO:buildstock_query.query_core:{'submitted': 11, 'running': 3, 'pending': 2, 'completed': 3, 'failed': 5}\n", + "INFO:buildstock_query.query_core:Submitted queries[11] (eb1e29d6-91ac-43c0-a3fd-db0dcb1424e7)\n", + "INFO:buildstock_query.query_core:{'submitted': 12, 'running': 4, 'pending': 1, 'completed': 3, 'failed': 5}\n", + "INFO:buildstock_query.query_core:Submitted queries[12] (c4ffef75-2b51-47af-b6e3-d23bc4c3c227)\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 0, 'completed': 4, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 0, 'completed': 4, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 0, 'completed': 4, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 4, 'pending': 0, 'completed': 4, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 2, 'pending': 0, 'completed': 6, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 2, 'pending': 0, 'completed': 6, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 1, 'pending': 0, 'completed': 7, 'failed': 5}\n", + "INFO:buildstock_query.query_core:{'submitted': 13, 'running': 0, 'pending': 0, 'completed': 8, 'failed': 5}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query id: f1fa1721-2cdc-4d91-a139-3f274dcfcc3f. \n", + " Query string: SELECT '0' AS upgrade, count(distinct(resstock_2024_amy2018_release_2_by_state.bldg_id)) AS count \n", + "FROM resstock_2024_amy2018_release_2_by_state \n", + "WHERE CAST(resstock_2024_amy2018_release_2_by_state.upgrade AS VARCHAR) = '0'. Query Ended with: FAILED\n", + "Error: HIVE_S3_THROTTLING: S3 throttling\n", + "\n", + "Query id: b677d3f8-a893-42b5-a2f5-da3545206b90. 
\n", + " Query string: SELECT '1' AS upgrade, count(distinct(resstock_2024_amy2018_release_2_by_state.bldg_id)) AS count \n", + "FROM resstock_2024_amy2018_release_2_by_state \n", + "WHERE CAST(resstock_2024_amy2018_release_2_by_state.upgrade AS VARCHAR) = '1'. Query Ended with: FAILED\n", + "Error: HIVE_S3_THROTTLING: S3 throttling\n", + "\n", + "Query id: 1bd9a95c-3a72-4204-8c64-9be50ab09c6d. \n", + " Query string: SELECT '2' AS upgrade, count(distinct(resstock_2024_amy2018_release_2_by_state.bldg_id)) AS count \n", + "FROM resstock_2024_amy2018_release_2_by_state \n", + "WHERE CAST(resstock_2024_amy2018_release_2_by_state.upgrade AS VARCHAR) = '2'. Query Ended with: FAILED\n", + "Error: HIVE_S3_THROTTLING: S3 throttling\n", + "\n", + "Query id: 1f38c017-6de1-4fa8-b94b-df6ad08f1c06. \n", + " Query string: SELECT '3' AS upgrade, count(distinct(resstock_2024_amy2018_release_2_by_state.bldg_id)) AS count \n", + "FROM resstock_2024_amy2018_release_2_by_state \n", + "WHERE CAST(resstock_2024_amy2018_release_2_by_state.upgrade AS VARCHAR) = '3'. Query Ended with: FAILED\n", + "Error: HIVE_S3_THROTTLING: S3 throttling\n", + "\n", + "Query id: 5143d83d-84b0-4657-bf80-f45d78fbfad8. \n", + " Query string: SELECT '4' AS upgrade, count(distinct(resstock_2024_amy2018_release_2_by_state.bldg_id)) AS count \n", + "FROM resstock_2024_amy2018_release_2_by_state \n", + "WHERE CAST(resstock_2024_amy2018_release_2_by_state.upgrade AS VARCHAR) = '4'. Query Ended with: FAILED\n", + "Error: HIVE_S3_THROTTLING: S3 throttling\n", + "\n" + ] + }, + { + "ename": "QueryException", + "evalue": "Queries failed again. 
Sorry!", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mQueryException\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m res_run = BuildStockQuery('rescore','buildstock_sdr',\n\u001b[32m 2\u001b[39m \u001b[33m\"resstock_2024_amy2018_release_2\"\u001b[39m,\n\u001b[32m 3\u001b[39m buildstock_type=\u001b[33m'resstock'\u001b[39m,\n\u001b[32m 4\u001b[39m db_schema=\u001b[33m\"resstock_oedi\"\u001b[39m,\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/buildstock2025/buildstock-query-applied_only/.venv/lib/python3.12/site-packages/pydantic/_internal/_validate_call.py:40\u001b[39m, in \u001b[36mupdate_wrapper_attributes..wrapper_function\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 38\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(wrapped)\n\u001b[32m 39\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper_function\u001b[39m(*args, **kwargs):\n\u001b[32m---> \u001b[39m\u001b[32m40\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mwrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/buildstock2025/buildstock-query-applied_only/.venv/lib/python3.12/site-packages/pydantic/_internal/_validate_call.py:137\u001b[39m, in \u001b[36mValidateCallWrapper.__call__\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.__pydantic_complete__:\n\u001b[32m 135\u001b[39m \u001b[38;5;28mself\u001b[39m._create_validators()\n\u001b[32m--> 
\u001b[39m\u001b[32m137\u001b[39m res = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpydantic_core\u001b[49m\u001b[43m.\u001b[49m\u001b[43mArgsKwargs\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 138\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.__return_pydantic_validator__:\n\u001b[32m 139\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.__return_pydantic_validator__(res)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/buildstock2025/buildstock-query-applied_only/buildstock_query/main.py:120\u001b[39m, in \u001b[36mBuildStockQuery.__init__\u001b[39m\u001b[34m(self, workgroup, db_name, table_name, db_schema, buildstock_type, sample_weight_override, region_name, execution_history, skip_reports, athena_query_reuse, query_unload_s3_bucket, cache_folder)\u001b[39m\n\u001b[32m 118\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mself\u001b[39m.report.get_success_report())\n\u001b[32m 119\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.ts_table \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m120\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mreport\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcheck_ts_bs_integrity\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 121\u001b[39m \u001b[38;5;28mself\u001b[39m.save_cache()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/buildstock2025/buildstock-query-applied_only/buildstock_query/report_query.py:756\u001b[39m, in \u001b[36mBuildStockReport.check_ts_bs_integrity\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 753\u001b[39m 
logger.info(\u001b[33m\"\u001b[39m\u001b[33mChecking integrity with ts_tables ...\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 754\u001b[39m check_pass = \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m756\u001b[39m raw_ts_report = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_ts_report\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 757\u001b[39m bs_dict = \u001b[38;5;28mself\u001b[39m._get_metadata_distinct_bldg_count()\n\u001b[32m 758\u001b[39m ts_dict = raw_ts_report.to_dict()[\u001b[33m\"\u001b[39m\u001b[33msuccess\u001b[39m\u001b[33m\"\u001b[39m]\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/buildstock2025/buildstock-query-applied_only/buildstock_query/report_query.py:672\u001b[39m, in \u001b[36mBuildStockReport._get_ts_report\u001b[39m\u001b[34m(self, get_query_only)\u001b[39m\n\u001b[32m 669\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m pd.DataFrame(columns=[\u001b[33m\"\u001b[39m\u001b[33msuccess\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m 671\u001b[39m batch_id = \u001b[38;5;28mself\u001b[39m._bsq.submit_batch_query(ts_queries, max_threads=\u001b[32m4\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m672\u001b[39m result_df = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_bsq\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_batch_query_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbatch_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 673\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m result_df.empty:\n\u001b[32m 674\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m pd.DataFrame(columns=[\u001b[33m\"\u001b[39m\u001b[33msuccess\u001b[39m\u001b[33m\"\u001b[39m])\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/buildstock2025/buildstock-query-applied_only/.venv/lib/python3.12/site-packages/pydantic/_internal/_validate_call.py:40\u001b[39m, in \u001b[36mupdate_wrapper_attributes..wrapper_function\u001b[39m\u001b[34m(*args, 
**kwargs)\u001b[39m\n\u001b[32m 38\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(wrapped)\n\u001b[32m 39\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper_function\u001b[39m(*args, **kwargs):\n\u001b[32m---> \u001b[39m\u001b[32m40\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mwrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/buildstock2025/buildstock-query-applied_only/.venv/lib/python3.12/site-packages/pydantic/_internal/_validate_call.py:137\u001b[39m, in \u001b[36mValidateCallWrapper.__call__\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.__pydantic_complete__:\n\u001b[32m 135\u001b[39m \u001b[38;5;28mself\u001b[39m._create_validators()\n\u001b[32m--> \u001b[39m\u001b[32m137\u001b[39m res = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpydantic_core\u001b[49m\u001b[43m.\u001b[49m\u001b[43mArgsKwargs\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 138\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.__return_pydantic_validator__:\n\u001b[32m 139\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.__return_pydantic_validator__(res)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/buildstock2025/buildstock-query-applied_only/buildstock_query/query_core.py:979\u001b[39m, in \u001b[36mQueryCore.get_batch_query_result\u001b[39m\u001b[34m(self, 
batch_id, combine, no_block)\u001b[39m\n\u001b[32m 977\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m new_report[\u001b[33m\"\u001b[39m\u001b[33mfailed\u001b[39m\u001b[33m\"\u001b[39m] > \u001b[32m0\u001b[39m:\n\u001b[32m 978\u001b[39m \u001b[38;5;28mself\u001b[39m.print_failed_query_errors(new_batch_id)\n\u001b[32m--> \u001b[39m\u001b[32m979\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m QueryException(\u001b[33m\"\u001b[39m\u001b[33mQueries failed again. Sorry!\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 980\u001b[39m logger.info(\u001b[33m\"\u001b[39m\u001b[33mThe queries succeeded this time. Gathering all the results.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 981\u001b[39m \u001b[38;5;66;03m# replace the old failed exe_ids with new successful exe_ids\u001b[39;00m\n", + "\u001b[31mQueryException\u001b[39m: Queries failed again. Sorry!" + ] + } + ], + "source": [ + "res_run = BuildStockQuery('rescore','buildstock_sdr',\n", + " \"resstock_2024_amy2018_release_2\",\n", + " buildstock_type='resstock',\n", + " db_schema=\"resstock_oedi\",\n", + " skip_reports=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a287cd2d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:buildstock_query.query_core:Loading comstock_amy2018_r2_2025 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:botocore.tokens:Loading cached SSO token for nrel-sso\n", + "INFO:botocore.tokens:SSO Token refresh succeeded\n", + "INFO:buildstock_query.query_core:145 queries cache read from .bsq_cache/comstock_amy2018_r2_2025_query_cache.pkl.\n", + "INFO:buildstock_query.query_core:145 queries cache is updated.\n", + "INFO:buildstock_query.main:Getting Success counts...\n", + "INFO:buildstock_query.report_query:Checking integrity with ts_tables ...\n", + "INFO:buildstock_query.query_core:Submitted queries[0] (CACHED)\n", + "INFO:buildstock_query.query_core:{'submitted': 0, 'running': 0, 
'pending': 62, 'completed': 0, 'failed': 0}\n", + "INFO:buildstock_query.query_core:Submitted queries[1] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[2] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[3] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[4] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[5] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[6] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[7] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[8] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[9] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[10] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[11] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[12] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[13] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[14] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[15] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[16] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[17] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[18] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[19] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[20] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[21] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[22] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[23] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[24] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[25] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[26] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[27] (CACHED)\n", + 
"INFO:buildstock_query.query_core:Submitted queries[28] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[29] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[30] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[31] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[32] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[33] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[34] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[35] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[36] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[37] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[38] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[39] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[40] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[41] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[42] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[43] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[44] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[45] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[46] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[47] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[48] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[49] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[50] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[51] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[52] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[53] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[54] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[55] (CACHED)\n", + 
"INFO:buildstock_query.query_core:Submitted queries[56] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[57] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[58] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[59] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[60] (CACHED)\n", + "INFO:buildstock_query.query_core:Submitted queries[61] (CACHED)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " success inapplicable fail Sum Applied % no-chng \\\n", + "upgrade \n", + "0 7602151.0 0.0 0 7602151.0 0.0 0.0 \n", + "1 4257229.0 3344922.0 0 7602151.0 56.0 0.0 \n", + "10 4257064.0 3345087.0 0 7602151.0 56.0 0.0 \n", + "11 4053004.0 3549147.0 0 7602151.0 53.3 0.0 \n", + "12 4052904.0 3549247.0 0 7602151.0 53.3 0.0 \n", + "... ... ... ... ... ... ... \n", + "60 7274170.0 327981.0 0 7602151.0 95.7 47.0 \n", + "61 7602128.0 23.0 0 7602151.0 100.0 5269.0 \n", + "7 4256950.0 3345201.0 0 7602151.0 56.0 0.0 \n", + "8 4257231.0 3344920.0 0 7602151.0 56.0 0.0 \n", + "9 4256914.0 3345237.0 0 7602151.0 56.0 0.0 \n", + "\n", + " bad-chng ok-chng true-bad-chng true-ok-chng null any \\\n", + "upgrade \n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 131.0 4257098.0 131.0 4257098.0 0.0 4257229.0 \n", + "10 0.0 4257064.0 0.0 4257064.0 0.0 4257064.0 \n", + "11 37.0 4052967.0 37.0 4052967.0 0.0 4053004.0 \n", + "12 37.0 4052867.0 37.0 4052867.0 0.0 4052904.0 \n", + "... ... ... ... ... ... ... 
\n", + "60 106943.0 7167180.0 106943.0 7167180.0 0.0 7274170.0 \n", + "61 10187.0 7586672.0 10187.0 7586672.0 0.0 7602128.0 \n", + "7 1850.0 4255100.0 1850.0 4255100.0 0.0 4256950.0 \n", + "8 37769.0 4219462.0 37769.0 4219462.0 0.0 4257231.0 \n", + "9 3321.0 4253593.0 3321.0 4253593.0 0.0 4256914.0 \n", + "\n", + " no-chng % bad-chng % ok-chng % true-ok-chng % true-bad-chng % \n", + "upgrade \n", + "0 0.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 100.0 100.0 0.0 \n", + "10 0.0 0.0 100.0 100.0 0.0 \n", + "11 0.0 0.0 100.0 100.0 0.0 \n", + "12 0.0 0.0 100.0 100.0 0.0 \n", + "... ... ... ... ... ... \n", + "60 0.0 1.5 98.5 98.5 1.5 \n", + "61 0.1 0.1 99.8 99.8 0.1 \n", + "7 0.0 0.0 100.0 100.0 0.0 \n", + "8 0.0 0.9 99.1 99.1 0.9 \n", + "9 0.0 0.1 99.9 99.9 0.1 \n", + "\n", + "[62 rows x 17 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:buildstock_query.query_core:{'submitted': 62, 'running': 0, 'pending': 0, 'completed': 62, 'failed': 0}\n", + "INFO:buildstock_query.query_core:Batch query completed. 
\n", + "INFO:buildstock_query.query_core:Got result from Query [0] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [1] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [2] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [3] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [4] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [5] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [6] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [7] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [8] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [9] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [10] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [11] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [12] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [13] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [14] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [15] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [16] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [17] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [18] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [19] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [20] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [21] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [22] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [23] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [24] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [25] (CACHED)\n", + 
"INFO:buildstock_query.query_core:Got result from Query [26] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [27] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [28] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [29] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [30] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [31] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [32] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [33] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [34] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [35] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [36] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [37] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [38] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [39] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [40] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [41] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [42] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [43] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [44] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [45] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [46] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [47] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [48] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [49] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [50] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [51] (CACHED)\n", + 
"INFO:buildstock_query.query_core:Got result from Query [52] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [53] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [54] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [55] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [56] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [57] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [58] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [59] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [60] (CACHED)\n", + "INFO:buildstock_query.query_core:Got result from Query [61] (CACHED)\n", + "INFO:buildstock_query.query_core:Concatenating the results.\n", + "INFO:buildstock_query.query_core:No new queries to save.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[92mBaseline/upgrade and timeseries tables have matching distinct building counts.\u001b[0m\n", + "\u001b[92mNo duplicate rows in baseline table on ['bldg_id', 'in.nhgis_tract_gisjoin', 'state'].\u001b[0m\n", + "\u001b[92mNo duplicate rows in upgrade table on ['bldg_id', 'in.nhgis_tract_gisjoin', 'state', 'upgrade'].\u001b[0m\n", + "\u001b[92mAll building partitions have the same number of (35040) timeseries rows.\u001b[0m\n" + ] + } + ], + "source": [ + "my_run = BuildStockQuery('rescore','buildstock_sdr',\n", + " \"comstock_amy2018_r2_2025\",\n", + " buildstock_type='comstock',\n", + " db_schema=\"comstock_oedi_state_and_county\",\n", + " skip_reports=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2b1d7a4c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
successinapplicablefailSumApplied %no-chngbad-chngok-chngtrue-bad-chngtrue-ok-chngnullanyno-chng %bad-chng %ok-chng %true-ok-chng %true-bad-chng %
upgrade
387474082.0128069.007602151.098.34245.07404789.065048.07404789.065048.00.07474082.00.199.10.90.999.1
293474227.04127924.007602151.045.777199.03332879.064149.03332879.064149.00.03474227.02.295.91.81.895.9
376229122.01373029.007602151.081.967406.05656507.0505209.05656507.0505209.00.06229122.01.190.88.18.190.8
261611444.05990707.007602151.021.20.0558385.01053059.0558385.01053059.00.01611444.00.034.765.365.334.7
477565905.036246.007602151.099.58816.01103454.06453635.01103454.06453635.00.07565905.00.114.685.385.314.6
......................................................
303546153.04055998.007602151.046.60.0494.03545659.0494.03545659.00.03546153.00.00.0100.0100.00.0
333528558.04073593.007602151.046.461.0800.03527697.0800.03527697.00.03528558.00.00.0100.0100.00.0
14257229.03344922.007602151.056.00.0131.04257098.0131.04257098.00.04257229.00.00.0100.0100.00.0
407602148.03.007602151.0100.00.0772.07601376.0772.07601376.00.07602148.00.00.0100.0100.00.0
07602151.00.007602151.00.00.00.00.00.00.00.00.00.00.00.00.00.0
\n", + "

62 rows × 17 columns

\n", + "" + ], + "text/plain": [ + " success inapplicable fail Sum Applied % no-chng \\\n", + "upgrade \n", + "38 7474082.0 128069.0 0 7602151.0 98.3 4245.0 \n", + "29 3474227.0 4127924.0 0 7602151.0 45.7 77199.0 \n", + "37 6229122.0 1373029.0 0 7602151.0 81.9 67406.0 \n", + "26 1611444.0 5990707.0 0 7602151.0 21.2 0.0 \n", + "47 7565905.0 36246.0 0 7602151.0 99.5 8816.0 \n", + "... ... ... ... ... ... ... \n", + "30 3546153.0 4055998.0 0 7602151.0 46.6 0.0 \n", + "33 3528558.0 4073593.0 0 7602151.0 46.4 61.0 \n", + "1 4257229.0 3344922.0 0 7602151.0 56.0 0.0 \n", + "40 7602148.0 3.0 0 7602151.0 100.0 0.0 \n", + "0 7602151.0 0.0 0 7602151.0 0.0 0.0 \n", + "\n", + " bad-chng ok-chng true-bad-chng true-ok-chng null any \\\n", + "upgrade \n", + "38 7404789.0 65048.0 7404789.0 65048.0 0.0 7474082.0 \n", + "29 3332879.0 64149.0 3332879.0 64149.0 0.0 3474227.0 \n", + "37 5656507.0 505209.0 5656507.0 505209.0 0.0 6229122.0 \n", + "26 558385.0 1053059.0 558385.0 1053059.0 0.0 1611444.0 \n", + "47 1103454.0 6453635.0 1103454.0 6453635.0 0.0 7565905.0 \n", + "... ... ... ... ... ... ... \n", + "30 494.0 3545659.0 494.0 3545659.0 0.0 3546153.0 \n", + "33 800.0 3527697.0 800.0 3527697.0 0.0 3528558.0 \n", + "1 131.0 4257098.0 131.0 4257098.0 0.0 4257229.0 \n", + "40 772.0 7601376.0 772.0 7601376.0 0.0 7602148.0 \n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " no-chng % bad-chng % ok-chng % true-ok-chng % true-bad-chng % \n", + "upgrade \n", + "38 0.1 99.1 0.9 0.9 99.1 \n", + "29 2.2 95.9 1.8 1.8 95.9 \n", + "37 1.1 90.8 8.1 8.1 90.8 \n", + "26 0.0 34.7 65.3 65.3 34.7 \n", + "47 0.1 14.6 85.3 85.3 14.6 \n", + "... ... ... ... ... ... 
\n", + "30 0.0 0.0 100.0 100.0 0.0 \n", + "33 0.0 0.0 100.0 100.0 0.0 \n", + "1 0.0 0.0 100.0 100.0 0.0 \n", + "40 0.0 0.0 100.0 100.0 0.0 \n", + "0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + "[62 rows x 17 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report_df = my_run.report.get_success_report()\n", + "report_df.sort_values('bad-chng %', ascending=False, inplace=True)\n", + "report_df.to_csv(\"comstock_upgrade_results.csv\")\n", + "report_df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a810644", + "metadata": {}, + "outputs": [], + "source": [ + "# folder_path = my_run.download_metadata_and_annual_results(\n", + "# upgrade_id=\"0\",\n", + "# folder=\"comstock_oedi_state_and_county\"\n", + "# )\n", + "# print(folder_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f6279f97", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "folder_path = Path(\"comstock_oedi_state_and_county/upgrade=0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9e0e3b31", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (51, 4)
in.statetotal_rowsdistinct_bldgsreuse_rate
stru32u32f64
"FL"5559603311167.91
"VA"184670190896.79
"OH"305775347887.92
"TX"716243832186.08
"SC"146967174284.37
"WY"18630171510.86
"DE"14490140110.34
"AK"1324814219.32
"SD"1817720388.92
"VT"907510888.34
" + ], + "text/plain": [ + "shape: (51, 4)\n", + "┌──────────┬────────────┬────────────────┬────────────┐\n", + "│ in.state ┆ total_rows ┆ distinct_bldgs ┆ reuse_rate │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ u32 ┆ u32 ┆ f64 │\n", + "╞══════════╪════════════╪════════════════╪════════════╡\n", + "│ FL ┆ 555960 ┆ 3311 ┆ 167.91 │\n", + "│ VA ┆ 184670 ┆ 1908 ┆ 96.79 │\n", + "│ OH ┆ 305775 ┆ 3478 ┆ 87.92 │\n", + "│ TX ┆ 716243 ┆ 8321 ┆ 86.08 │\n", + "│ SC ┆ 146967 ┆ 1742 ┆ 84.37 │\n", + "│ … ┆ … ┆ … ┆ … │\n", + "│ WY ┆ 18630 ┆ 1715 ┆ 10.86 │\n", + "│ DE ┆ 14490 ┆ 1401 ┆ 10.34 │\n", + "│ AK ┆ 13248 ┆ 1421 ┆ 9.32 │\n", + "│ SD ┆ 18177 ┆ 2038 ┆ 8.92 │\n", + "│ VT ┆ 9075 ┆ 1088 ┆ 8.34 │\n", + "└──────────┴────────────┴────────────────┴────────────┘" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the full set is too big to load all into memory - use polars to scan and collect only relevant columns\n", + "df = pl.scan_parquet(folder_path, hidden_file_prefix=\".\")\n", + "reuse_rate = (\n", + " df.group_by(\"in.state\")\n", + " .agg([\n", + " pl.len().alias(\"total_rows\"),\n", + " pl.col(\"bldg_id\").n_unique().alias(\"distinct_bldgs\")\n", + " ])\n", + " .with_columns(\n", + " (pl.col(\"total_rows\") / pl.col(\"distinct_bldgs\")).round(2).alias(\"reuse_rate\")\n", + " )\n", + " .sort(\"reuse_rate\", descending=True)\n", + " .collect()\n", + ")\n", + "reuse_rate" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5402a034", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 3)
total_rowsdistinct_bldgsreuse_rate_mean
u32u32f64
760215115895242.974706
" + ], + "text/plain": [ + "shape: (1, 3)\n", + "┌────────────┬────────────────┬─────────────────┐\n", + "│ total_rows ┆ distinct_bldgs ┆ reuse_rate_mean │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ u32 ┆ f64 │\n", + "╞════════════╪════════════════╪═════════════════╡\n", + "│ 7602151 ┆ 158952 ┆ 42.974706 │\n", + "└────────────┴────────────────┴─────────────────┘" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reuse_rate_agg = reuse_rate.select(\n", + " pl.col(\"total_rows\").sum().alias(\"total_rows\"),\n", + " pl.col(\"distinct_bldgs\").sum().alias(\"distinct_bldgs\"),\n", + " pl.col(\"reuse_rate\").mean().alias(\"reuse_rate_mean\"),\n", + ")\n", + "reuse_rate_agg" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6b28299d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 1)
bldg_id
u32
102678
" + ], + "text/plain": [ + "shape: (1, 1)\n", + "┌─────────┐\n", + "│ bldg_id │\n", + "│ --- │\n", + "│ u32 │\n", + "╞═════════╡\n", + "│ 102678 │\n", + "└─────────┘" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.select(pl.col(\"bldg_id\").n_unique()).collect()\n", + "# This is lower than the sum above because some bldg_ids are duplicated across states." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3f7f742a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['in.as_simulated_nhgis_county_gisjoin',\n", + " 'in.county_name',\n", + " 'in.nhgis_county_gisjoin',\n", + " 'county']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[c for c in df.collect_schema().names() if \"county\" in c]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3303a25b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['out.electricity.cooling.energy_consumption..kwh',\n", + " 'out.electricity.cooling.energy_savings..kwh',\n", + " 'out.electricity.exterior_lighting.energy_consumption..kwh',\n", + " 'out.electricity.exterior_lighting.energy_savings..kwh',\n", + " 'out.electricity.fans.energy_consumption..kwh',\n", + " 'out.electricity.fans.energy_savings..kwh',\n", + " 'out.electricity.heat_recovery.energy_consumption..kwh',\n", + " 'out.electricity.heat_recovery.energy_savings..kwh',\n", + " 'out.electricity.heat_rejection.energy_consumption..kwh',\n", + " 'out.electricity.heat_rejection.energy_savings..kwh',\n", + " 'out.electricity.heating.energy_consumption..kwh',\n", + " 'out.electricity.heating.energy_savings..kwh',\n", + " 'out.electricity.interior_equipment.energy_consumption..kwh',\n", + " 'out.electricity.interior_equipment.energy_savings..kwh',\n", + " 'out.electricity.interior_lighting.energy_consumption..kwh',\n", + " 
'out.electricity.interior_lighting.energy_savings..kwh',\n", + " 'out.electricity.net.energy_consumption..kwh',\n", + " 'out.electricity.net.energy_savings..kwh',\n", + " 'out.electricity.pumps.energy_consumption..kwh',\n", + " 'out.electricity.pumps.energy_savings..kwh',\n", + " 'out.electricity.purchased.energy_consumption..kwh',\n", + " 'out.electricity.purchased.energy_savings..kwh',\n", + " 'out.electricity.pv.energy_consumption..kwh',\n", + " 'out.electricity.pv.energy_savings..kwh',\n", + " 'out.electricity.refrigeration.energy_consumption..kwh',\n", + " 'out.electricity.refrigeration.energy_savings..kwh',\n", + " 'out.electricity.total.apr.energy_consumption..kwh',\n", + " 'out.electricity.total.aug.energy_consumption..kwh',\n", + " 'out.electricity.total.dec.energy_consumption..kwh',\n", + " 'out.electricity.total.energy_consumption..kwh',\n", + " 'out.electricity.total.energy_savings..kwh',\n", + " 'out.electricity.total.feb.energy_consumption..kwh',\n", + " 'out.electricity.total.jan.energy_consumption..kwh',\n", + " 'out.electricity.total.jul.energy_consumption..kwh',\n", + " 'out.electricity.total.jun.energy_consumption..kwh',\n", + " 'out.electricity.total.mar.energy_consumption..kwh',\n", + " 'out.electricity.total.may.energy_consumption..kwh',\n", + " 'out.electricity.total.nov.energy_consumption..kwh',\n", + " 'out.electricity.total.oct.energy_consumption..kwh',\n", + " 'out.electricity.total.sep.energy_consumption..kwh',\n", + " 'out.electricity.water_systems.energy_consumption..kwh',\n", + " 'out.electricity.water_systems.energy_savings..kwh',\n", + " 'out.electricity.total.peak_demand..kw',\n", + " 'out.electricity.cooling.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.cooling.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.exterior_lighting.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.exterior_lighting.energy_savings_intensity..kwh_per_ft2',\n", + " 
'out.electricity.fans.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.fans.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.heat_recovery.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.heat_recovery.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.heat_rejection.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.heat_rejection.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.heating.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.heating.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.interior_equipment.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.interior_equipment.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.interior_lighting.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.interior_lighting.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.net.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.net.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.pumps.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.pumps.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.purchased.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.purchased.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.pv.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.pv.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.refrigeration.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.refrigeration.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.total.energy_consumption_intensity..kwh_per_ft2',\n", + " 'out.electricity.total.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.electricity.water_systems.energy_consumption_intensity..kwh_per_ft2',\n", + " 
'out.electricity.water_systems.energy_savings_intensity..kwh_per_ft2',\n", + " 'out.emissions.electricity.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.aer_low_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.cooling.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.cooling.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.cooling.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.cooling.aer_low_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.cooling.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.cooling.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.cooling.egrid_2018_subregion..co2e_kg',\n", + " 'out.emissions.electricity.cooling.egrid_2019_state..co2e_kg',\n", + " 'out.emissions.electricity.cooling.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.cooling.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.cooling.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.cooling.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.cooling.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 
'out.emissions.electricity.cooling.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.cooling.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.egrid_2018_subregion..co2e_kg',\n", + " 'out.emissions.electricity.egrid_2019_state..co2e_kg',\n", + " 'out.emissions.electricity.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.aer_low_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.egrid_2018_subregion..co2e_kg',\n", + " 
'out.emissions.electricity.enduse_group.hvac.egrid_2019_state..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.enduse_group.hvac.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 
'out.emissions.electricity.enduse_group.hvac.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.aer_low_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.egrid_2018_subregion..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.egrid_2019_state..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 
'out.emissions.electricity.exterior_lights.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.exterior_lights.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.heating.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.heating.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.heating.aer_low_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.heating.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.heating.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.heating.egrid_2018_subregion..co2e_kg',\n", + " 'out.emissions.electricity.heating.egrid_2019_state..co2e_kg',\n", + " 'out.emissions.electricity.heating.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.heating.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.heating.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.heating.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.heating.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 
'out.emissions.electricity.heating.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.heating.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.aer_low_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.egrid_2018_subregion..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.egrid_2019_state..co2e_kg',\n", + " 
'out.emissions.electricity.interior_equipment.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_equipment.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 
'out.emissions.electricity.interior_lights.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.aer_low_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.egrid_2018_subregion..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.egrid_2019_state..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 
'out.emissions.electricity.interior_lights.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.interior_lights.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 
'out.emissions.electricity.refrigeration.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.aer_low_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.egrid_2018_subregion..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.egrid_2019_state..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 
'out.emissions.electricity.refrigeration.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.refrigeration.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.shoulder_daily_average.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.shoulder_daily_average.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.shoulder_daily_average.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.shoulder_daily_average.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.shoulder_daily_average.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.summer_daily_average.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.summer_daily_average.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.summer_daily_average.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.summer_daily_average.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.summer_daily_average.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.aer_95_decarb_by_2035_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.aer_95_decarb_by_2050_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.aer_high_re_cost_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.aer_low_re_cost_from_2023..co2e_kg',\n", + " 
'out.emissions.electricity.water_systems.aer_mid_case_from_2023..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.egrid_2018_state..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.egrid_2018_subregion..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.egrid_2019_state..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.egrid_2019_subregion..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.egrid_2020_state..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.egrid_2020_subregion..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_95_decarb_by_2035_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_95_decarb_by_2035_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_95_decarb_by_2035_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_95_decarb_by_2035_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_95_decarb_by_2050_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_95_decarb_by_2050_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_high_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_low_re_cost_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_low_re_cost_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_low_re_cost_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 
'out.emissions.electricity.water_systems.lrmer_mid_case_15_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_mid_case_25_2025_start..co2e_kg',\n", + " 'out.emissions.electricity.water_systems.lrmer_mid_case_30_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.winter_daily_average.egrid_2021_state..co2e_kg',\n", + " 'out.emissions.electricity.winter_daily_average.egrid_2021_subregion..co2e_kg',\n", + " 'out.emissions.electricity.winter_daily_average.lrmer_high_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.winter_daily_average.lrmer_low_re_cost_15_2023_start..co2e_kg',\n", + " 'out.emissions.electricity.winter_daily_average.lrmer_mid_case_15_2023_start..co2e_kg',\n", + " 'out.utility_bills.electricity_bill_max..usd',\n", + " 'out.utility_bills.electricity_bill_max_label',\n", + " 'out.utility_bills.electricity_bill_mean..usd',\n", + " 'out.utility_bills.electricity_bill_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_bill_median_high..usd',\n", + " 'out.utility_bills.electricity_bill_median_high_label',\n", + " 'out.utility_bills.electricity_bill_median_low..usd',\n", + " 'out.utility_bills.electricity_bill_median_low_label',\n", + " 'out.utility_bills.electricity_bill_min..usd',\n", + " 'out.utility_bills.electricity_bill_min_label',\n", + " 'out.utility_bills.electricity_bill_num_bills',\n", + " 'out.utility_bills.electricity_bill_savings_max_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_bill_savings_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_bill_savings_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_bill_savings_median_low_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_bill_savings_min_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_max..usd',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_max_intensity..usd_per_ft2',\n", + " 
'out.utility_bills.electricity_demandcharge_flat_bill_max_label',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_mean..usd',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_median_high..usd',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_median_high_label',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_median_low..usd',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_median_low_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_median_low_label',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_min..usd',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_min_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_min_label',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_savings_max_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_savings_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_savings_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_savings_median_low_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_flat_bill_savings_min_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_max..usd',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_max_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_max_label',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_mean..usd',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_median_high..usd',\n", + " 
'out.utility_bills.electricity_demandcharge_tou_bill_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_median_high_label',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_median_low..usd',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_median_low_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_median_low_label',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_min..usd',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_min_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_min_label',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_savings_max_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_savings_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_savings_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_savings_median_low_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_demandcharge_tou_bill_savings_min_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_max..usd',\n", + " 'out.utility_bills.electricity_energycharge_bill_max_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_max_label',\n", + " 'out.utility_bills.electricity_energycharge_bill_mean..usd',\n", + " 'out.utility_bills.electricity_energycharge_bill_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_median_high..usd',\n", + " 'out.utility_bills.electricity_energycharge_bill_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_median_high_label',\n", + " 'out.utility_bills.electricity_energycharge_bill_median_low..usd',\n", + " 'out.utility_bills.electricity_energycharge_bill_median_low_intensity..usd_per_ft2',\n", + " 
'out.utility_bills.electricity_energycharge_bill_median_low_label',\n", + " 'out.utility_bills.electricity_energycharge_bill_min..usd',\n", + " 'out.utility_bills.electricity_energycharge_bill_min_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_min_label',\n", + " 'out.utility_bills.electricity_energycharge_bill_savings_max_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_savings_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_savings_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_savings_median_low_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_energycharge_bill_savings_min_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_max..usd',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_max_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_max_label',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_mean..usd',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_median_high..usd',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_median_high_label',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_median_low..usd',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_median_low_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_median_low_label',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_min..usd',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_min_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_min_label',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_savings_max_intensity..usd_per_ft2',\n", + " 
'out.utility_bills.electricity_fixedcharge_bill_savings_mean_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_savings_median_high_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_savings_median_low_intensity..usd_per_ft2',\n", + " 'out.utility_bills.electricity_fixedcharge_bill_savings_min_intensity..usd_per_ft2',\n", + " 'out.params.unitary_sys_cycling_excess_electricity_cooling_pcnt',\n", + " 'out.params.unitary_sys_cycling_excess_electricity_heating_pcnt',\n", + " 'calc.enduse_group.electricity.hvac.energy_consumption..kwh',\n", + " 'calc.enduse_group.electricity.interior_equipment.energy_consumption..kwh',\n", + " 'calc.enduse_group.electricity.lighting.energy_consumption..kwh',\n", + " 'calc.enduse_group.electricity.refrigeration.energy_consumption..kwh',\n", + " 'calc.enduse_group.electricity.water_systems.energy_consumption..kwh',\n", + " 'calc.percent_savings.electricity.cooling.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.exterior_lighting.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.fans.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.heat_recovery.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.heat_rejection.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.heating.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.interior_equipment.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.interior_lighting.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.net.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.pumps.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.purchased.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.pv.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.refrigeration.energy_consumption..percent',\n", + " 
'calc.percent_savings.electricity.total.energy_consumption..percent',\n", + " 'calc.percent_savings.electricity.water_systems.energy_consumption..percent',\n", + " 'calc.percent_savings.utility_bills.electricity_bill_mean_intensity..percent',\n", + " 'calc.percent_savings.weighted.utility_bills.electricity_bill_max..percent',\n", + " 'calc.percent_savings.weighted.utility_bills.electricity_bill_mean..percent',\n", + " 'calc.percent_savings.weighted.utility_bills.electricity_bill_min..percent',\n", + " 'calc.weighted.electricity.cooling.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.exterior_lighting.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.fans.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.heat_recovery.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.heat_rejection.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.heating.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.interior_equipment.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.interior_lighting.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.net.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.pumps.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.purchased.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.pv.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.refrigeration.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.total.energy_consumption..tbtu',\n", + " 'calc.weighted.electricity.water_systems.energy_consumption..tbtu',\n", + " 'calc.weighted.emissions.electricity.egrid_2021_subregion..co2e_mmt',\n", + " 'calc.weighted.emissions.electricity.lrmer_high_re_cost_15_2023_start..co2e_mmt',\n", + " 'calc.weighted.emissions.electricity.lrmer_low_re_cost_15_2023_start..co2e_mmt',\n", + " 'calc.weighted.enduse_group.electricity.hvac.emissions.egrid_2021_subregion..co2e_mmt',\n", + " 
'calc.weighted.enduse_group.electricity.hvac.energy_consumption..tbtu',\n", + " 'calc.weighted.enduse_group.electricity.interior_equipment.emissions.egrid_2021_subregion..co2e_mmt',\n", + " 'calc.weighted.enduse_group.electricity.interior_equipment.energy_consumption..tbtu',\n", + " 'calc.weighted.enduse_group.electricity.lighting.emissions.egrid_2021_subregion..co2e_mmt',\n", + " 'calc.weighted.enduse_group.electricity.lighting.energy_consumption..tbtu',\n", + " 'calc.weighted.enduse_group.electricity.refrigeration.emissions.egrid_2021_subregion..co2e_mmt',\n", + " 'calc.weighted.enduse_group.electricity.refrigeration.energy_consumption..tbtu',\n", + " 'calc.weighted.enduse_group.electricity.water_systems.emissions.egrid_2021_subregion..co2e_mmt',\n", + " 'calc.weighted.enduse_group.electricity.water_systems.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.cooling.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.exterior_lighting.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.fans.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.heat_recovery.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.heat_rejection.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.heating.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.interior_equipment.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.interior_lighting.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.net.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.pumps.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.purchased.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.pv.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.refrigeration.energy_consumption..tbtu',\n", + " 'calc.weighted.savings.electricity.total.energy_consumption..tbtu',\n", + " 
'calc.weighted.savings.electricity.water_systems.energy_consumption..tbtu',\n", + " 'calc.weighted.utility_bills.electricity_bill_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_min..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_num_bills',\n", + " 'calc.weighted.utility_bills.electricity_bill_savings_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_savings_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_savings_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_savings_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_savings_min..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_bill_state_average..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_min..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_savings_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_savings_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_savings_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_savings_median_low..billion_usd',\n", + " 
'calc.weighted.utility_bills.electricity_demandcharge_flat_bill_savings_min..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_min..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_savings_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_savings_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_savings_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_savings_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_demandcharge_tou_bill_savings_min..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_min..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_savings_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_savings_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_savings_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_savings_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_energycharge_bill_savings_min..billion_usd',\n", 
+ " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_min..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_savings_max..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_savings_mean..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_savings_median_high..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_savings_median_low..billion_usd',\n", + " 'calc.weighted.utility_bills.electricity_fixedcharge_bill_savings_min..billion_usd']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[c for c in df.collect_schema().names() if \"electricity\" in c]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4b225636", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 1)
in.county_name
str
"TX, Frio County"
"MS, Lowndes County"
"MO, Cooper County"
"WI, Winnebago County"
"IL, Cook County"
"MS, Lafayette County"
"OR, Wasco County"
"VA, Accomack County"
"VA, King William County"
"AR, Arkansas County"
" + ], + "text/plain": [ + "shape: (10, 1)\n", + "┌─────────────────────────┐\n", + "│ in.county_name │\n", + "│ --- │\n", + "│ str │\n", + "╞═════════════════════════╡\n", + "│ TX, Frio County │\n", + "│ MS, Lowndes County │\n", + "│ MO, Cooper County │\n", + "│ WI, Winnebago County │\n", + "│ IL, Cook County │\n", + "│ MS, Lafayette County │\n", + "│ OR, Wasco County │\n", + "│ VA, Accomack County │\n", + "│ VA, King William County │\n", + "│ AR, Arkansas County │\n", + "└─────────────────────────┘" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.select(pl.col('in.county_name').unique())[:10].collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9692caef", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:buildstock_query.query_core:Column in.county_name found in multiple tables ['baseline', 'upgrade']. Using baseline\n", + "WARNING:buildstock_query.query_core:Column state found in multiple tables ['baseline', 'upgrade', 'comstock_amy2018_r2_2025_ts_by_state']. 
Using baseline\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT sum(1) AS sample_count, sum(baseline.weight) AS units_count, sum(CASE WHEN (CAST(upgrade.applicability AS VARCHAR) = 'true') THEN upgrade.\"out.electricity.total.energy_consumption..kwh\" ELSE baseline.\"out.electricity.total.energy_consumption..kwh\" END * baseline.weight) AS \"electricity.total.energy_consumption..kwh\" \n", + "FROM (SELECT * \n", + "FROM comstock_amy2018_r2_2025_md_by_state_and_county_parquet \n", + "WHERE CAST(comstock_amy2018_r2_2025_md_by_state_and_county_parquet.upgrade AS VARCHAR) = '0') AS baseline LEFT OUTER JOIN (SELECT * \n", + "FROM comstock_amy2018_r2_2025_md_by_state_and_county_parquet \n", + "WHERE CAST(comstock_amy2018_r2_2025_md_by_state_and_county_parquet.upgrade AS VARCHAR) != '0') AS upgrade ON baseline.bldg_id = upgrade.bldg_id AND baseline.\"in.nhgis_tract_gisjoin\" = upgrade.\"in.nhgis_tract_gisjoin\" AND baseline.state = upgrade.state AND CAST(upgrade.upgrade AS VARCHAR) = '1' AND CAST(upgrade.applicability AS VARCHAR) = 'true' \n", + "WHERE CAST(baseline.applicability AS VARCHAR) = 'true' AND baseline.\"in.county_name\" = 'MN, Lyon County' AND baseline.state = 'MN'\n" + ] + } + ], + "source": [ + "result = my_run.query(\n", + " enduses=[\"out.electricity.total.energy_consumption..kwh\"],\n", + " restrict=[(\"in.county_name\", \"MN, Lyon County\"), (\"state\", \"MN\")],\n", + " annual_only=True,\n", + " upgrade_id=1,\n", + " get_query_only=True,\n", + " applied_only=False\n", + ")\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ce302ee9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:buildstock_query.query_core:Column in.county_name found in multiple tables ['baseline', 'upgrade']. 
Using baseline\n", + "WARNING:buildstock_query.query_core:Column state found in multiple tables ['baseline', 'upgrade', 'comstock_amy2018_r2_2025_ts_by_state']. Using baseline\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:botocore.tokens:Loading cached SSO token for nrel-sso\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_countunits_countelectricity.total.energy_consumption..kwh
0316150.9009963.527991e+07
\n", + "
" + ], + "text/plain": [ + " sample_count units_count electricity.total.energy_consumption..kwh\n", + "0 316 150.900996 3.527991e+07" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query the total electricity for a county\n", + "my_run.query(\n", + " enduses=[\"out.electricity.total.energy_consumption..kwh\"],\n", + " restrict=[(\"in.county_name\", \"MN, Lyon County\"), (\"state\", \"MN\")],\n", + " annual_only=True,\n", + " upgrade_id=0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "787bf938", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 4)
unique_bldg_countsample_countunits_countout.electricity.total.energy_consumption..kwh
u32u32f64f64
218316150.9009963.5280e7
" + ], + "text/plain": [ + "shape: (1, 4)\n", + "┌───────────────────┬──────────────┬─────────────┬─────────────────────────────────┐\n", + "│ unique_bldg_count ┆ sample_count ┆ units_count ┆ out.electricity.total.energy_c… │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ u32 ┆ f64 ┆ f64 │\n", + "╞═══════════════════╪══════════════╪═════════════╪═════════════════════════════════╡\n", + "│ 218 ┆ 316 ┆ 150.900996 ┆ 3.5280e7 │\n", + "└───────────────────┴──────────────┴─────────────┴─────────────────────────────────┘" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "elec_total_df = df.filter(\n", + " (pl.col(\"in.county_name\") == \"MN, Lyon County\") & (pl.col(\"state\") == \"MN\")\n", + ").select(pl.col('bldg_id').n_unique().alias(\"unique_bldg_count\"),\n", + " pl.len().alias('sample_count'),\n", + " pl.sum('weight').alias('units_count'), \n", + " (pl.col(\"out.electricity.total.energy_consumption..kwh\") * pl.col(\"weight\")).sum()\n", + " ).collect()\n", + "elec_total_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3a9b481c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:buildstock_query.query_core:Column in.county_name found in multiple tables ['baseline', 'upgrade']. Using baseline\n", + "WARNING:buildstock_query.query_core:Column state found in multiple tables ['baseline', 'upgrade', 'comstock_amy2018_r2_2025_ts_by_state']. Using baseline\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_countunits_countelectricity.total.energy_consumption..kwh__baselineelectricity.total.energy_consumption..kwh__upgrade
0316150.9009963.527991e+073.761094e+07
\n", + "
" + ], + "text/plain": [ + " sample_count units_count \\\n", + "0 316 150.900996 \n", + "\n", + " electricity.total.energy_consumption..kwh__baseline \\\n", + "0 3.527991e+07 \n", + "\n", + " electricity.total.energy_consumption..kwh__upgrade \n", + "0 3.761094e+07 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query the total electricity for a county\n", + "my_run.query(\n", + " enduses=[\"out.electricity.total.energy_consumption..kwh\"],\n", + " restrict=[(\"in.county_name\", \"MN, Lyon County\"), (\"state\", \"MN\")],\n", + " annual_only=True,\n", + " include_baseline=True,\n", + " applied_only=False,\n", + " upgrade_id=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ea5370e0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:buildstock_query.query_core:Column in.county_name found in multiple tables ['baseline', 'upgrade']. Using baseline\n", + "WARNING:buildstock_query.query_core:Column state found in multiple tables ['baseline', 'upgrade', 'comstock_amy2018_r2_2025_ts_by_state']. Using baseline\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_countunits_countelectricity.total.energy_consumption..kwh__baselineelectricity.total.energy_consumption..kwh__upgrade
018377.0742841.316137e+071.549240e+07
\n", + "
" + ], + "text/plain": [ + " sample_count units_count \\\n", + "0 183 77.074284 \n", + "\n", + " electricity.total.energy_consumption..kwh__baseline \\\n", + "0 1.316137e+07 \n", + "\n", + " electricity.total.energy_consumption..kwh__upgrade \n", + "0 1.549240e+07 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_run.query(\n", + " enduses=[\"out.electricity.total.energy_consumption..kwh\"],\n", + " restrict=[(\"in.county_name\", \"MN, Lyon County\"), (\"state\", \"MN\")],\n", + " annual_only=True,\n", + " include_baseline=True,\n", + " applied_only=True,\n", + " upgrade_id=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d91fad04", + "metadata": {}, + "outputs": [], + "source": "applied_filter = my_run.get_applied_buildings_filter(all_of=[1, 12, 13])\nmy_run.query(\n enduses=[\"out.electricity.total.energy_consumption..kwh\"],\n restrict=[applied_filter, (\"in.county_name\", \"MN, Lyon County\"), (\"state\", \"MN\")],\n annual_only=True,\n include_baseline=True,\n applied_only=True,\n upgrade_id=1\n)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "806e126c", + "metadata": {}, + "outputs": [], + "source": "applied_filter = my_run.get_applied_buildings_filter(all_of=[1, 12, 13])\nmy_run.query(\n enduses=[\"out.electricity.total.energy_consumption\"],\n restrict=[applied_filter, (\"in.county_name\", \"MN, Lyon County\"), (\"state\", \"MN\")],\n annual_only=False,\n include_baseline=True,\n applied_only=True,\n upgrade_id=1\n)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "557ffefc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "buildstock-query (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..a7920556 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,93 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "buildstock_query" +version = "0.4.0" +description = "Python library for querying and analyzing ResStock and ComStock" +readme = "README.md" +license = { file = "LICENSE.md" } +requires-python = ">=3.12" +authors = [ + { name = "Rajendra Adhikari", email = "Rajendra.Adhikari@nrel.gov" }, +] +dependencies = [ + "pandas >= 2.0.0", + "polars", + "pyarrow >= 14.0.1", + "pyathena >= 3.19.0", + "SQLAlchemy >= 2.0.0", + "sqlalchemy2-stubs", + "pandas-stubs", + "inquirerpy >= 0.3.4", + "types-PyYAML >= 6.0.12.2", + "pydantic>=2", + "PyYAML", + "tabulate", + "toml", + "requests", + "sqlglot", + "filelock", + "pyparsing", + "tqdm", + "altair>=6.1.0", + "vegafusion>=2.0.3", + "vl-convert-python>=1.9.0.post1", + "nbformat>=5.10.4", + "nbclient>=0.10.4", +] + +[project.optional-dependencies] +full = [ + "dash-bootstrap-components >= 1.2.1", + "dash-extensions >= 0.1.6", + "dash-mantine-components == 0.10.2", + "dash-iconify >= 0.1.2", + "plotly >= 5.10.0", + "dash >= 2.6.2", +] + +[dependency-groups] +dev = [ + "pytest >= 7.1.3", + "flake8 >= 5.0.4", + "pdoc3 >= 0.10.0", + "dash-bootstrap-components >= 1.2.1", + "dash-extensions >= 0.1.6", + "dash-mantine-components == 0.10.2", + "dash-iconify >= 0.1.2", + "coverage >= 6.5.0", + "plotly >= 5.10.0", + "dash >= 2.6.2", + "ipywidgets>=8.1.8", +] + +[project.scripts] +upgrades_analyzer = "buildstock_query.tools.upgrades_analyzer:main" +upgrades_visualizer = "buildstock_query.tools.upgrades_visualizer:main" + +[project.urls] +Homepage = "https://github.com/NREL/buildstock-query" +Documentation = 
"https://github.com/NREL/buildstock-query/wiki" +Repository = "https://github.com/NREL/buildstock-query" + +[tool.hatch.build.targets.wheel] +packages = ["buildstock_query"] + +[tool.hatch.build.targets.sdist] +exclude = [ + "tests", + "docs", + "example_usage", + "htmlcov", + "build", + ".bsq_cache*", + ".history", + "*.ipynb", + "*.coverage", + ".git", + ".github", + ".vscode", + ".pytest_cache", +] diff --git a/setup.py b/setup.py deleted file mode 100644 index de258b1f..00000000 --- a/setup.py +++ /dev/null @@ -1,63 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name="buildstock_query", - version="0.3.0", - description="Python library for querying and analyzing ResStock and ComStock", - author="Rajendra Adhikari", - packages=find_packages(), - author_email="Rajendra.Adhikari@nrel.gov", - package_dir={"buildstock_query": "buildstock_query"}, - package_data={"buildstock_query": ["**"]}, - include_package_data=True, - python_requires=">=3.9", - install_requires=[ - "pandas >= 2.0.0", - "polars", - "pyarrow >= 14.0.1", - # "s3fs[boto3] >= 2022.8.2", - "pyathena >= 3.19.0", - "SQLAlchemy >= 2.0.0", - "sqlalchemy2-stubs", - "pandas-stubs", - "inquirerpy >= 0.3.4", - "types-PyYAML >= 6.0.12.2", - "pydantic>=2", - "PyYAML", - "tabulate", - "toml", - "requests", - "sqlglot", - "filelock", - "pyparsing" - ], - extras_require={ - "dev": [ - "pytest >= 7.1.3", - "flake8 >= 5.0.4", - "pdoc3 >= 0.10.0", - "autopep8 >= 1.7.0", - "dash-bootstrap-components >= 1.2.1", - "dash-extensions >= 0.1.6", - "dash-mantine-components == 0.10.2", - "dash-iconify >= 0.1.2", - "coverage >= 6.5.0", - "plotly >= 5.10.0", - "dash >= 2.6.2", - ], - "full": [ - "dash-bootstrap-components >= 1.2.1", - "dash-extensions >= 0.1.6", - "dash-mantine-components == 0.10.2", - "dash-iconify >= 0.1.2", - "plotly >= 5.10.0", - "dash >= 2.6.2", - ], - }, - entry_points={ - "console_scripts": [ - "upgrades_analyzer=buildstock_query.tools.upgrades_analyzer:main", - 
"upgrades_visualizer=buildstock_query.tools.upgrades_visualizer:main", - ] - }, -) diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..940cede8 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,191 @@ +# Tests guide + +This directory uses three complementary test styles. Pick the right one when +adding a new test. + +## Test layout + +``` +tests/ +├── test_query_snapshots.py SQL-hash + parquet pinning +├── query_snapshots/ per-flavor JSON entries + content-addressed parquet cache +│ ├── *.json one file per "flavor" — annual, savings, timeseries, ... +│ └── *_oedi_cache/ parquets keyed by sha256(SQL); shared across runs +├── test_invariants.py cross-flow / cross-query / mutation invariants +├── test_schema_unique_keys.py monkeypatch unit tests for composite-key plumbing +├── test_sql_cache.py disk-cache contract tests +├── test_set_cover.py avoid-restrict algorithm tests +├── test_query_core_cache_paths.py cache path resolution +├── test_UpgradeAnalyzer.py UpgradeAnalyzer YAML processing +├── conftest.py fixtures + CLI options +├── reference_files/ fixtures for UpgradeAnalyzer + Viz +├── local_only/ opt-in tests that download S3 parquets +│ ├── test_full_data_methods.py pure-pandas report methods (xfail-pinned for OEDI) +│ └── cache/ gitignored — populated by --include-local +├── legacy/ Viz tests + helpers; collect_ignore_glob filters them out +└── utility.py snapshot harness internals (NOT a test file) +``` + +## When to add which kind of test + +### Snapshot test (most queries) + +Add an entry to a `query_snapshots/*.json` file when you're pinning a SQL shape +the library should keep emitting. The harness runs `get_query_only=True`, +hashes the SQL, looks up the parquet under that hash, and compares against the +stored result. + +Use a snapshot when: +- The method has `get_query_only=True` (you can extract SQL without executing). 
+- The query bounds Athena cost: state-restricted (or partition-restricted), + bounded `bldg_id` list, or annual-only. +- You want regression coverage for the SQL shape *and* the data the query + produces. + +### Invariant test (cross-flow / cross-query checks) + +Add to `test_invariants.py` when you're asserting a relationship between +multiple queries — not the absolute output of one query. Examples: +- `annual = ts_year_collapse = sum(ts_monthly)` for the same group. +- `applied_in=[1,2]` returns the set intersection of `applied_in=[1]` and + `applied_in=[2]`. +- A schema-mutation breaks the result by exactly the predicted factor. + +Invariants catch bugs the snapshot harness can't, because they compare two +independent code paths against each other rather than comparing today's +result to a stored result. The snapshot harness can only tell you "this +matches what we recorded" — not "what we recorded is correct." + +### Unit test (no Athena) + +Add to `test_schema_unique_keys.py` (or a sibling new file) when you're +testing internal contracts that don't need a real database — schema +validation, query-model rejection, JOIN ON construction with synthetic +in-memory SA tables. Use `monkeypatch.setattr(BuildStockQuery, "_get_tables", +...)` to inject fake tables. + +### Local-only test (rare) + +Add to `local_only/` when the method: +- Doesn't emit Athena SQL (e.g. `report.get_applied_options` downloads full + metadata parquets and processes in pandas). +- Requires the full data to test meaningfully. + +Tag with `@pytest.mark.local_only`. The test is skipped by default; run with +`pytest --include-local`. Cache lives at `tests/local_only/cache/` (gitignored). + +## Cost guardrails + +**Athena costs add up fast on OEDI.** A single full-scan TS query is ~$16 at +$5/TB. Three of these in a row is a $50 accident. + +Before adding a snapshot entry that uses `_method` (i.e. not `bsq.query()`), +inspect the generated SQL via `get_query_only=True` and verify: + +1. 
Every reference to the timeseries table is constrained on **both**: + - `state` (or another partition key), AND + - `upgrade` + +2. The `ts.upgrade = N` filter must be in the JOIN ON clause, not just in a + baseline-side subquery WHERE — the TS table is still scanned across all + upgrades before the join eliminates non-matching rows. + +`bsq.query()` adds the `ts.upgrade = N` constraint automatically. Other entry +points may not. Audit the source if you're not sure. + +### Known cost-trap methods + +- `agg.get_building_average_kws_at` — has no `restrict` API at all; its + TS-side join doesn't constrain `upgrade`. Even a 3-timestamp filter scans + every upgrade × every building. Was responsible for an unintended $24 spend + on 2026-04-25 (3.24 TB scan against resstock_oedi). The original snapshot + entry was removed from `building_kws.json`; do NOT re-add until the method + gains state/restrict arguments. + +## Snapshot regeneration policy + +Two flags control snapshot updates with different blast radius: + +### `--update-snapshot` + +Auto-refreshes entries the framework can prove are safe: +- **Cosmetic SQL drift** (sqlglot-equivalent): renames the parquet to the new + hash with no data check. +- **Real SQL drift + data matches**: writes new pair, deletes old, patches JSON. +- **Real SQL drift + "equivalent but different" data** (extra/missing columns + with shared values agreeing): writes new pair, deletes old, patches JSON. +- **Real SQL drift + data genuinely diverged**: leaves both alone. You need + `--overwrite-snapshot` to force. + +### `--overwrite-snapshot` + +Force-overwrites cache entries even when data genuinely diverged. Use only +when you've **deliberately changed query semantics** and you've verified the +new numbers are correct. Includes everything `--update-snapshot` does. 
+ +### Cross-checking before `--overwrite-snapshot` + +When you're about to overwrite snapshots whose data drifted (not just SQL +hash), first run any `test_invariants.py` cases that exercise the same query +shape. Decision rule: + +- **Invariants green** → the new data is internally consistent across flows; + overwrite is safe. +- **Invariants red** → don't overwrite. Investigate what regressed first. + +This is how the 2026-04-26 `applied_only` TS-flow fix was validated: +`test_annual_equals_ts_year_equals_ts_monthly_sum[upgrade1_applied]` (which +had been xfail) flipped to green after the SQL fix, proving the new lower +totals matched the always-correct annual totals — only then was overwrite run. + +## Mutation testing + +When a test passes, it's worth asking: *would it fail if the thing it tests +were broken?* `test_comstock_composite_key_mutation_breaks_invariants` +demonstrates the pattern: + +1. Construct a second `BuildStockQuery` with the schema deliberately mutated + (e.g. drop `state` from `unique_keys`). +2. Run the same query that the canonical sibling test runs. +3. Assert the mutated result diverges from the canonical correct values by + the expected factor (e.g. exact 46x inflation when the join cross-products + across 46 metadata rows for `bldg_id=51037`). + +If the mutation produces identical results, the test wasn't actually testing +what it claimed. This caught a real test-design flaw: an earlier composite-key +test used baseline-only annual queries (no JOIN) which the mutation couldn't +break — the test "passed" but was never actually exercising the keys. + +When adding a new contract-pinning test, ask: *what's the smallest +schema/code change that would break this test?* If you can't think of one, +the test isn't pinning enough. + +## Per-schema placeholders + +Snapshot entries use `$ELECTRICITY_TOTAL`, `$BUILDING_TYPE_COL`, etc. 
The +resolver in `tests/utility.py` substitutes per-schema strings — `out.electricity.total.energy_consumption` +on resstock vs `...energy_consumption..kwh` on comstock. This lets one JSON +entry exercise both schemas without forking the args. + +To restrict an entry to one schema (e.g. utility methods that need +`map_eiaid_column` which comstock TOMLs don't define): + +```json +{ + "name": "...", + "schemas": ["resstock_oedi"], + ... +} +``` + +## Running tests + +| Command | What it does | +|---|---| +| `pytest tests/` | Full suite excluding local-only (default) | +| `pytest tests/test_query_snapshots.py` | Just snapshot tests | +| `pytest tests/test_invariants.py` | Just invariants | +| `pytest tests/ --include-local` | Add local-only tests (downloads ~400 MB on first run) | +| `pytest tests/ --update-snapshot` | Refresh snapshots that drifted safely | +| `pytest tests/ --overwrite-snapshot` | Force-refresh including divergent data (read policy above first) | +| `pytest tests/ --check-data` | Run data check even when SQL matches | diff --git a/tests/backfill_snapshot_costs.py b/tests/backfill_snapshot_costs.py new file mode 100644 index 00000000..dbebcb57 --- /dev/null +++ b/tests/backfill_snapshot_costs.py @@ -0,0 +1,84 @@ +"""Backfill `.json` Athena execution metadata sidecars into the snapshot +cache directories. + +Walks each schema's snapshot cache (`tests/query_snapshots/_cache/`), +finds `.parquet` entries that lack an `.json` companion, and +populates them by looking up the past Athena execution via the workgroup's +query history. Each metadata file is the full GetQueryExecution response — +DataScannedInBytes, EngineExecutionTimeInMillis, ResultReuseInformation, +EngineVersion, etc. — stored verbatim so future analyses can pull whatever +they need without re-fetching. + +Usage: + python tests/backfill_snapshot_costs.py [--dry-run] + +Idempotent: re-running on already-populated entries is a no-op (the script +only fills in missing files). 
Entries whose original execution is older than +Athena's ~45-day history retention are reported as "skipped" and remain +without a metadata sidecar. +""" +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +from buildstock_query import BuildStockQuery + + +SNAPSHOTS_ROOT = Path(__file__).parent / "query_snapshots" + + +def _make_bsq(schema: str) -> BuildStockQuery: + """Construct a BSQ for the given snapshot fixture schema.""" + if schema == "resstock_oedi": + return BuildStockQuery( + "rescore", "buildstock_sdr", "resstock_2024_amy2018_release_2", + buildstock_type="resstock", db_schema="resstock_oedi_vu", + skip_reports=True, + cache_folder=str(SNAPSHOTS_ROOT / "resstock_oedi_cache"), + ) + if schema == "comstock_oedi": + return BuildStockQuery( + "rescore", "buildstock_sdr", "comstock_amy2018_r2_2025", + buildstock_type="comstock", db_schema="comstock_oedi_state_and_county", + skip_reports=True, + cache_folder=str(SNAPSHOTS_ROOT / "comstock_oedi_cache"), + ) + if schema == "comstock_oedi_agg": + return BuildStockQuery( + "rescore", "buildstock_sdr", "comstock_amy2018_r2_2025", + buildstock_type="comstock", db_schema="comstock_oedi_agg_state_and_county", + skip_reports=True, + cache_folder=str(SNAPSHOTS_ROOT / "comstock_oedi_agg_cache"), + ) + raise ValueError(f"Unknown schema: {schema}") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--dry-run", action="store_true", help="Don't write changes; just report.") + args = parser.parse_args() + + schemas = ("resstock_oedi", "comstock_oedi", "comstock_oedi_agg") + for schema in schemas: + cache_dir = SNAPSHOTS_ROOT / f"{schema}_cache" + # Find cached entries lacking metadata sidecars BEFORE we walk history + # so we can report what we're about to do. 
+ missing = [p.stem for p in cache_dir.glob("*.parquet") if not (cache_dir / f"{p.stem}.json").exists()] + if not missing: + print(f"{schema}: all cache entries already have metadata; skipping") + continue + print(f"{schema}: {len(missing)} cache entries missing metadata; querying history...") + bsq = _make_bsq(schema) + if args.dry_run: + index = bsq.build_query_metadata_index() + found = sum(1 for h in missing if h in index) + print(f" would fill {found}, skip {len(missing) - found}") + else: + filled, skipped = bsq.backfill_cache_metadata() + print(f" filled {filled}, skipped {skipped} (older than Athena history)") + + +if __name__ == "__main__": + main() diff --git a/tests/cleanup_stale_caches.py b/tests/cleanup_stale_caches.py new file mode 100644 index 00000000..316fa49e --- /dev/null +++ b/tests/cleanup_stale_caches.py @@ -0,0 +1,201 @@ +"""Remove stale cache entries from `tests/query_snapshots/_cache/`. + +A cache entry (`.sql`, `.parquet`, optional `.json`) is +"needed" if either: + + 1. Its hash appears as the `sql_hash` for some entry in any of the + top-level `*.json` flavor files under `tests/query_snapshots/`, OR + + 2. Its hash appears in the per-cache `.cache_usage_log` file, written by + `SqlCache._record_usage` whenever the hash was hit (cache read) or + freshly written during the most recent test session. + +Anything else is stale: a leftover from a previous SQL shape that no +current snapshot or invariant test references. By default the script runs +in dry-run mode and only reports what it would remove. Pass `--delete` to +actually unlink the files. + +When to run +----------- +After a test session that exercised the invariant tests (and ran any +queries those invariants emit which aren't tracked in the snapshot JSONs), +the `.cache_usage_log` will accurately reflect what the suite needs. 
That +is the right time to run this script — running it before that risks +deleting cache entries the invariants depend on, forcing an Athena +re-execution next time around. + +Typical workflow: + + pytest -s -v tests/test_query_snapshots.py --check-data + pytest -s -v tests/test_invariants.py + python tests/cleanup_stale_caches.py # dry-run report + python tests/cleanup_stale_caches.py --delete # actually remove + +Both pytest commands must run in the same session-fixture lifetime as the +cleanup expectation: each fresh BSQ construction truncates +`.cache_usage_log`, so if you run the snapshot suite, then run something +else that constructs a new BSQ on the same cache folder, the log resets +and the snapshot hashes drop out of the "used" set. Run cleanup +immediately after the test sessions that should populate the log. +""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Iterable + + +SNAPSHOTS_ROOT = Path(__file__).resolve().parent / "query_snapshots" +USAGE_LOG_NAME = ".cache_usage_log" +SCHEMAS = ("resstock_oedi", "comstock_oedi", "comstock_oedi_agg") +SIDECAR_SUFFIXES = (".sql", ".parquet", ".json") + + +def _collect_json_hashes(snapshots_root: Path) -> dict[str, set[str]]: + """Return {schema: {hashes}} aggregated across every flavor JSON file.""" + out: dict[str, set[str]] = {s: set() for s in SCHEMAS} + for json_path in sorted(snapshots_root.glob("*.json")): + try: + data = json.loads(json_path.read_text()) + except json.JSONDecodeError as e: + print(f"warn: skipping {json_path.name}: {e}", file=sys.stderr) + continue + if not isinstance(data, list): + continue + for entry in data: + if not isinstance(entry, dict): + continue + raw = entry.get("sql_hash") + if isinstance(raw, dict): + for schema, h in raw.items(): + if schema in out and isinstance(h, str) and h: + out[schema].add(h) + elif isinstance(raw, str) and raw: + # Schema-agnostic string: applies to every schema's cache. 
+ for schema in out: + out[schema].add(raw) + return out + + +def _collect_log_hashes(cache_dir: Path) -> set[str]: + """Read hashes from the cache's `.cache_usage_log` (one per line).""" + log = cache_dir / USAGE_LOG_NAME + if not log.exists(): + return set() + out: set[str] = set() + for line in log.read_text().splitlines(): + line = line.strip() + if len(line) == 64 and all(c in "0123456789abcdef" for c in line): + out.add(line) + return out + + +def _collect_disk_hashes(cache_dir: Path) -> set[str]: + """Hashes physically present on disk (one per `.parquet`).""" + return {p.stem for p in cache_dir.glob("*.parquet")} + + +def _stale_files_for(cache_dir: Path, stale_hashes: Iterable[str]) -> list[Path]: + paths: list[Path] = [] + for h in stale_hashes: + for suffix in SIDECAR_SUFFIXES: + p = cache_dir / f"{h}{suffix}" + if p.exists(): + paths.append(p) + return paths + + +def _human_size(num_bytes: int) -> str: + for unit in ("B", "KB", "MB", "GB"): + if num_bytes < 1024: + return f"{num_bytes:.1f} {unit}" + num_bytes /= 1024 # type: ignore[assignment] + return f"{num_bytes:.1f} TB" + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__.split("\n\n", 1)[0]) + parser.add_argument( + "--delete", + action="store_true", + help="Actually unlink stale files. Without this, the script only reports.", + ) + parser.add_argument( + "--clear", + action="store_true", + help=( + "Truncate `.cache_usage_log` in every schema cache and exit " + "without computing stale entries. Run this before the test " + "session(s) you want to track." 
+ ), + ) + parser.add_argument( + "--snapshots-root", + type=Path, + default=SNAPSHOTS_ROOT, + help=f"Override snapshots root (default: {SNAPSHOTS_ROOT}).", + ) + args = parser.parse_args() + + if args.clear: + for schema in SCHEMAS: + cache_dir = args.snapshots_root / f"{schema}_cache" + log = cache_dir / USAGE_LOG_NAME + if cache_dir.is_dir(): + log.write_text("") + print(f"[{schema}] cleared {log.relative_to(args.snapshots_root.parent)}") + else: + print(f"[{schema}] cache dir missing, skipping: {cache_dir}") + return 0 + + json_hashes = _collect_json_hashes(args.snapshots_root) + total_stale = 0 + total_bytes = 0 + + for schema in SCHEMAS: + cache_dir = args.snapshots_root / f"{schema}_cache" + if not cache_dir.is_dir(): + print(f"[{schema}] cache dir missing, skipping: {cache_dir}") + continue + log_hashes = _collect_log_hashes(cache_dir) + disk_hashes = _collect_disk_hashes(cache_dir) + needed = json_hashes[schema] | log_hashes + stale = sorted(disk_hashes - needed) + + print( + f"[{schema}] disk={len(disk_hashes)} " + f"json={len(json_hashes[schema])} " + f"log={len(log_hashes)} " + f"needed={len(needed)} " + f"stale={len(stale)}" + ) + if not stale: + continue + + stale_files = _stale_files_for(cache_dir, stale) + size = sum(p.stat().st_size for p in stale_files if p.exists()) + total_stale += len(stale) + total_bytes += size + print(f" would free {_human_size(size)} across {len(stale_files)} files") + for h in stale[:10]: + print(f" {h}") + if len(stale) > 10: + print(f" ... 
and {len(stale) - 10} more") + + if args.delete: + for p in stale_files: + p.unlink(missing_ok=True) + print(f" deleted {len(stale_files)} files") + + summary = f"total stale hashes: {total_stale} ({_human_size(total_bytes)})" + if args.delete: + print(f"\nDONE — {summary}") + else: + print(f"\nDRY RUN — {summary}; pass --delete to remove") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..0c312c1d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,193 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Iterator + +import pytest + +from buildstock_query import BuildStockQuery + +collect_ignore_glob = ["legacy/*"] + +SNAPSHOTS_ROOT = Path(__file__).parent / "query_snapshots" + + +def pytest_terminal_summary(terminalreporter, exitstatus, config): + """Print real entry/variant totals — each pytest node processes a JSON file + of N entries, so the test-function count understates actual coverage.""" + from tests.test_utility import SESSION_TOTALS + + if SESSION_TOTALS["entries"] == 0: + return + terminalreporter.write_sep("=", "snapshot entry totals") + terminalreporter.write_line( + f" {SESSION_TOTALS['entries']} entries / {SESSION_TOTALS['variants']} variants" + ) + terminalreporter.write_line( + f" passed={SESSION_TOTALS['passed']} updated={SESSION_TOTALS['updated']} " + f"skipped={SESSION_TOTALS['skipped']} errored={SESSION_TOTALS['errored']} " + f"failed={SESSION_TOTALS['failed']}" + ) + + # Cost-regression summary, only when there's something interesting. 
+ cost_changes = SESSION_TOTALS.get("cost_changes") or [] + wins = [c for c in cost_changes if c["status"] == "win"] + regressions = [c for c in cost_changes if c["status"] == "regression"] + if wins: + terminalreporter.write_sep("=", "cost wins (≥20% improvement)", green=True) + for c in wins: + applied = "" if c["applied"] else " [not written]" + terminalreporter.write_line( + f" {c['name']} ({c['schema']}): {c['note']}{applied}", + green=True, + ) + if regressions: + terminalreporter.write_sep("=", "cost regressions (≥20% worse)", red=True) + for c in regressions: + applied = "" if c["applied"] else " [BLOCKED — pass --overwrite-snapshot to update]" + terminalreporter.write_line( + f" {c['name']} ({c['schema']}): {c['note']}{applied}", + red=True, + ) + + failure_details = SESSION_TOTALS.get("failure_details") or [] + if failure_details: + terminalreporter.write_sep("=", "snapshot failure reasons") + reason_order = [ + "data mismatch", + "missing snapshot data", + "data-check error", + "cost regression", + "sql drift only", + "sql/data drift", + "failure", + ] + for reason in reason_order: + matching = [d for d in failure_details if d["reason"] == reason] + if not matching: + continue + terminalreporter.write_line(f" {reason}:") + for detail in matching: + terminalreporter.write_line( + f" {detail['source']}/{detail['name']} ({detail['schema']})" + ) + + +def pytest_addoption(parser): + parser.addoption( + "--check-data", + action="store_true", + default=False, + help="Run data comparison against parquet for entries whose SQL check failed.", + ) + parser.addoption( + "--update-snapshot", + action="store_true", + default=False, + help=( + "Auto-refresh the snapshot cache for changes the framework can prove are safe. " + "Cosmetic SQL drift (sqlglot-equivalent) renames the parquet to the new hash with " + "no data check. 
def pytest_configure(config):
    # Register the marker so `-m local_only` selection works and
    # `--strict-markers` runs don't reject it.
    config.addinivalue_line(
        "markers",
        "local_only: heavy test that downloads ~400 MB; auto-skipped in CI (CI env var set)",
    )


def pytest_collection_modifyitems(config, items):
    # Heavy local-only tests only get skipped when the CI env var is set;
    # developer machines run them normally.
    if not os.environ.get("CI"):
        return
    ci_skip = pytest.mark.skip(reason="local-only test; auto-skipped in CI")
    for flagged in (item for item in items if "local_only" in item.keywords):
        flagged.add_marker(ci_skip)
comstock_oedi_agg ready.", flush=True) + yield bsq + + +@pytest.fixture(scope="session") +def bsq_resstock_oedi() -> Iterator[BuildStockQuery]: + print("\n[fixture] constructing BuildStockQuery(resstock_oedi)...", flush=True) + bsq = BuildStockQuery( + "rescore", + "buildstock_sdr", + "resstock_2024_amy2018_release_2", + buildstock_type="resstock", + db_schema="resstock_oedi_vu", + skip_reports=True, + cache_folder=str(SNAPSHOTS_ROOT / "resstock_oedi_cache"), + ) + print("[fixture] resstock_oedi ready.", flush=True) + yield bsq + + +@pytest.fixture(scope="session") +def bsq_resstock_oedi_local() -> Iterator[BuildStockQuery]: + """Resstock fixture with a SEPARATE local-only cache folder, for tests that + download full metadata parquets via download_metadata_and_annual_results. + The cache lives outside tests/query_snapshots so the downloaded files + (hundreds of MB) don't get staged by git. tests/local_only/cache/ is + listed in .gitignore.""" + local_cache_root = Path(__file__).parent / "local_only" / "cache" + local_cache_root.mkdir(parents=True, exist_ok=True) + print(f"\n[fixture] constructing BuildStockQuery(resstock_oedi_local) at {local_cache_root}...", flush=True) + bsq = BuildStockQuery( + "rescore", + "buildstock_sdr", + "resstock_2024_amy2018_release_2", + buildstock_type="resstock", + db_schema="resstock_oedi_vu", + skip_reports=True, + cache_folder=str(local_cache_root / "resstock_oedi_cache"), + ) + print("[fixture] resstock_oedi_local ready.", flush=True) + yield bsq diff --git a/tests/diagnose_wrapped_cache.py b/tests/diagnose_wrapped_cache.py new file mode 100644 index 00000000..0833991f --- /dev/null +++ b/tests/diagnose_wrapped_cache.py @@ -0,0 +1,98 @@ +"""One-shot read-only audit: find SqlCache parquets whose column shape +contradicts the SELECT-list of their sibling .sql file. 
+ +Failure mode being detected: the snapshot harness's `run_query_data` +wraps `_method` returns of type list/dict/Series into +`pd.DataFrame({"value": result})` and the writethrough path then writes +that wrapped DataFrame back into the SqlCache under the **inner** +method's SQL hash. The cache thereby holds a 1-column shape for an SQL +that selects 3+ columns. Any live caller that invokes the original +method afterward gets the wrong shape from cache and crashes. + +Walks every .parquet/.sql pair under tests/query_snapshots/*_cache/ +and reports any whose parquet column count differs substantially from +the SELECT-list count parsed naively from the .sql header. +""" +from __future__ import annotations + +import re +from pathlib import Path + +import pandas as pd + + +SNAPSHOTS_ROOT = Path(__file__).parent / "query_snapshots" + + +def _count_select_columns(sql: str) -> int | None: + """Crude parse of the top-level SELECT list. Returns column count, or + None if the SQL doesn't look like a SELECT we can analyze. + + Counts top-level commas (depth 0) between SELECT and the matching FROM. 
+ Good enough for the snapshot caches we see — the SQL is generated by + SQLAlchemy and uses a stable shape.""" + sql = sql.strip() + m = re.match(r"(?is)^\s*SELECT\s+(?:DISTINCT\s+)?(.*?)\s+FROM\s+", sql) + if not m: + return None + select_list = m.group(1) + depth = 0 + cols = 1 + for c in select_list: + if c == "(": + depth += 1 + elif c == ")": + depth -= 1 + elif c == "," and depth == 0: + cols += 1 + return cols + + +def audit_cache_dir(cache_dir: Path) -> list[tuple[Path, int, int, list[str]]]: + """Return [(parquet_path, sql_cols, parquet_cols, parquet_col_names)] for mismatches.""" + mismatches: list[tuple[Path, int, int, list[str]]] = [] + for parquet_path in sorted(cache_dir.glob("*.parquet")): + sql_path = parquet_path.with_suffix(".sql") + if not sql_path.exists(): + continue + try: + sql_text = sql_path.read_text() + except OSError: + continue + sql_cols = _count_select_columns(sql_text) + if sql_cols is None: + continue # not a SELECT we can analyze + try: + df_cols = list(pd.read_parquet(parquet_path).columns) + except Exception: + continue + if len(df_cols) >= sql_cols: + continue + mismatches.append((parquet_path, sql_cols, len(df_cols), df_cols)) + return mismatches + + +def main() -> None: + any_mismatch = False + for cache_dir in sorted(SNAPSHOTS_ROOT.glob("*_cache")): + if not cache_dir.is_dir(): + continue + mismatches = audit_cache_dir(cache_dir) + if not mismatches: + print(f"{cache_dir.name}: clean") + continue + any_mismatch = True + print(f"{cache_dir.name}: {len(mismatches)} mismatch(es)") + for parquet_path, sql_cols, df_cols, df_col_names in mismatches: + print( + f" {parquet_path.name}: SQL selects {sql_cols} cols, " + f"parquet has {df_cols} ({df_col_names})" + ) + if any_mismatch: + print("\nMismatches indicate cache poisoning by the wrap-then-writeback path.") + print("After landing the cache-poisoning fix, delete each mismatched parquet/.sql/.json") + print("triple and re-run pytest to repopulate from bsq.execute().") + + +if 
__name__ == "__main__": + main() diff --git a/tests/example_notebook_builder.py b/tests/example_notebook_builder.py new file mode 100644 index 00000000..437c60dd --- /dev/null +++ b/tests/example_notebook_builder.py @@ -0,0 +1,379 @@ +"""Generate runnable Jupyter notebooks from snapshot test entries. + +Each (flavor.json, schema) pair becomes one notebook at +`tests/example_notebooks/_.ipynb` containing: + 1. Imports + 2. BuildStockQuery construction (matched to the conftest fixture) + 3. One cell per entry that calls the entry's method with the entry's args + and prints the head of the returned DataFrame + +Notebook outputs are deterministic: cell IDs are content-derived hashes +(stable across regenerations), execution_count is None, and result +previews are truncated to head() so a fresh regen doesn't churn diffs +unless the underlying data actually changed. + +Trigger: regeneration fires inside `evaluate_entries` when: + - The notebook file is missing, OR + - --update-snapshot / --overwrite-snapshot ran (data was just verified + fresh, so the embedded preview should reflect it). +""" +from __future__ import annotations + +import hashlib +import json +import warnings +from pathlib import Path +from typing import Any, Iterable, Mapping + +import pandas as pd + +# Import lazily-needed pieces from test_utility at function scope to avoid a +# circular import (test_utility imports SNAPSHOTS_ROOT which lives there). + + +NOTEBOOKS_ROOT_NAME = "example_notebooks" + +# Methods whose cells we COMMENT OUT during regen instead of letting +# nbclient execute them. These fan out to multiple internal Athena +# queries that the snapshot cache only partially covers (it stores the +# OUTER call's hash, not the inner per-upgrade probes). Determined by +# reading each method's body — only methods that issue more than one +# Athena query (loops, batch_query, multiple internal `_get_*` helpers +# that themselves fire queries) appear here. 
+# +# Cross-reference for adding new entries: in the method's source, look +# for `submit_batch_query`, multiple `self.execute(...)` calls, or +# loops over `available_upgrades`. Single-execute methods are safe. +_NO_EXECUTE_METHODS = frozenset({ + "report.get_success_report", # _get_bs/_get_up/_get_change_report + # each fan out per-upgrade + "report.check_ts_bs_integrity", # one count-distinct per upgrade + "agg.get_building_average_kws_at", # 3.2 TB landmine (see CLAUDE.md) +}) + + +# Per-schema constructor knobs. Mirrors `tests/conftest.py:bsq__oedi` +# so the notebook builds the same BuildStockQuery the test fixture did. +_SCHEMA_CONSTRUCTOR: dict[str, dict[str, Any]] = { + "resstock_oedi": { + "table_name": "resstock_2024_amy2018_release_2", + "buildstock_type": "resstock", + "db_schema": "resstock_oedi_vu", + }, + "comstock_oedi": { + "table_name": "comstock_amy2018_r2_2025", + "buildstock_type": "comstock", + "db_schema": "comstock_oedi_state_and_county", + }, + "comstock_oedi_agg": { + "table_name": "comstock_amy2018_r2_2025", + "buildstock_type": "comstock", + "db_schema": "comstock_oedi_agg_state_and_county", + }, +} + + +def _cell_id(content: str) -> str: + """Stable cell ID derived from cell content. Notebooks re-render with the + same ID for the same content, so git diffs only flip on real changes.""" + return hashlib.md5(content.encode()).hexdigest()[:8] + + +def _markdown_cell(text: str) -> dict: + return { + "cell_type": "markdown", + "id": _cell_id(text), + "metadata": {}, + "source": text.splitlines(keepends=True) or [""], + } + + +def _code_cell(source: str, outputs: list[dict] | None = None) -> dict: + """A code cell with deterministic ID and no execution_count. + + Outputs default to []; pass a populated list to embed a result preview. 
+ """ + return { + "cell_type": "code", + "id": _cell_id(source), + "metadata": {}, + "execution_count": None, + "outputs": outputs or [], + "source": source.splitlines(keepends=True) or [""], + } + + +def _execute_notebook_in_place(path: Path) -> None: + """Run all cells in `path` via nbclient and write the result back. + + This populates outputs with whatever Jupyter would naturally produce + (DataFrame HTML tables, log streams, etc.). Running the notebook in + place means the saved file IS the runnable artifact AND the source of + truth for what the demo currently outputs. + + Cells that legitimately can't execute (the `_NO_EXECUTE_METHODS` set, + `_has_unroundtrippable_placeholder`) are rendered comment-only by the + builder, so nbclient never sees them. Anything that does execute must + succeed — a `CellExecutionError` propagates up so the caller (and + pytest) can surface it. + """ + import nbformat + from nbclient import NotebookClient + from jupyter_client.kernelspec import KernelSpecManager, NoSuchKernel + + nb = nbformat.read(path, as_version=4) + preferred_kernel = nb.get("metadata", {}).get("kernelspec", {}).get("name", "python3") + kernel_mgr = KernelSpecManager() + available_kernels = kernel_mgr.find_kernel_specs() + kernel_name = preferred_kernel + if kernel_name not in available_kernels: + warnings.warn( + f"Skipping notebook execution for {path.name}: kernel '{kernel_name}' is unavailable in this environment.", + RuntimeWarning, + ) + nbformat.write(nb, path) + return + # Run from the notebook's directory so its `_dh[0]`-based cache path + # resolves to the sibling `tests/query_snapshots/` tree. 
+ client = NotebookClient( + nb, timeout=300, kernel_name=kernel_name, + resources={"metadata": {"path": str(path.parent)}}, + ) + try: + client.execute() + except NoSuchKernel: + warnings.warn( + f"Skipping notebook execution for {path.name}: kernel '{kernel_name}' is unavailable in this environment.", + RuntimeWarning, + ) + finally: + # Persist outputs even on failure so the user can inspect the + # crash trace in the notebook itself. + nbformat.write(nb, path) + + +def _format_arg_value(v: Any) -> str: + """Render an arg value as a Python literal string. SA-built objects + (Label, MappedColumn) get a placeholder string — those aren't JSON-safe + and the user has to construct them inline anyway. The `_applied_filter`, + `_calc_column`, and `_mapped_column` sentinels render as inline + construction calls so the notebook produces runnable Python.""" + if isinstance(v, list): + return "[" + ", ".join(_format_arg_value(item) for item in v) + "]" + if isinstance(v, tuple): + if len(v) == 1: + return "(" + _format_arg_value(v[0]) + ",)" + return "(" + ", ".join(_format_arg_value(item) for item in v) + ")" + if isinstance(v, dict): + if "_applied_filter" in v: + spec = v["_applied_filter"] or {} + kwargs: list[str] = [] + if spec.get("any_of") is not None: + kwargs.append(f"any_of={_format_arg_value(spec['any_of'])}") + if spec.get("all_of") is not None: + kwargs.append(f"all_of={_format_arg_value(spec['all_of'])}") + return f"bsq.get_applied_buildings_filter({', '.join(kwargs)})" + if "_calc_column" in v: + spec = v["_calc_column"] or {} + args = [ + _format_arg_value(spec["name"]), + _format_arg_value(spec["expr"]), + f"table={_format_arg_value(spec.get('table', 'baseline'))}", + ] + return f"bsq.get_calculated_column({', '.join(args)})" + if "_mapped_column" in v: + spec = v["_mapped_column"] or {} + kwargs = [ + "bsq=bsq", + f"name={_format_arg_value(spec['name'])}", + f"mapping_dict={_format_arg_value(spec['mapping_dict'])}", + 
f"key=bsq._get_column({_format_arg_value(spec['key_column'])})", + ] + return f"MappedColumn({', '.join(kwargs)})" + return repr(v) + if isinstance(v, (str, int, float, bool)) or v is None: + return repr(v) + cls = type(v).__name__ + return f"<{cls} ...>" + + +def _has_unroundtrippable_placeholder(call_src: str) -> bool: + """Detect args rendered as object reprs that aren't valid Python. Two + sources: + - `_format_arg_value`'s fallback: literal `` token + (e.g. ``). + - `repr(list_with_SA)`: when an arg is a `list` (e.g. + `enduses=[label_obj]`), `_format_arg_value` returns `repr(v)` which + per-element calls `repr(SA_obj)`, producing strings like + `` — also not valid + Python. Such cells can't run as written; the user must construct + the SA object inline.""" + # Match three repr shapes that indicate an unrunnable arg: + # - SA Label: `` + # - object(): `` + # - fallback: `` from _format_arg_value + # The common shape is "". + import re + if re.search(r"\bat\s+0x[0-9A-Fa-f]+", call_src): + return True + return bool(re.search(r"<[A-Z][A-Za-z_]+\s\.\.\.>", call_src)) + + +def _render_call(method_path: str, args: Mapping[str, Any]) -> str: + """Render `bsq.(arg=val, ...)` as a one-line call. For long arg + lists, split across lines for readability.""" + arg_strs = [f"{k}={_format_arg_value(v)}" for k, v in args.items()] + inline = f"bsq.{method_path}(" + ", ".join(arg_strs) + ")" + if len(inline) <= 100: + return inline + indented = ",\n ".join(arg_strs) + return f"bsq.{method_path}(\n {indented},\n)" + + +def _build_notebook( + *, + schema: str, + flavor: str, + entries: Iterable, # list[SnapshotEntry] +) -> dict: + """Build the notebook dict (Jupyter v4 format).""" + knobs = _SCHEMA_CONSTRUCTOR[schema] + cells: list[dict] = [] + + cells.append(_markdown_cell( + f"# Example queries: `{flavor}` ({schema})\n" + f"\n" + f"Auto-generated from `tests/query_snapshots/{flavor}.json`. Each cell\n" + f"runs one entry from the snapshot suite. 
Regenerate by running the\n" + f"matching test with `--update-snapshot` or `--overwrite-snapshot`.\n" + )) + + cells.append(_code_cell( + "from pathlib import Path\n" + "from buildstock_query import BuildStockQuery\n" + "from buildstock_query.schema.utilities import MappedColumn\n" + "import pandas as pd\n" + )) + + cells.append(_markdown_cell( + "## Construct the BuildStockQuery object\n" + "\n" + "`cache_folder` points at the snapshot test cache directory so this\n" + "notebook reuses parquets that the test suite has already downloaded\n" + "from Athena. Queries that are already cached return immediately;\n" + "anything new still hits Athena.\n" + )) + + cells.append(_code_cell( + f'# This notebook lives in `tests/example_notebooks/`; the snapshot test\n' + f'# cache is its sibling `tests/query_snapshots/{schema}_cache/`. Resolve\n' + f'# the path relative to the notebook directory (`_dh[0]` is set by\n' + f'# IPython at kernel startup; falls back to CWD outside Jupyter).\n' + f'_NB_DIR = Path(_dh[0] if "_dh" in globals() else ".").resolve()\n' + f'_CACHE = (_NB_DIR / "../query_snapshots/{schema}_cache").resolve()\n' + f'bsq = BuildStockQuery(\n' + f' "rescore",\n' + f' "buildstock_sdr",\n' + f' "{knobs["table_name"]}",\n' + f' buildstock_type="{knobs["buildstock_type"]}",\n' + f' db_schema="{knobs["db_schema"]}",\n' + f' skip_reports=True,\n' + f' cache_folder=str(_CACHE),\n' + f')\n' + )) + + from tests.test_utility import expand_rate_map_flat + + for entry in entries: + # The entry has been placeholder-resolved already by load_entries(), + # so entry.args is a list[dict] of fully-concrete kwargs. + if not entry.args: + continue + first_variant = expand_rate_map_flat(dict(entry.args[0])) # copy — we mutate + method_name = first_variant.pop("_method", None) or "query" + + # Header for this entry. 
+ header = f"## `{entry.name}`\n" + if entry.description: + header += f"\n{entry.description}\n" + cells.append(_markdown_cell(header)) + + call_src = _render_call(method_name, first_variant) + unsafe = method_name in _NO_EXECUTE_METHODS + not_roundtrippable = _has_unroundtrippable_placeholder(call_src) + if unsafe or not_roundtrippable: + # Comment-out cases: + # - `unsafe`: this method fans out to internal queries that + # aren't covered by the snapshot cache and would fire + # fresh Athena scans during nbclient execution. + # - `not_roundtrippable`: the call contains a `