From d26e336b6dd39a27e8641a52727424a14d2ca3f6 Mon Sep 17 00:00:00 2001
From: Ismail Mehdi
Date: Wed, 18 Mar 2026 22:00:53 +0000
Subject: [PATCH 1/5] fix: Correct Gemini CLI response parsing to strip
 markdown code blocks and remove a redundant prompt argument, and update
 Makefile container names, pre-run cleanup, and volume mount paths.

---
 Makefile                                  | 12 ++++++++----
 evalbench/generators/models/gemini_cli.py | 17 +++++++----------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index 1381c6d4..dcffab23 100644
--- a/Makefile
+++ b/Makefile
@@ -33,10 +33,12 @@ build-test:
 	$(CONTAINER_ENGINE) build -t evalbench-test -f evalbench_service/Dockerfile .
 
 container:
-	$(CONTAINER_ENGINE) run --rm --name=evalbench_container \
+	$(CONTAINER_ENGINE) stop evalbench_server || true
+	$(CONTAINER_ENGINE) rm evalbench_server || true
+	$(CONTAINER_ENGINE) run --rm --name=evalbench_server \
 		$(if $(filter podman,$(CONTAINER_ENGINE)),--sysctl net.ipv6.conf.all.disable_ipv6=1) \
 		$(if $(filter docker,$(CONTAINER_ENGINE)),--net=host) \
-		-v ~/.config/gcloud:/root/.config/gcloud \
+		-v ~/.config/gcloud:/home/evalbench/.config/gcloud \
 		-e GOOGLE_CLOUD_PROJECT=cloud-db-nl2sql \
 		--cap-add=SYS_PTRACE \
 		-p 3000:3000 \
@@ -44,11 +46,13 @@ container:
 		-e TYPE=$(TYPE) evalbench:latest
 
 shell:
-	$(CONTAINER_ENGINE) run -ti --rm --name=evalbench_container \
+	$(CONTAINER_ENGINE) stop evalbench_server || true
+	$(CONTAINER_ENGINE) rm evalbench_server || true
+	$(CONTAINER_ENGINE) run -ti --rm --name=evalbench_server \
 		$(if $(filter podman,$(CONTAINER_ENGINE)),--sysctl net.ipv6.conf.all.disable_ipv6=1) \
 		$(if $(filter docker,$(CONTAINER_ENGINE)),--net=host) \
 		--cap-add=SYS_PTRACE \
-		-v ~/.config/gcloud:/root/.config/gcloud \
+		-v ~/.config/gcloud:/home/evalbench/.config/gcloud \
 		-v $(PWD)/requirements.txt:/evalbench/requirements.txt \
 		-v $(PWD)/evalbench:/evalbench/evalbench \
 		-v $(PWD)/viewer:/evalbench/viewer \

diff --git a/evalbench/generators/models/gemini_cli.py b/evalbench/generators/models/gemini_cli.py
index ea1c8ddc..5e8faf18 100644
--- a/evalbench/generators/models/gemini_cli.py
+++ b/evalbench/generators/models/gemini_cli.py
@@ -370,15 +370,13 @@ def _verify_mcp_server(self, server_name: str, settings_path: str) -> bool:
 
             if "response" in envelope:
                 response_text = envelope["response"].strip()
-                if response_text.startswith(
-                    "```"
-                ) or response_text.startswith('"'):
-                    # Robust JSON parsing
-                    json_match = re.search(
-                        r"\\[.*\\]", response_text, re.DOTALL
-                    )
-                    if json_match:
-                        response_text = json_match.group(0)
+                if response_text.startswith('```'):
+                    lines = response_text.split('\n')
+                    if lines and lines[0].startswith('```'):
+                        lines = lines[1:]
+                    if lines and lines[-1].startswith('```'):
+                        lines = lines[:-1]
+                    response_text = '\n'.join(lines).strip()
 
                 tools = json.loads(response_text)
 
@@ -685,7 +683,6 @@ def _run_gemini_cli(self, cli_cmd: CLICommand):
             [
                 "--output-format",
                 "stream-json",
-                "--prompt",
                 cli_cmd.prompt,
             ]
         )

From 1a7e54897140081f668e013597809d226ff271b3 Mon Sep 17 00:00:00 2001
From: Ismail Mehdi
Date: Thu, 19 Mar 2026 17:12:35 +0000
Subject: [PATCH 2/5] feat: Implement persistent volume claim for session
 files and refactor K8s deployment configurations.
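
Note: session state for gRPC runs moves from the GCS FUSE mount
(/tmp_session_files) to an SSD-backed PersistentVolumeClaim mounted at
/tmp_sessions, and the logging setup in eval_server.py now runs before all
other imports so absl's handler wraps sys.stdout before any import-time code
can close it. The sketch below only illustrates the per-session layout this
patch assumes; the session id and helper name are hypothetical, not part of
the patch:

    import os

    SESSION_ROOT = "/tmp_sessions"  # backed by evalbench-ssd-pvc in k8s

    def session_fake_home(session_id: str) -> str:
        # Mirrors os.path.join("/tmp_sessions", session_id, "fake_home")
        # in gemini_cli.py; each session gets an isolated fake HOME.
        return os.path.join(SESSION_ROOT, session_id, "fake_home")

    print(session_fake_home("abc123"))  # /tmp_sessions/abc123/fake_home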
---
 Makefile                                  | 19 ++++++++-
 evalbench/eval_server.py                  | 48 ++++++++++++-----------
 evalbench/evaluator/progress_reporter.py  | 27 +++++++------
 evalbench/generators/models/gemini_cli.py | 17 ++++----
 evalbench/util/sessionmgr.py              |  2 +-
 evalbench_service/Dockerfile              |  4 +-
 evalbench_service/k8s/evalbench-test.yaml |  8 ++--
 evalbench_service/k8s/evalbench.yaml      |  9 ++++-
 evalbench_service/k8s/pvc.yaml            | 12 ++++++
 viewer/main.py                            | 27 +++++++------
 10 files changed, 110 insertions(+), 63 deletions(-)
 create mode 100644 evalbench_service/k8s/pvc.yaml

diff --git a/Makefile b/Makefile
index dcffab23..d7f4fc5c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 #!/usr/bin/make -f
 
-default:apply
+default:deploy
 .PHONY: default
 
 CONTAINER_ENGINE ?= docker
@@ -73,6 +73,7 @@ push:
 deploy:
 	gcloud container clusters get-credentials evalbench-directpath-cluster --zone us-central1-c --project cloud-db-nl2sql
 	kubectl apply -f evalbench_service/k8s/namespace.yaml
+	kubectl apply -f evalbench_service/k8s/pvc.yaml
 	kubectl apply -f evalbench_service/k8s/ksa.yaml
 	kubectl apply -f evalbench_service/k8s/service.yaml
 	kubectl apply -f evalbench_service/k8s/evalbench.yaml
@@ -99,6 +100,22 @@ undeploy-test:
 	kubectl delete -f evalbench_service/k8s/service-test.yaml
 	kubectl delete -f evalbench_service/k8s/evalbench-test.yaml
 
+redeploy:
+	gcloud container clusters get-credentials evalbench-directpath-cluster --zone us-central1-c --project cloud-db-nl2sql
+	kubectl rollout restart deployment/evalbench-eval-server-deploy -n evalbench-namespace
+
+redeploy-test:
+	gcloud container clusters get-credentials evalbench-directpath-cluster --zone us-central1-c --project cloud-db-nl2sql
+	kubectl rollout restart deployment/evalbench-test-eval-server-deploy -n evalbench-test-namespace
+
+pod-shell:
+	gcloud container clusters get-credentials evalbench-directpath-cluster --zone us-central1-c --project cloud-db-nl2sql
+	kubectl exec -it deployment/evalbench-eval-server-deploy -n evalbench-namespace -c evalbench-eval -- /bin/bash
+
+pod-shell-test:
+	gcloud container clusters get-credentials evalbench-directpath-cluster --zone us-central1-c --project cloud-db-nl2sql
+	kubectl exec -it deployment/evalbench-test-eval-server-deploy -n evalbench-test-namespace -c evalbench-test-eval -- /bin/bash
+
 proto:
 	@python -m grpc_tools.protoc \
 		--proto_path=evalbench/evalproto \

diff --git a/evalbench/eval_server.py b/evalbench/eval_server.py
index fe873223..71cfbb1e 100644
--- a/evalbench/eval_server.py
+++ b/evalbench/eval_server.py
@@ -1,18 +1,41 @@
 """Server on GCP side for the evaluation service."""
 
+import os
+import sys
+from absl import logging
+
+# --- Logging Initialization (MUST happen before other imports) ---
+
+
+class UncloseableStream:
+    def __init__(self, stream):
+        self.stream = stream
+
+    def write(self, data):
+        self.stream.write(data)
+
+    def flush(self):
+        self.stream.flush()
+
+    def close(self):
+        pass  # Do not close the underlying stream
+
+
+logging.use_absl_handler()
+# Prevent stream closing globally
+logging.get_absl_handler().python_handler.stream = UncloseableStream(sys.stdout)
+
+# --- Remaining Imports ---
 import asyncio
 from collections.abc import Sequence
 
 from absl import app
 from absl import flags
-from absl import logging
 import grpc
 import util
 
 from eval_service import EvalServicer
 from eval_service import SessionManagerInterceptor
 from evalproto import eval_service_pb2_grpc
-import os
-import sys
 
 _LOCALHOST = flags.DEFINE_bool(
     "localhost",
@@ -26,25 +49,8 @@
 _cleanup_coroutines = []
 
 
-class UncloseableStream:
-
-    def __init__(self, stream):
-        self.stream = stream
-
-    def write(self, data):
-        self.stream.write(data)
-
-    def flush(self):
-        self.stream.flush()
-
-    def close(self):
-        pass  # Do not close the underlying stream
-
-
 async def _serve():
     """Starts the server."""
-    # Prevent stream closing
-    logging.get_absl_handler().python_handler.stream = UncloseableStream(sys.stdout)
     logging.info("Starting server")
 
     interceptors = [
@@ -76,8 +82,6 @@ def main(argv: Sequence[str]) -> None:
     if len(argv) > 1:
         raise app.UsageError("Too many command-line arguments.")
 
-    logging.use_absl_handler()
-
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     try:

diff --git a/evalbench/evaluator/progress_reporter.py b/evalbench/evaluator/progress_reporter.py
index b2a91cf1..98629f54 100644
--- a/evalbench/evaluator/progress_reporter.py
+++ b/evalbench/evaluator/progress_reporter.py
@@ -13,6 +13,7 @@
 _ORIGINAL_STDERR = sys.stderr
 _ORIGINAL_HANDLERS = None
 _NUM_LINES_FOR_PROGRESS = 5
+_STDOUT_LOCK = threading.Lock()
 try:
     import google.colab  # type: ignore
     from IPython.display import display, HTML  # type: ignore
@@ -69,11 +70,12 @@ def _setup_colab(progress_report):
 
 def _setup_stdout_reporting():
     global _ORIGINAL_HANDLERS
-    logger = logging.getLogger()
-    _ORIGINAL_HANDLERS = logger.handlers
-    sys.stderr = sys.stdout = tmp_buffer = StringIO()
-    logger.handlers = [logging.StreamHandler(tmp_buffer)]
-    _ORIGINAL_STDOUT.write(("-" * 80 + "\n") * _NUM_LINES_FOR_PROGRESS)
+    with _STDOUT_LOCK:
+        logger = logging.getLogger()
+        _ORIGINAL_HANDLERS = logger.handlers
+        sys.stderr = sys.stdout = tmp_buffer = StringIO()
+        logger.handlers = [logging.StreamHandler(tmp_buffer)]
+        _ORIGINAL_STDOUT.write(("-" * 80 + "\n") * _NUM_LINES_FOR_PROGRESS)
     return tmp_buffer
 
 
@@ -282,13 +284,14 @@ def cleanup_progress_reporting(
         colab_progress_report.update(_colab_progress(progress_report))
         return
     global _ORIGINAL_HANDLERS
-    sys.stdout = _ORIGINAL_STDOUT
-    sys.stderr = _ORIGINAL_STDERR
-    logger = logging.getLogger()
-    if _ORIGINAL_HANDLERS:
-        logger.handlers = _ORIGINAL_HANDLERS
-    _print_report(progress_report, tmp_buffer)
-    tmp_buffer.close()
+    with _STDOUT_LOCK:
+        sys.stdout = _ORIGINAL_STDOUT
+        sys.stderr = _ORIGINAL_STDERR
+        logger = logging.getLogger()
+        if _ORIGINAL_HANDLERS:
+            logger.handlers = _ORIGINAL_HANDLERS
+        _print_report(progress_report, tmp_buffer)
+        tmp_buffer.close()
 
 
 # Print iterations progress bar for parallel calls

diff --git a/evalbench/generators/models/gemini_cli.py b/evalbench/generators/models/gemini_cli.py
index 5e8faf18..b682b582 100644
--- a/evalbench/generators/models/gemini_cli.py
+++ b/evalbench/generators/models/gemini_cli.py
@@ -29,7 +29,7 @@ def __init__(self, querygenerator_config):
         # If running via eval_server.py (gRPC), use session-specific path in shared volume
         if sys.argv[0].endswith("eval_server.py"):
             session_id = querygenerator_config.get("session_id", "default")
-            self.fake_home = os.path.join("/tmp_session_files", session_id, "fake_home")
+            self.fake_home = os.path.join("/tmp_sessions", session_id, "fake_home")
         else:
             self.fake_home = os.path.abspath(os.path.join(".venv", "fake_home"))
 
@@ -73,6 +73,9 @@ def _setup(self):
         if not os.path.exists(os.path.dirname(gemini_settings_path)):
             os.makedirs(os.path.dirname(gemini_settings_path), exist_ok=True)
 
+        # Setup NPM Authentication first, so we can pull gemini-cli/extensions
+        self._setup_npm_auth()
+
         # Setup MCP Servers
         mcp_servers_config = self.setup_config.get("mcp_servers", {})
         self._setup_mcp_servers(mcp_servers_config, gemini_settings_path)
@@ -83,8 +86,6 @@ def _setup(self):
             verify_tools=False,
         )
 
-        self._setup_npm_auth()
-
         # Install Extensions
         extensions_config = self.setup_config.get("extensions", {})
         self._install_extensions(extensions_config)
@@ -351,7 +352,7 @@ def _verify_mcp_server(self, server_name: str, settings_path: str) -> bool:
                 text=True,
                 check=False,
                 env=verify_env,
-                timeout=120,
+                timeout=300,
             )
 
             if result.returncode != 0:
@@ -549,9 +550,7 @@ def _install_extensions(self, extensions: dict | list):
                         break
 
             if already_installed:
-                logging.info(
-                    f"Extension '{ext}' appears to be already installed. Skipping."
-                )
+                logging.info(f"Extension '{ext}' appears to be already installed. Skipping.")
                 continue
 
             logging.info(f"Installing extension: {ext}")
@@ -631,9 +630,7 @@ def _execute_cli_command(
         self, command: list[str], env: dict[str, str] | None = None
     ) -> subprocess.CompletedProcess:
         try:
-            result = subprocess.run(
-                command, capture_output=True, text=True, check=False, env=env
-            )
+            result = subprocess.run(command, capture_output=True, text=True, check=False, env=env)
             # Filter out benign schema warnings from json decoder from stderr to reduce noise
             if result.stderr:
                 result.stderr = "\n".join(

diff --git a/evalbench/util/sessionmgr.py b/evalbench/util/sessionmgr.py
index 10c171fa..f8a83928 100644
--- a/evalbench/util/sessionmgr.py
+++ b/evalbench/util/sessionmgr.py
@@ -5,7 +5,7 @@
 from absl import app
 import uuid
 
-SESSION_RESOURCES_PATH = "/tmp_session_files/"
+SESSION_RESOURCES_PATH = "/tmp_sessions/"
 
 
 class SessionManager:

diff --git a/evalbench_service/Dockerfile b/evalbench_service/Dockerfile
index 8444c83e..d05cd049 100644
--- a/evalbench_service/Dockerfile
+++ b/evalbench_service/Dockerfile
@@ -31,9 +31,9 @@ RUN uv sync
 RUN ln -s /usr/bin/python3 /usr/bin/python
 RUN make proto -f ./Makefile
 
-RUN mkdir /tmp_session_files
+RUN mkdir /tmp_session_files /tmp_sessions
 RUN cp /evalbench/evalbench_service/supervisord.conf /evalbench/supervisord.conf
-RUN chown -R 65532:65532 /evalbench /tmp /tmp_session_files /home/evalbench
+RUN chown -R 65532:65532 /evalbench /tmp /tmp_session_files /tmp_sessions /home/evalbench
 
 CMD ["/usr/bin/supervisord", "-c", "/evalbench/supervisord.conf"]
 EXPOSE 50051 3000
\ No newline at end of file

diff --git a/evalbench_service/k8s/evalbench-test.yaml b/evalbench_service/k8s/evalbench-test.yaml
index acb3cd51..0fd08558 100644
--- a/evalbench_service/k8s/evalbench-test.yaml
+++ b/evalbench_service/k8s/evalbench-test.yaml
@@ -15,6 +15,8 @@ spec:
       labels:
         app: evalbench-test-eval-server
     spec:
+      securityContext:
+        fsGroup: 65532
       serviceAccountName: evalbench-test-ksa
       containers:
       - image: us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:test
@@ -40,10 +42,10 @@ spec:
         volumeMounts:
         - mountPath: /tmp
          name: tmp-volume
-        - mountPath: /tmp_session_files
-          name: tmp-session-files
+        - mountPath: /tmp_sessions
+          name: tmp-sessions
       volumes:
       - name: tmp-volume
         emptyDir: {}
-      - name: tmp-session-files
+      - name: tmp-sessions
         emptyDir: {}

diff --git a/evalbench_service/k8s/evalbench.yaml b/evalbench_service/k8s/evalbench.yaml
index 2320cb82..740c3dd6 100644
--- a/evalbench_service/k8s/evalbench.yaml
+++ b/evalbench_service/k8s/evalbench.yaml
@@ -17,6 +17,8 @@ spec:
       annotations:
         gke-gcsfuse/volumes: "true"
     spec:
+      securityContext:
+        fsGroup: 65532
       serviceAccountName: evalbench-ksa
       containers:
      - image: us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest
@@ -65,6 +67,8 @@ spec:
           name: tmp-volume
         - mountPath: /tmp_session_files
           name: gcs-fuse-csi-vol
+        - mountPath: /tmp_sessions
+          name: evalbench-ssd-vol
       volumes:
       - name: tmp-volume
         emptyDir: {}
@@ -73,4 +77,7 @@ spec:
           driver: gcsfuse.csi.storage.gke.io
           volumeAttributes:
             bucketName: evalbench-sessions-cloud-db-nl2sql
-            mountOptions: "implicit-dirs,uid=65532,gid=65532"
\ No newline at end of file
+            mountOptions: "implicit-dirs,uid=65532,gid=65532"
+      - name: evalbench-ssd-vol
+        persistentVolumeClaim:
+          claimName: evalbench-ssd-pvc

diff --git a/evalbench_service/k8s/pvc.yaml b/evalbench_service/k8s/pvc.yaml
new file mode 100644
index 00000000..68bd8b6f
--- /dev/null
+++ b/evalbench_service/k8s/pvc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: evalbench-ssd-pvc
+  namespace: evalbench-namespace
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 100Gi
+  storageClassName: premium-rwo

diff --git a/viewer/main.py b/viewer/main.py
index d58c6b4d..fc68452d 100644
--- a/viewer/main.py
+++ b/viewer/main.py
@@ -7,13 +7,12 @@
 logging.basicConfig(level=logging.INFO)
 
 try:
-    import summary
     import dashboard
     import conversations
 except ImportError:
     # Optional modules could not be imported; continue without them.
     logging.warning(
-        "Optional modules 'summary', 'dashboard', and 'conversations' "
+        "Optional modules 'dashboard', and 'conversations' "
         "could not be imported (absolute or relative)."
     )
 
@@ -65,8 +64,8 @@ class State:
         "%20%20max-height%3A%20none%20%21important%3B%0A"
         "%20%20max-width%3A%20none%20%21important%3B%0A"
         "%20%20white-space%3A%20pre-wrap%20%21important%3B%0A"
-        "%7D"
-    ]
+        "%7D",
+    ],
 )
 def app():
     state = me.state(State)
@@ -75,17 +74,17 @@ def app():
     results_dir_candidates = [
         "/tmp_session_files/results",
         os.path.join(os.path.dirname(os.path.dirname(__file__)), "results"),
-        os.path.join(os.getcwd(), "results")
+        os.path.join(os.getcwd(), "results"),
     ]
-    
+
     results_dir = None
     for candidate in results_dir_candidates:
         if os.path.exists(candidate) and os.path.isdir(candidate):
             results_dir = candidate
             break
-    
+
     if results_dir is None:
-        results_dir = results_dir_candidates[1] # Fallback to default
+        results_dir = results_dir_candidates[1]  # Fallback to default
 
     directories = []
     if os.path.exists(results_dir):
@@ -128,10 +127,14 @@ def on_selection_change(e: me.SelectSelectionChangeEvent):
             gap="16px",
         )
     ):
-        with me.box(style=me.Style(width="100%", max_width="400px", margin=me.Margin(bottom="8px"))):
+        with me.box(
+            style=me.Style(width="100%", max_width="400px", margin=me.Margin(bottom="8px"))
+        ):
             me.select(
                 label="Select a result directory",
-                options=[me.SelectOption(label=d, value=d) for d in sorted(directories)],
+                options=[
+                    me.SelectOption(label=d, value=d) for d in sorted(directories)
+                ],
                 on_selection_change=on_selection_change,
                 value=state.selected_directory,
                 appearance="outline",
@@ -149,7 +152,9 @@ def on_tab_change(e: me.ButtonToggleChangeEvent):
                     me.ButtonToggleButton(label="Configs", value="Configs"),
                     # me.ButtonToggleButton(label="Evals", value="Evals"),
                     # me.ButtonToggleButton(label="Scores", value="Scores"),
-                    me.ButtonToggleButton(label="Conversations", value="Conversations"),
+                    me.ButtonToggleButton(
+                        label="Conversations", value="Conversations"
+                    ),
                     # me.ButtonToggleButton(label="Summary", value="Summary"),
                 ],
                 on_change=on_tab_change,

From 40b9dc7631fb314c3f7dafe12e01be60d8deab43 Mon Sep 17 00:00:00 2001
From: Ismail Mehdi
Date: Fri, 20 Mar 2026 21:16:31 +0000
Subject: [PATCH 3/5] fix: Improve dataset parsing robustness with default
 values and conditionally disable progress reporting for the eval server.
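
Note: load_dataset now falls back to defaults instead of raising KeyError on
sparse records. A minimal sketch of the dict.get pattern the loader relies on;
the record below is made up for illustration:

    # Hypothetical dataset record with optional fields missing:
    item = {"id": 1, "nl_prompt": "count users"}

    # item["query_type"] would raise KeyError; .get() supplies a default.
    query_type = item.get("query_type", "dql").lower()
    golden_sql = item.get("golden_sql", [])  # default: empty list
    other = item.get("other")                # None when absent

    assert query_type == "dql" and golden_sql == [] and other is None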
---
 evalbench/dataset/dataset.py             | 26 ++++++++++++++------------
 evalbench/evaluator/progress_reporter.py |  7 ++++++-
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/evalbench/dataset/dataset.py b/evalbench/dataset/dataset.py
index f254ec90..fa80941a 100644
--- a/evalbench/dataset/dataset.py
+++ b/evalbench/dataset/dataset.py
@@ -1,6 +1,6 @@
 """Process datasets."""
 
-from typing import Any
+from typing import Any, Optional
 import json
 import logging
 from collections.abc import Sequence
@@ -205,17 +205,17 @@ def load_dataset(dataset: Sequence[dict], config):
             continue
         eval_input = EvalInputRequest(
             id=item["id"],
-            nl_prompt=item["nl_prompt"],
-            query_type=item["query_type"].lower(),
-            database=item["database"],
+            nl_prompt=item.get("nl_prompt", ""),
+            query_type=item.get("query_type", "dql").lower(),
+            database=item.get("database", ""),
             dialects=_union_dialects(
-                item["dialects"], config.get("dialects", [])),
-            golden_sql=item["golden_sql"],
-            eval_query=item["eval_query"],
-            setup_sql=item["setup_sql"],
-            cleanup_sql=item["cleanup_sql"],
-            tags=item["tags"],
-            other=build_normalized_other(item["other"]),
+                item.get("dialects", []), config.get("dialects", [])),
+            golden_sql=item.get("golden_sql", []),
+            eval_query=item.get("eval_query", ""),
+            setup_sql=item.get("setup_sql", []),
+            cleanup_sql=item.get("cleanup_sql", []),
+            tags=item.get("tags", []),
+            other=build_normalized_other(item.get("other")),
         )
         input_items[eval_input.query_type].append(eval_input)
     return input_items
@@ -245,7 +245,9 @@ def _item_meets_config_filters(item: dict, config: dict):
     return False
 
 
-def build_normalized_other(other: dict[str, Any]):
+def build_normalized_other(other: Optional[dict[str, Any]]):
+    if not other:
+        return {}
     return {key: json.dumps(value) for key, value in other.items()}

diff --git a/evalbench/evaluator/progress_reporter.py b/evalbench/evaluator/progress_reporter.py
index 98629f54..e3ded4fb 100644
--- a/evalbench/evaluator/progress_reporter.py
+++ b/evalbench/evaluator/progress_reporter.py
@@ -26,6 +26,9 @@
 def setup_progress_reporting(
     manager: SyncManager, total_dataset_len: int, total_dbs: int
 ):
+    if sys.argv[0].endswith("eval_server.py"):
+        return None, None, None, None, None
+
     tmp_buffer = None
     colab_progress_report = None
     progress_reporting = {
@@ -280,6 +283,8 @@ def cleanup_progress_reporting(
         progress_report,
         tmp_buffer,
         colab_progress_report):
+    if not progress_report:
+        return
     if _IN_COLAB:
         colab_progress_report.update(_colab_progress(progress_report))
         return
@@ -288,7 +293,7 @@ def cleanup_progress_reporting(
         sys.stdout = _ORIGINAL_STDOUT
         sys.stderr = _ORIGINAL_STDERR
         logger = logging.getLogger()
-        if _ORIGINAL_HANDLERS:
+        if _ORIGINAL_HANDLERS is not None:
             logger.handlers = _ORIGINAL_HANDLERS
         _print_report(progress_report, tmp_buffer)
         tmp_buffer.close()

From 8fba4a75bd48a8e2cceb1ed4e2851902e3d5039a Mon Sep 17 00:00:00 2001
From: Ismail Mehdi
Date: Mon, 23 Mar 2026 00:47:50 +0000
Subject: [PATCH 4/5] feat: Add empty results check, enable BigQuery schema
 autodetect, and workaround pyarrow float truncation error.
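
Note: the client-side crash this patch works around comes from pyarrow's
safe-cast check when a float column is loaded into a table column that
BigQuery previously autodetected as INT64. A minimal sketch of the pyarrow
behavior, independent of BigQuery (assumes pyarrow is installed):

    import pyarrow as pa

    # The default safe cast refuses lossy float -> int conversion:
    try:
        pa.array([1.0, 2.5]).cast(pa.int64())
    except pa.ArrowInvalid as e:
        print(e)  # e.g. "Float value 2.5 was truncated converting to int64"

    # Exact values cast fine, which is why only some runs hit the error:
    print(pa.array([1.0, 2.0]).cast(pa.int64()))  # [1, 2]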
---
 evalbench/reporting/bqstore.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/evalbench/reporting/bqstore.py b/evalbench/reporting/bqstore.py
index 250da388..b724b0fc 100644
--- a/evalbench/reporting/bqstore.py
+++ b/evalbench/reporting/bqstore.py
@@ -51,6 +51,10 @@ def __init__(self, reporting_config, job_id, run_time):
         self.summary_table = "{}.summary".format(self.dataset_id)
 
     def store(self, results, type: STORETYPE):
+        if results is None or results.empty:
+            logging.info(f"No results to store for {type}")
+            return
+
         dataset = bigquery.Dataset(self.dataset_id)
         dataset.location = self.location
         dataset = self.client.create_dataset(
@@ -61,6 +65,7 @@ def store(self, results, type: STORETYPE):
             )
         )
         job_config = bigquery.LoadJobConfig()
+        job_config.autodetect = True
        job_config.schema_update_options = [
             bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
             bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
@@ -77,6 +82,15 @@ def store(self, results, type: STORETYPE):
         # Chunk this to avoid BQ OOM
         job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_APPEND  # type: ignore
         for chunk in _split_dataframe(results, _CHUNK_SIZE):
+            # Workaround for pyarrow truncation error when inserting floats into INT64 columns.
+            # This happens if BQ autodetected a column as INT64 in a previous run.
+            # Casting to string avoids the client-side pyarrow crash and lets BQ handle the conversion.
+            if type in [STORETYPE.SUMMARY, STORETYPE.SCORES]:
+                chunk = chunk.copy()
+                for col in chunk.select_dtypes(include=["float64", "float32"]).columns:
+                    # Convert to string and strip .0 to avoid "Failed to parse string: '1.0' as a scalar of type int64"
+                    chunk[col] = chunk[col].astype(str).str.replace(r"\.0$", "", regex=True).replace("nan", None)
+
             job = self.client.load_table_from_dataframe(
                 chunk, table, job_config=job_config
             )

From d18c4f09f187e3648a71bc84df924ee1d42df4ef Mon Sep 17 00:00:00 2001
From: Ismail Mehdi
Date: Mon, 23 Mar 2026 17:21:21 +0000
Subject: [PATCH 5/5] fix: Improve BigQuery float handling by casting known
 integer-like float columns to nullable integers and converting other floats
 to strings to prevent pyarrow errors.
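
Note: pandas' nullable "Int64" dtype (capital I) is what lets integer-like
columns keep missing values through the round-trip, unlike numpy's int64. A
small sketch of the cast used below; the sample values are made up:

    import pandas as pd

    s = pd.Series([1.0, 2.7, None])  # float64 with a missing value

    # errors="coerce" maps unparseable values to NaN, round() removes the
    # fractional part, and astype("Int64") keeps NaN as <NA>.
    out = pd.to_numeric(s, errors="coerce").round().astype("Int64")
    print(out.tolist())  # [1, 3, <NA>]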
---
 evalbench/reporting/bqstore.py | 105 +++++++++++++++++++++++++++++----
 1 file changed, 92 insertions(+), 13 deletions(-)

diff --git a/evalbench/reporting/bqstore.py b/evalbench/reporting/bqstore.py
index b724b0fc..997472ef 100644
--- a/evalbench/reporting/bqstore.py
+++ b/evalbench/reporting/bqstore.py
@@ -1,5 +1,6 @@
 from google.cloud import bigquery
 import logging
+import pandas as pd
 from reporting.report import Reporter, STORETYPE
 from util.gcp import get_gcp_project
 import urllib.parse
@@ -14,7 +15,49 @@
 
 _CHUNK_SIZE = 250
 
-_REPORT_QUERY = "WITH all_runs_with_set_tag AS ( SELECT job_id, database, REPLACE(REPLACE(REPLACE(dialects, '[', ''),']',''),'\\'','') AS dialect, id, nl_prompt, trim(generated_sql) AS generated_sql, golden_sql AS golden_sqls, eval_query AS eval_sqls, CASE WHEN generated_error IS NOT NULL THEN generated_error ELSE generated_result END AS generated_result, CASE WHEN golden_error IS NOT NULL THEN golden_error ELSE golden_result END AS golden_result, eval_results AS generated_eval_result, golden_eval_results AS golden_eval_result, DATE(run_time) AS date_of_eval, FROM evalbench.results WHERE job_id = @eval_id ) SELECT *, comparator = @correctness_scorer AS is_correctness_score, '__PROJECT_ID__' AS project_id FROM all_runs_with_set_tag AS eval LEFT JOIN ( SELECT id, job_id, score, COALESCE(dialects[SAFE_OFFSET(0)],'') AS dialect, database, comparator, IFNULL(comparison_logs, '') AS comparison_logs FROM evalbench.scores ) AS scores USING (job_id, id, dialect, database) ORDER BY date_of_eval DESC;"
+_REPORT_QUERY = """
+WITH all_runs_with_set_tag AS (
+  SELECT
+    job_id,
+    database,
+    REPLACE(REPLACE(REPLACE(dialects, '[', ''),']',''),'\'','') AS dialect,
+    id,
+    nl_prompt,
+    trim(generated_sql) AS generated_sql,
+    golden_sql AS golden_sqls,
+    eval_query AS eval_sqls,
+    CASE
+      WHEN generated_error IS NOT NULL THEN generated_error
+      ELSE generated_result
+    END AS generated_result,
+    CASE
+      WHEN golden_error IS NOT NULL THEN golden_error
+      ELSE golden_result
+    END AS golden_result,
+    eval_results AS generated_eval_result,
+    golden_eval_results AS golden_eval_result,
+    DATE(run_time) AS date_of_eval
+  FROM evalbench.results
+  WHERE job_id = @eval_id
+)
+SELECT
+  *,
+  comparator = @correctness_scorer AS is_correctness_score,
+  '__PROJECT_ID__' AS project_id
+FROM all_runs_with_set_tag AS eval
+LEFT JOIN (
+  SELECT
+    id,
+    job_id,
+    score,
+    COALESCE(dialects[SAFE_OFFSET(0)],'') AS dialect,
+    database,
+    comparator,
+    IFNULL(comparison_logs, '') AS comparison_logs
+  FROM evalbench.scores
+) AS scores USING (job_id, id, dialect, database)
+ORDER BY date_of_eval DESC;
+"""
 
 
 def _split_dataframe(df, chunk_size):
@@ -80,16 +123,44 @@ def store(self, results, type: STORETYPE):
             table = self.summary_table
 
         # Chunk this to avoid BQ OOM
-        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_APPEND  # type: ignore
+        job_config.write_disposition = (
+            bigquery.job.WriteDisposition.WRITE_APPEND  # type: ignore
+        )
         for chunk in _split_dataframe(results, _CHUNK_SIZE):
-            # Workaround for pyarrow truncation error when inserting floats into INT64 columns.
-            # This happens if BQ autodetected a column as INT64 in a previous run.
-            # Casting to string avoids the client-side pyarrow crash and lets BQ handle the conversion.
+            # Workaround for pyarrow truncation error when inserting floats
+            # into INT64 columns. This happens if BQ autodetected a column as
+            # INT64 in a previous run. Casting to string avoids the client-side
+            # pyarrow crash and lets BQ handle the conversion.
             if type in [STORETYPE.SUMMARY, STORETYPE.SCORES]:
                 chunk = chunk.copy()
-                for col in chunk.select_dtypes(include=["float64", "float32"]).columns:
-                    # Convert to string and strip .0 to avoid "Failed to parse string: '1.0' as a scalar of type int64"
-                    chunk[col] = chunk[col].astype(str).str.replace(r"\.0$", "", regex=True).replace("nan", None)
+                # 1. Identify columns that BQ might have already locked
+                # as INT64; common ones are 'metric_score', etc.
+                int_cols = [
+                    "metric_score", "totalLatencyMs",
+                    "total_results_count", "correct_results_count"
+                ]
+                for col in int_cols:
+                    if col in chunk.columns:
+                        # Round and cast to Int64 (nullable integer type)
+                        chunk[col] = pd.to_numeric(
+                            chunk[col], errors="coerce"
+                        ).round().astype("Int64")
+
+                # 2. For any remaining float columns that aren't in our
+                # "must-be-int" list, we cast to string to avoid pyarrow's
+                # strict truncation check if BQ thinks it's an int.
+                float_cols = chunk.select_dtypes(
+                    include=["float64", "float32"]
+                ).columns
+                for col in float_cols:
+                    if col not in int_cols:
+                        # Convert to string and strip .0 if effectively int
+                        chunk[col] = (
+                            chunk[col]
+                            .astype(str)
+                            .str.replace(r"\.0$", "", regex=True)
+                            .replace("nan", None)
+                        )
 
             job = self.client.load_table_from_dataframe(
                 chunk, table, job_config=job_config
             )
@@ -106,15 +177,23 @@ def print_dashboard_links(self):
             + "c.reportId=e7d7fc00-4268-45d6-b17b-160ca271a4d0"
             + "&ds.eval_results.connector=bigQuery"
             + "&ds.eval_results.type=CUSTOM_QUERY"
-            + f"&ds.eval_results.projectId={urllib.parse.quote(self.project_id)}"
-            + f"&ds.eval_results.sql={urllib.parse.quote(report_query)}"
-            + f"&ds.eval_results.billingProjectId={urllib.parse.quote(self.project_id)}"
+            + "&ds.eval_results.projectId="
+            + urllib.parse.quote(self.project_id)
+            + "&ds.eval_results.sql="
+            + urllib.parse.quote(report_query)
+            + "&ds.eval_results.billingProjectId="
+            + urllib.parse.quote(self.project_id)
             + f"&r.reportName={urllib.parse.quote(report_name)}"
             + f"&params={urllib.parse.quote(report_params)}"
         )
         if _IN_COLAB:
-            html_link = f'The evaluation report is now available on this <a href="{report_link}">Dashboard</a>!'
+            html_link = (
+                "The evaluation report is now available on this "
+                f'<a href="{report_link}">Dashboard</a>!'
+            )
             display(HTML(html_link))  # type: ignore
         else:
             print(
-                f"Results available at:\n\033[1;34m{report_link}\033[0m\n---\n")
+                "Results available at:\n"
+                f"\033[1;34m{report_link}\033[0m\n---\n"
+            )