diff --git a/bigquery_etl/cli/utils.py b/bigquery_etl/cli/utils.py
index 076d07cbe79..75b3f8f43e2 100644
--- a/bigquery_etl/cli/utils.py
+++ b/bigquery_etl/cli/utils.py
@@ -4,11 +4,13 @@
 import os
 import re
 from fnmatch import fnmatchcase
+from functools import cache
 from glob import glob
 from pathlib import Path
-from typing import Iterator, List, Optional, Tuple
+from typing import Dict, Iterator, List, Optional, Tuple
 
 import click
+import requests
 from google.auth.exceptions import DefaultCredentialsError
 from google.cloud import bigquery
 
@@ -23,6 +25,7 @@
     r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+(_v[0-9]+)?)/"
     r"(?:checks\.sql)$"
 )
+GLEAN_APP_LISTINGS_URL = "https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings"
 
 
 def is_valid_dir(ctx, param, value):
@@ -250,3 +253,26 @@ def temp_dataset_option(
         help="Dataset where intermediate query results will be temporarily stored, "
         "formatted as PROJECT_ID.DATASET_ID",
     )
+
+
+@cache
+def get_glean_app_id_to_app_name_mapping() -> Dict[str, str]:
+    """Return a dict where key is the channel app id and the value is the shared app name.
+
+    e.g. {
+        "org_mozilla_firefox": "fenix",
+        "org_mozilla_firefox_beta": "fenix",
+        "org_mozilla_ios_firefox": "firefox_ios",
+        "org_mozilla_ios_firefoxbeta": "firefox_ios",
+    }
+    """
+    response = requests.get(GLEAN_APP_LISTINGS_URL)
+    response.raise_for_status()
+
+    app_listings = response.json()
+
+    return {
+        app["bq_dataset_family"]: app["app_name"]
+        for app in app_listings
+        if "bq_dataset_family" in app and "app_name" in app
+    }
diff --git a/bigquery_etl/copy_deduplicate.py b/bigquery_etl/copy_deduplicate.py
index 7618a7d76ac..a675a911c8d 100644
--- a/bigquery_etl/copy_deduplicate.py
+++ b/bigquery_etl/copy_deduplicate.py
@@ -20,14 +20,17 @@
 from google.api_core.exceptions import BadRequest
 from google.cloud import bigquery
 
-from bigquery_etl.cli.utils import table_matches_patterns
+from bigquery_etl.cli.utils import (
+    get_glean_app_id_to_app_name_mapping,
+    parallelism_option,
+    project_id_option,
+    table_matches_patterns,
+)
 from bigquery_etl.config import ConfigLoader
 from bigquery_etl.util.bigquery_id import sql_table_id
 from bigquery_etl.util.client_queue import ClientQueue
 from bigquery_etl.util.common import TempDatasetReference
 
-from .cli.utils import parallelism_option, project_id_option
-
 QUERY_TEMPLATE = """
 WITH
 -- Distinct document_ids and their minimum submission_timestamp today
@@ -97,6 +100,13 @@ def _has_field_path(schema: List[bigquery.SchemaField], path: List[str]) -> bool
 def _select_geo(live_table: str, client: bigquery.Client) -> str:
     """Build a SELECT REPLACE clause that NULLs metadata.geo.* if applicable."""
     _, dataset_id, table_id = live_table.split(".")
+    channel_to_app_name = get_glean_app_id_to_app_name_mapping()
+    app_id = re.sub("_live$", "", dataset_id)
+
+    excluded_apps = set(ConfigLoader.get("geo_deprecation", "skip_apps", fallback=[]))
+    app_name = channel_to_app_name.get(app_id)
+    if app_name in excluded_apps:
+        return ""
 
     excluded_tables = set(
         ConfigLoader.get("geo_deprecation", "skip_tables", fallback=[])
@@ -104,26 +114,29 @@ def _select_geo(live_table: str, client: bigquery.Client) -> str:
     if re.sub(r"_v\d+$", "", table_id) in excluded_tables:
         return ""
 
-    app_id = dataset_id.removesuffix("_live")
-    included_apps = set(
-        ConfigLoader.get("geo_deprecation", "include_app_ids", fallback=[])
-    )
-    if app_id not in included_apps:
-        return ""
-
     table = client.get_table(live_table)
+    # Only deprecating the geo fields for glean apps.
+    # Legacy tables will be deprecated after the glean migration.
+    if app_id not in channel_to_app_name.keys():
+        return ""
+
+    # Only glean tables have this label.
     include_client_id = table.labels.get("include_client_id") == "true"
     if not include_client_id:
         return ""
 
-    # Check schema to ensure geo fields exists
+    # Check schema to ensure required fields exist
     schema = table.schema
-    required_fields = ("city", "subdivision1", "subdivision2")
-    has_required_fields = all(
-        _has_field_path(schema, ["metadata", "geo", field]) for field in required_fields
+    has_client_id_field = _has_field_path(schema, ["client_info", "client_id"])
+    if not has_client_id_field:
+        return ""
+
+    required_geo_fields = ("city", "subdivision1", "subdivision2")
+    has_required_geo_fields = all(
+        _has_field_path(schema, ["metadata", "geo", field])
+        for field in required_geo_fields
     )
-    if not has_required_fields:
+    if not has_required_geo_fields:
         return ""
 
     return """
diff --git a/bigquery_etl/shredder/config.py b/bigquery_etl/shredder/config.py
index 9c0049a9214..4565615785f 100755
--- a/bigquery_etl/shredder/config.py
+++ b/bigquery_etl/shredder/config.py
@@ -9,19 +9,19 @@
 from functools import partial
 from itertools import chain
 from multiprocessing.pool import ThreadPool
-from typing import Dict, List
+from typing import List
 
-import requests
 from google.cloud import bigquery
 from google.cloud.exceptions import NotFound
 
+from bigquery_etl.cli.utils import get_glean_app_id_to_app_name_mapping
+
 from ..util.bigquery_id import qualified_table_id
 
 MOZDATA = "mozdata"
 SHARED_PROD = "moz-fx-data-shared-prod"
 GLEAN_SCHEMA_ID = "glean_ping_1"
 GLEAN_MIN_SCHEMA_ID = "glean-min_ping_1"
-GLEAN_APP_LISTINGS_URL = "https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings"
 
 
 @dataclass(frozen=True)
@@ -789,7 +789,7 @@ def stable_tables_by_schema(schema_id):
 
     glean_stable_tables = stable_tables_by_schema(GLEAN_SCHEMA_ID)
 
-    channel_to_app_name = get_glean_channel_to_app_name_mapping()
+    channel_to_app_name = get_glean_app_id_to_app_name_mapping()
 
     # create mapping of dataset -> (tables containing associated deletion requests)
     # construct values as tuples because that is what they must be in the return type
@@ -958,28 +958,6 @@ def stable_tables_by_schema(schema_id):
     }
 
 
-def get_glean_channel_to_app_name_mapping() -> Dict[str, str]:
-    """Return a dict where key is the channel app id and the value is the shared app name.
-
-    e.g. {
-        "org_mozilla_firefox": "fenix",
-        "org_mozilla_firefox_beta": "fenix",
-        "org_mozilla_ios_firefox": "firefox_ios",
-        "org_mozilla_ios_firefoxbeta": "firefox_ios",
-    }
-    """
-    response = requests.get(GLEAN_APP_LISTINGS_URL)
-    response.raise_for_status()
-
-    app_listings = response.json()
-
-    return {
-        app["bq_dataset_family"]: app["app_name"]
-        for app in app_listings
-        if "bq_dataset_family" in app and "app_name" in app
-    }
-
-
 def _list_tables(
     dataset_ref: bigquery.DatasetReference,
     client: bigquery.Client,
diff --git a/bqetl_project.yaml b/bqetl_project.yaml
index 37e693d38bf..bde7cc80c77 100644
--- a/bqetl_project.yaml
+++ b/bqetl_project.yaml
@@ -563,27 +563,15 @@ generate:
       - firefox_desktop_background_tasks
 
   geo_deprecation:
-    include_app_ids:
-      - org_mozilla_ios_klar
-      - org_mozilla_klar
-      - org_mozilla_ios_focus
-      - org_mozilla_focus
-      - org_mozilla_focus_beta
-      - org_mozilla_focus_nightly
-      - org_mozilla_ios_firefox
-      - org_mozilla_ios_firefoxbeta
-      - org_mozilla_ios_fennec
-      - org_mozilla_firefox
-      - org_mozilla_firefox_beta
-      - org_mozilla_fenix
-      - org_mozilla_fenix_nightly
-      - org_mozilla_fennec_aurora
-      - firefox_desktop
-      - firefox_desktop_background_update
-      - firefox_desktop_background_defaultagent
-      - firefox_desktop_background_tasks
+    skip_apps:
+      - ads_backend
+      - mozilla_vpn
+      - mozillavpn_backend_cirrus
+      - accounts_backend
+      - accounts_frontend
     skip_tables:
       - newtab
+      - newtab_content
 
   retention_exclusion_list:
     - sql/moz-fx-data-shared-prod/search_derived/acer_cohort_v1
diff --git a/sql/moz-fx-data-shared-prod/monitoring_derived/shredder_targets_v1/query.py b/sql/moz-fx-data-shared-prod/monitoring_derived/shredder_targets_v1/query.py
index 0ac9bf0517c..1e2d9894936 100644
--- a/sql/moz-fx-data-shared-prod/monitoring_derived/shredder_targets_v1/query.py
+++ b/sql/moz-fx-data-shared-prod/monitoring_derived/shredder_targets_v1/query.py
@@ -14,6 +14,7 @@
 from google.cloud.bigquery import TableReference
 from google.cloud.exceptions import NotFound
 
+from bigquery_etl.cli.utils import get_glean_app_id_to_app_name_mapping
 from bigquery_etl.schema import Schema
 from bigquery_etl.shredder.config import (
     CLIENT_ID,
@@ -22,7 +23,6 @@
     SHARED_PROD,
     DeleteSource,
     find_glean_targets,
-    get_glean_channel_to_app_name_mapping,
 )
 
 FIND_TABLES_QUERY_TEMPLATE = """
@@ -171,7 +171,7 @@ def get_associated_deletions(
         dataset_name.replace("_derived", "_stable")
     ] = f"{dataset_name}.additional_deletion_requests_v1"
 
-    glean_channel_names = get_glean_channel_to_app_name_mapping()
+    glean_channel_names = get_glean_app_id_to_app_name_mapping()
 
     for table_name, stable_tables in upstream_stable_tables.items():
         deletion_tables: Set[DeleteSource] = set()
@@ -248,7 +248,7 @@ def get_missing_deletions(
     bigquery_client = bigquery.Client()
     glean_delete_targets = find_glean_targets(pool, client=bigquery_client)
 
-    glean_channel_names = get_glean_channel_to_app_name_mapping()
+    glean_channel_names = get_glean_app_id_to_app_name_mapping()
     glean_app_name_to_channels = defaultdict(list)
     for channel, app_name in glean_channel_names.items():
         glean_app_name_to_channels[app_name].append(channel)
diff --git a/tests/shredder/test_config.py b/tests/shredder/test_config.py
index 65d395ea766..fa5d811ef0f 100644
--- a/tests/shredder/test_config.py
+++ b/tests/shredder/test_config.py
@@ -6,6 +6,7 @@
 from google.cloud import bigquery
 from google.cloud.bigquery import DatasetReference
 
+from bigquery_etl.cli.utils import get_glean_app_id_to_app_name_mapping
 from bigquery_etl.shredder.config import (
     CLIENT_ID,
     DELETE_TARGETS,
@@ -14,7 +15,6 @@
     DeleteTarget,
     _list_tables,
     find_glean_targets,
-    get_glean_channel_to_app_name_mapping,
 )
 
 GLEAN_APP_LISTING = [
@@ -130,7 +130,7 @@ def get_table(self, table_ref):
         return table
 
 
-@mock.patch("bigquery_etl.shredder.config.requests")
+@mock.patch("bigquery_etl.cli.utils.requests")
 def test_glean_targets(mock_requests):
     mock_response = mock.Mock()
     mock_response.json.return_value = GLEAN_APP_LISTING
@@ -295,7 +295,7 @@ def test_glean_targets(mock_requests):
     }
 
 
-@mock.patch("bigquery_etl.shredder.config.requests")
+@mock.patch("bigquery_etl.cli.utils.requests")
 def test_glean_targets_override(mock_requests):
     """Targets in GLEAN_DERIVED_OVERRIDES should override the target in find_glean_targets."""
 
@@ -379,13 +379,13 @@ def get_table(self, table_ref):
     }
 
 
-@mock.patch("bigquery_etl.shredder.config.requests")
+@mock.patch("bigquery_etl.cli.utils.requests")
 def test_glean_channel_app_mapping(mock_requests):
     mock_response = mock.Mock()
     mock_response.json.return_value = GLEAN_APP_LISTING
     mock_requests.get.return_value = mock_response
 
-    actual = get_glean_channel_to_app_name_mapping()
+    actual = get_glean_app_id_to_app_name_mapping()
 
     expected = {
         "org_mozilla_firefox": "fenix",
diff --git a/tests/test_copy_deduplicate.py b/tests/test_copy_deduplicate.py
new file mode 100644
index 00000000000..99afcf57ce9
--- /dev/null
+++ b/tests/test_copy_deduplicate.py
@@ -0,0 +1,453 @@
+from unittest.mock import Mock, patch
+
+import pytest
+from click.testing import CliRunner
+from google.cloud import bigquery
+
+from bigquery_etl.cli.query import (  # noqa: F401 # keeps circular import from exploding
+    run,
+)
+from bigquery_etl.copy_deduplicate import _has_field_path, _select_geo, copy_deduplicate
+
+PROJECT_ID = "moz-fx-data-shared-prod"
+
+GLEAN_MAPPING = {
+    "org_mozilla_firefox": "firefox",
+    "ads_backend": "ads_backend",
+}
+
+VALID_GEO_DEPRECATION_SCHEMA = [
+    bigquery.SchemaField("document_id", "STRING"),
+    bigquery.SchemaField(
+        "client_info",
+        "RECORD",
+        fields=[
+            bigquery.SchemaField("client_id", "STRING"),
+        ],
+    ),
+    bigquery.SchemaField(
+        "metadata",
+        "RECORD",
+        fields=[
+            bigquery.SchemaField(
+                "geo",
+                "RECORD",
+                fields=[
+                    bigquery.SchemaField("city", "STRING"),
+                    bigquery.SchemaField("subdivision1", "STRING"),
+                    bigquery.SchemaField("subdivision2", "STRING"),
+                ],
+            )
+        ],
+    ),
+]
+
+GEO_CONFIG = {
+    "geo_deprecation": {
+        "skip_apps": ["ads_backend"],
+        "skip_tables": ["newtab"],
+    }
+}
+
+
+class TestCopyDeduplicate:
+
+    @pytest.fixture
+    def runner(self):
+        return CliRunner()
+
+    @pytest.fixture
+    def valid_geo_deprecation_schema_table(self):
+        """Table with include_client_id=true and all required geo fields present."""
+        table = Mock()
+        table.labels = {"include_client_id": "true"}
+        table.schema = VALID_GEO_DEPRECATION_SCHEMA
+        return table
+
+    @pytest.fixture
+    def mock_bq_env(self, monkeypatch):
+        """
+        Fully self-contained environment:
+        - Patches get_glean_app_id_to_app_name_mapping and ConfigLoader.get
+        - Optionally wires a provided ClientQueue mock when client_queue_cls is given
+        - Returns (mock_client, captured_calls, run_dedup_side_effect)
+        """
+
+        mapping_mock = Mock()
+        config_get_mock = Mock()
+
+        monkeypatch.setattr(
+            "bigquery_etl.copy_deduplicate.get_glean_app_id_to_app_name_mapping",
+            mapping_mock,
+        )
+        monkeypatch.setattr(
+            "bigquery_etl.copy_deduplicate.ConfigLoader.get",
+            config_get_mock,
+        )
+
+        def _configure(
+            *,
+            mapping: dict | None = None,
+            config: dict | None = None,
+            table=None,
+            client_queue_cls=None,
+        ):
+            def make_config_side_effect(config_dict: dict):
+                def _side_effect(section=None, key=None, fallback=None):
+                    section_dict = config_dict.get(section)
+                    if section_dict:
+                        return section_dict.get(key, fallback)
+                    return fallback
+
+                return _side_effect
+
+            mapping_mock.return_value = mapping or {}
+            config_get_mock.side_effect = make_config_side_effect(config or {})
+
+            mock_client = Mock(spec=bigquery.Client)
+            if table is not None:
+                mock_client.get_table.return_value = table
+
+            captured_calls = None
+            run_dedup_side_effect = None
+
+            if client_queue_cls is not None:
+                client_queue = client_queue_cls.return_value
+                client_ctx = client_queue.client.return_value
+                client_ctx.__enter__.return_value = mock_client
+                client_ctx.__exit__.return_value = False
+
+                client_queue.with_client.side_effect = (
+                    lambda func, *args, **kwargs: func(mock_client, *args, **kwargs)
+                )
+
+                captured_calls = []
+
+                def _run_dedup_side_effect(
+                    client, sql, stable_table, job_config, num_retries
+                ):
+                    captured_calls.append(
+                        (client, sql, stable_table, job_config, num_retries)
+                    )
+                    fake_job = Mock()
+                    return stable_table, fake_job
+
+                run_dedup_side_effect = _run_dedup_side_effect
+
+            return mock_client, captured_calls, run_dedup_side_effect
+
+        return _configure
+
+    @patch("bigquery_etl.copy_deduplicate._copy_join_parts")
+    @patch("bigquery_etl.copy_deduplicate._run_deduplication_query")
+    @patch("bigquery_etl.copy_deduplicate._list_live_tables")
+    @patch("bigquery_etl.copy_deduplicate.ClientQueue")
+    def test_copy_deduplicate_geo_deprecation_sql(
+        self,
+        mock_client_queue_cls,
+        mock_list_live_tables,
+        mock_run_dedup,
+        mock_copy_join_parts,
+        runner,
+        mock_bq_env,
+        valid_geo_deprecation_schema_table,
+    ):
+        mock_client, captured_calls, run_dedup_side_effect = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+            table=valid_geo_deprecation_schema_table,
+            client_queue_cls=mock_client_queue_cls,
+        )
+
+        mock_run_dedup.side_effect = run_dedup_side_effect
+        mock_copy_join_parts.return_value = None
+
+        mock_list_live_tables.return_value = [
+            f"{PROJECT_ID}.org_mozilla_firefox_live.baseline_v1",
+            f"{PROJECT_ID}.org_mozilla_firefox_live.newtab_v1",
+            f"{PROJECT_ID}.ads_backend_live.events_v1",
+            f"{PROJECT_ID}.telemetry_live.events_v1",
+        ]
+
+        submission_date = "2020-11-01"
+
+        result = runner.invoke(
+            copy_deduplicate,
+            [
+                f"--project_id={PROJECT_ID}",
+                f"--date={submission_date}",
+            ],
+        )
+
+        assert result.exit_code == 0
+        # 4 tables listed in mock_list_live_tables.return_value
+        assert len(captured_calls) == 4
+
+        partition = submission_date.replace("-", "")
+
+        # 1) org_mozilla_firefox_live.baseline_v1: should include geo REPLACE clause
+        _, sql_arg, stable_table_arg, _, _ = captured_calls[0]
+        assert "REPLACE (" in sql_arg
+        assert "CAST(NULL AS STRING) AS city" in sql_arg
+        assert (
+            f"{mock_list_live_tables.return_value[0].replace('_live', '_stable')}${partition}"
+            == stable_table_arg
+        )
+
+        # 2) newtab_v1: should be skipped by skip_tables
+        _, sql_arg, stable_table_arg, _, _ = captured_calls[1]
+        assert "REPLACE (" not in sql_arg
+        assert (
+            f"{mock_list_live_tables.return_value[1].replace('_live', '_stable')}${partition}"
+            == stable_table_arg
+        )
+
+        # 3) ads_backend_live: should be skipped by skip_apps
+        _, sql_arg, stable_table_arg, _, _ = captured_calls[2]
+        assert "REPLACE (" not in sql_arg
+        assert (
+            f"{mock_list_live_tables.return_value[2].replace('_live', '_stable')}${partition}"
+            == stable_table_arg
+        )
+
+        # 4) telemetry_live: not in GLEAN_MAPPING
+        _, sql_arg, stable_table_arg, _, _ = captured_calls[3]
+        assert "REPLACE (" not in sql_arg
+        assert (
+            f"{mock_list_live_tables.return_value[3].replace('_live', '_stable')}${partition}"
+            == stable_table_arg
+        )
+
+    def test_has_field_path_top_level_and_nested(self):
+        # top-level field
+        assert _has_field_path(VALID_GEO_DEPRECATION_SCHEMA, ["document_id"])
+        assert not _has_field_path(VALID_GEO_DEPRECATION_SCHEMA, ["missing_top_level"])
+
+        # nested fields
+        assert _has_field_path(
+            VALID_GEO_DEPRECATION_SCHEMA, ["client_info", "client_id"]
+        )
+        assert _has_field_path(
+            VALID_GEO_DEPRECATION_SCHEMA, ["metadata", "geo", "city"]
+        )
+        assert _has_field_path(
+            VALID_GEO_DEPRECATION_SCHEMA, ["metadata", "geo", "subdivision1"]
+        )
+        assert _has_field_path(
+            VALID_GEO_DEPRECATION_SCHEMA, ["metadata", "geo", "subdivision2"]
+        )
+        assert not _has_field_path(
+            VALID_GEO_DEPRECATION_SCHEMA, ["metadata", "geo", "nope"]
+        )
+
+    def test_select_geo_with_required_fields_present(
+        self,
+        valid_geo_deprecation_schema_table,
+        mock_bq_env,
+    ):
+        """If geo fields exist and conditions pass, we should get the NULLing SQL."""
+        mock_client, _, _ = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+            table=valid_geo_deprecation_schema_table,
+        )
+
+        live_table = "moz-fx-data-shared-prod.org_mozilla_firefox_live.baseline_v1"
+        sql = _select_geo(live_table, mock_client)
+
+        assert (
+            "org_mozilla_firefox_live" not in GEO_CONFIG["geo_deprecation"]["skip_apps"]
+        )
+        assert "baseline" not in GEO_CONFIG["geo_deprecation"]["skip_tables"]
+        assert "REPLACE (" in sql
+        assert "metadata.geo" in sql
+        assert "CAST(NULL AS STRING) AS city" in sql
+        assert "CAST(NULL AS STRING) AS subdivision1" in sql
+        assert "CAST(NULL AS STRING) AS subdivision2" in sql
+
+    def test_select_geo_when_skipped_app(
+        self,
+        mock_bq_env,
+    ):
+        """If app_name is in geo_deprecation.skip_apps, we should get an empty string."""
+        mock_client, _, _ = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+        )
+
+        live_table = "moz-fx-data-shared-prod.ads_backend_live.events_v1"
+        sql = _select_geo(live_table, mock_client)
+
+        assert "ads_backend" in GEO_CONFIG["geo_deprecation"]["skip_apps"]
+        assert sql == ""
+
+    def test_select_geo_when_skipped_table(
+        self,
+        mock_bq_env,
+    ):
+        """If table is in geo_deprecation.skip_tables, we should get an empty string."""
+        mock_client, _, _ = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+        )
+
+        live_table = "moz-fx-data-shared-prod.firefox_ios_live.newtab_v1"
+        sql = _select_geo(live_table, mock_client)
+
+        assert "newtab" in GEO_CONFIG["geo_deprecation"]["skip_tables"]
+        assert sql == ""
+
+    def test_select_geo_when_not_glean_app(
+        self,
+        mock_bq_env,
+    ):
+        """If app_id is not in the glean mapping, we should get an empty string."""
+        mock_client, _, _ = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+        )
+
+        live_table = "moz-fx-data-shared-prod.telemetry_live.baseline_v1"
+        sql = _select_geo(live_table, mock_client)
+
+        assert "telemetry" not in GLEAN_MAPPING
+        assert sql == ""
+
+    def test_select_geo_when_client_id_label_false(
+        self,
+        mock_bq_env,
+    ):
+        """If include_client_id label is false, we should get an empty string."""
+        table = Mock()
+        table.labels = {"include_client_id": "false"}
+        table.schema = VALID_GEO_DEPRECATION_SCHEMA
+
+        mock_client, _, _ = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+            table=table,
+        )
+
+        live_table = "moz-fx-data-shared-prod.org_mozilla_firefox_live.baseline_v1"
+        sql = _select_geo(live_table, mock_client)
+
+        assert table.labels["include_client_id"] == "false"
+        assert sql == ""
+
+    def test_select_geo_when_client_id_label_missing(
+        self,
+        mock_bq_env,
+    ):
+        """If include_client_id label is missing, we should get an empty string."""
+        table = Mock()
+        table.labels = {"owner1": "wichan"}  # no include_client_id
+        table.schema = VALID_GEO_DEPRECATION_SCHEMA
+
+        mock_client, _, _ = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+            table=table,
+        )
+
+        live_table = "moz-fx-data-shared-prod.org_mozilla_firefox_live.baseline_v1"
+        sql = _select_geo(live_table, mock_client)
+
+        assert "include_client_id" not in table.labels
+        assert sql == ""
+
+    def test_select_geo_when_client_id_field_missing(
+        self,
+        mock_bq_env,
+    ):
+        """If the client_info.client_id field is missing, we should get an empty string."""
+        invalid_geo_deprecation_schema = [
+            bigquery.SchemaField("document_id", "STRING"),
+            bigquery.SchemaField(
+                "client_info",
+                "RECORD",
+                fields=[
+                    bigquery.SchemaField("os", "STRING"),
+                ],
+            ),
+            bigquery.SchemaField(
+                "metadata",
+                "RECORD",
+                fields=[
+                    bigquery.SchemaField(
+                        "geo",
+                        "RECORD",
+                        fields=[
+                            bigquery.SchemaField("city", "STRING"),
+                            bigquery.SchemaField("subdivision1", "STRING"),
+                            bigquery.SchemaField("subdivision2", "STRING"),
+                        ],
+                    )
+                ],
+            ),
+        ]
+
+        table = Mock()
+        table.labels = {"include_client_id": "true"}
+        table.schema = invalid_geo_deprecation_schema
+
+        mock_client, _, _ = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+            table=table,
+        )
+
+        live_table = "moz-fx-data-shared-prod.org_mozilla_firefox_live.baseline_v1"
+        sql = _select_geo(live_table, mock_client)
+
+        assert not _has_field_path(
+            invalid_geo_deprecation_schema, ["client_info", "client_id"]
+        )
+        assert sql == ""
+
+    def test_select_geo_when_geo_field_missing(
+        self,
+        mock_bq_env,
+    ):
+        """If one of the required geo fields is missing, we should get an empty string."""
+        invalid_geo_deprecation_schema = [
+            bigquery.SchemaField("document_id", "STRING"),
+            bigquery.SchemaField(
+                "client_info",
+                "RECORD",
+                fields=[
+                    bigquery.SchemaField("client_id", "STRING"),
+                ],
+            ),
+            bigquery.SchemaField(
+                "metadata",
+                "RECORD",
+                fields=[
+                    bigquery.SchemaField(
+                        "geo",
+                        "RECORD",
+                        fields=[
+                            bigquery.SchemaField("subdivision1", "STRING"),
+                            bigquery.SchemaField("subdivision2", "STRING"),
+                        ],
+                    )
+                ],
+            ),
+        ]
+
+        table = Mock()
+        table.labels = {"include_client_id": "true"}
+        table.schema = invalid_geo_deprecation_schema
+
+        mock_client, _, _ = mock_bq_env(
+            mapping=GLEAN_MAPPING,
+            config=GEO_CONFIG,
+            table=table,
+        )
+
+        live_table = "moz-fx-data-shared-prod.org_mozilla_firefox_live.baseline_v1"
+        sql = _select_geo(live_table, mock_client)
+
+        assert not _has_field_path(
+            invalid_geo_deprecation_schema, ["metadata", "geo", "city"]
+        )
+        assert sql == ""