Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion bigquery_etl/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import os
import re
from fnmatch import fnmatchcase
from functools import cache
from glob import glob
from pathlib import Path
from typing import Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple

import click
import requests
from google.auth.exceptions import DefaultCredentialsError
from google.cloud import bigquery

Expand All @@ -23,6 +25,7 @@
r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+(_v[0-9]+)?)/"
r"(?:checks\.sql)$"
)
# Endpoint serving the Glean app listings that are read by
# get_glean_app_id_to_app_name_mapping().
GLEAN_APP_LISTINGS_URL = "https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings"


def is_valid_dir(ctx, param, value):
Expand Down Expand Up @@ -250,3 +253,26 @@ def temp_dataset_option(
help="Dataset where intermediate query results will be temporarily stored, "
"formatted as PROJECT_ID.DATASET_ID",
)


@cache
def get_glean_app_id_to_app_name_mapping() -> Dict[str, str]:
    """Return a dict where key is the channel app id and the value is the shared app name.

    The mapping is built from the JSON list served at ``GLEAN_APP_LISTINGS_URL``:
    each entry's ``bq_dataset_family`` becomes a key and its ``app_name`` the
    value. Entries missing either field are skipped. A successful result is
    memoized by ``functools.cache``, so later calls reuse it instead of issuing
    another HTTP request.

    e.g. {
        "org_mozilla_firefox": "fenix",
        "org_mozilla_firefox_beta": "fenix",
        "org_mozilla_ios_firefox": "firefox_ios",
        "org_mozilla_ios_firefoxbeta": "firefox_ios",
    }

    Raises:
        requests.HTTPError: if the endpoint responds with an error status.
        requests.Timeout: if the endpoint does not respond within the timeout.
    """
    # Always bound the request: without a timeout, requests waits forever on
    # an unresponsive server and would hang every caller of this function.
    response = requests.get(GLEAN_APP_LISTINGS_URL, timeout=30)
    response.raise_for_status()

    app_listings = response.json()

    return {
        app["bq_dataset_family"]: app["app_name"]
        for app in app_listings
        if "bq_dataset_family" in app and "app_name" in app
    }
43 changes: 28 additions & 15 deletions bigquery_etl/copy_deduplicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@
from google.api_core.exceptions import BadRequest
from google.cloud import bigquery

from bigquery_etl.cli.utils import table_matches_patterns
from bigquery_etl.cli.utils import (
get_glean_app_id_to_app_name_mapping,
parallelism_option,
project_id_option,
table_matches_patterns,
)
from bigquery_etl.config import ConfigLoader
from bigquery_etl.util.bigquery_id import sql_table_id
from bigquery_etl.util.client_queue import ClientQueue
from bigquery_etl.util.common import TempDatasetReference

from .cli.utils import parallelism_option, project_id_option

QUERY_TEMPLATE = """
WITH
-- Distinct document_ids and their minimum submission_timestamp today
Expand Down Expand Up @@ -97,33 +100,43 @@ def _has_field_path(schema: List[bigquery.SchemaField], path: List[str]) -> bool
def _select_geo(live_table: str, client: bigquery.Client) -> str:
"""Build a SELECT REPLACE clause that NULLs metadata.geo.* if applicable."""
_, dataset_id, table_id = live_table.split(".")
channel_to_app_name = get_glean_app_id_to_app_name_mapping()
app_id = re.sub("_live$", "", dataset_id)

excluded_apps = set(ConfigLoader.get("geo_deprecation", "skip_apps", fallback=[]))
app_name = channel_to_app_name.get(app_id)
if app_name in excluded_apps:
return ""

excluded_tables = set(
ConfigLoader.get("geo_deprecation", "skip_tables", fallback=[])
)
if re.sub(r"_v\d+$", "", table_id) in excluded_tables:
return ""

app_id = dataset_id.removesuffix("_live")
included_apps = set(
ConfigLoader.get("geo_deprecation", "include_app_ids", fallback=[])
)
if app_id not in included_apps:
return ""

table = client.get_table(live_table)

# Only deprecate the geo fields for Glean apps. Legacy tables will be deprecated after the Glean migration.
if app_id not in channel_to_app_name.keys():
return ""

# only glean tables have this label
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean we're explicitly only doing this for glean apps? I don't remember if this was discussed already

Copy link
Contributor Author

@wwyc wwyc Nov 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we are only deprecating the geo fields for Glean apps/tables for now. After the Glean migration is completed, the legacy tables will be deprecated as well.

include_client_id = table.labels.get("include_client_id") == "true"
if not include_client_id:
return ""

# Check schema to ensure geo fields exists
# Check schema to ensure required fields exist
schema = table.schema
required_fields = ("city", "subdivision1", "subdivision2")
has_required_fields = all(
_has_field_path(schema, ["metadata", "geo", field]) for field in required_fields
has_client_id_field = _has_field_path(schema, ["client_info", "client_id"])
if not has_client_id_field:
return ""

required_geo_fields = ("city", "subdivision1", "subdivision2")
has_required_geo_fields = all(
_has_field_path(schema, ["metadata", "geo", field])
for field in required_geo_fields
)
if not has_required_fields:
if not has_required_geo_fields:
return ""

return """
Expand Down
30 changes: 4 additions & 26 deletions bigquery_etl/shredder/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@
from functools import partial
from itertools import chain
from multiprocessing.pool import ThreadPool
from typing import Dict, List
from typing import List

import requests
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

from bigquery_etl.cli.utils import get_glean_app_id_to_app_name_mapping

from ..util.bigquery_id import qualified_table_id

MOZDATA = "mozdata"
SHARED_PROD = "moz-fx-data-shared-prod"
GLEAN_SCHEMA_ID = "glean_ping_1"
GLEAN_MIN_SCHEMA_ID = "glean-min_ping_1"
GLEAN_APP_LISTINGS_URL = "https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings"


@dataclass(frozen=True)
Expand Down Expand Up @@ -789,7 +789,7 @@ def stable_tables_by_schema(schema_id):

glean_stable_tables = stable_tables_by_schema(GLEAN_SCHEMA_ID)

channel_to_app_name = get_glean_channel_to_app_name_mapping()
channel_to_app_name = get_glean_app_id_to_app_name_mapping()

# create mapping of dataset -> (tables containing associated deletion requests)
# construct values as tuples because that is what they must be in the return type
Expand Down Expand Up @@ -958,28 +958,6 @@ def stable_tables_by_schema(schema_id):
}


def get_glean_channel_to_app_name_mapping() -> Dict[str, str]:
    """Return a dict where key is the channel app id and the value is the shared app name.

    The mapping is built from the JSON list served at ``GLEAN_APP_LISTINGS_URL``:
    each entry's ``bq_dataset_family`` becomes a key and its ``app_name`` the
    value. Entries missing either field are skipped.

    e.g. {
        "org_mozilla_firefox": "fenix",
        "org_mozilla_firefox_beta": "fenix",
        "org_mozilla_ios_firefox": "firefox_ios",
        "org_mozilla_ios_firefoxbeta": "firefox_ios",
    }

    Raises:
        requests.HTTPError: if the endpoint responds with an error status.
        requests.Timeout: if the endpoint does not respond within the timeout.
    """
    # Always bound the request: without a timeout, requests waits forever on
    # an unresponsive server and would hang every caller of this function.
    response = requests.get(GLEAN_APP_LISTINGS_URL, timeout=30)
    response.raise_for_status()

    app_listings = response.json()

    return {
        app["bq_dataset_family"]: app["app_name"]
        for app in app_listings
        if "bq_dataset_family" in app and "app_name" in app
    }


def _list_tables(
dataset_ref: bigquery.DatasetReference,
client: bigquery.Client,
Expand Down
26 changes: 7 additions & 19 deletions bqetl_project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -563,27 +563,15 @@ generate:
- firefox_desktop_background_tasks

geo_deprecation:
include_app_ids:
- org_mozilla_ios_klar
- org_mozilla_klar
- org_mozilla_ios_focus
- org_mozilla_focus
- org_mozilla_focus_beta
- org_mozilla_focus_nightly
- org_mozilla_ios_firefox
- org_mozilla_ios_firefoxbeta
- org_mozilla_ios_fennec
- org_mozilla_firefox
- org_mozilla_firefox_beta
- org_mozilla_fenix
- org_mozilla_fenix_nightly
- org_mozilla_fennec_aurora
- firefox_desktop
- firefox_desktop_background_update
- firefox_desktop_background_defaultagent
- firefox_desktop_background_tasks
skip_apps:
- ads_backend
- mozilla_vpn
- mozillavpn_backend_cirrus
- accounts_backend
- accounts_frontend
skip_tables:
- newtab
- newtab_content

retention_exclusion_list:
- sql/moz-fx-data-shared-prod/search_derived/acer_cohort_v1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from google.cloud.bigquery import TableReference
from google.cloud.exceptions import NotFound

from bigquery_etl.cli.utils import get_glean_app_id_to_app_name_mapping
from bigquery_etl.schema import Schema
from bigquery_etl.shredder.config import (
CLIENT_ID,
Expand All @@ -22,7 +23,6 @@
SHARED_PROD,
DeleteSource,
find_glean_targets,
get_glean_channel_to_app_name_mapping,
)

FIND_TABLES_QUERY_TEMPLATE = """
Expand Down Expand Up @@ -171,7 +171,7 @@ def get_associated_deletions(
dataset_name.replace("_derived", "_stable")
] = f"{dataset_name}.additional_deletion_requests_v1"

glean_channel_names = get_glean_channel_to_app_name_mapping()
glean_channel_names = get_glean_app_id_to_app_name_mapping()

for table_name, stable_tables in upstream_stable_tables.items():
deletion_tables: Set[DeleteSource] = set()
Expand Down Expand Up @@ -248,7 +248,7 @@ def get_missing_deletions(
bigquery_client = bigquery.Client()
glean_delete_targets = find_glean_targets(pool, client=bigquery_client)

glean_channel_names = get_glean_channel_to_app_name_mapping()
glean_channel_names = get_glean_app_id_to_app_name_mapping()
glean_app_name_to_channels = defaultdict(list)
for channel, app_name in glean_channel_names.items():
glean_app_name_to_channels[app_name].append(channel)
Expand Down
10 changes: 5 additions & 5 deletions tests/shredder/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from google.cloud import bigquery
from google.cloud.bigquery import DatasetReference

from bigquery_etl.cli.utils import get_glean_app_id_to_app_name_mapping
from bigquery_etl.shredder.config import (
CLIENT_ID,
DELETE_TARGETS,
Expand All @@ -14,7 +15,6 @@
DeleteTarget,
_list_tables,
find_glean_targets,
get_glean_channel_to_app_name_mapping,
)

GLEAN_APP_LISTING = [
Expand Down Expand Up @@ -130,7 +130,7 @@ def get_table(self, table_ref):
return table


@mock.patch("bigquery_etl.shredder.config.requests")
@mock.patch("bigquery_etl.cli.utils.requests")
def test_glean_targets(mock_requests):
mock_response = mock.Mock()
mock_response.json.return_value = GLEAN_APP_LISTING
Expand Down Expand Up @@ -295,7 +295,7 @@ def test_glean_targets(mock_requests):
}


@mock.patch("bigquery_etl.shredder.config.requests")
@mock.patch("bigquery_etl.cli.utils.requests")
def test_glean_targets_override(mock_requests):
"""Targets in GLEAN_DERIVED_OVERRIDES should override the target in find_glean_targets."""

Expand Down Expand Up @@ -379,13 +379,13 @@ def get_table(self, table_ref):
}


@mock.patch("bigquery_etl.shredder.config.requests")
@mock.patch("bigquery_etl.cli.utils.requests")
def test_glean_channel_app_mapping(mock_requests):
mock_response = mock.Mock()
mock_response.json.return_value = GLEAN_APP_LISTING
mock_requests.get.return_value = mock_response

actual = get_glean_channel_to_app_name_mapping()
actual = get_glean_app_id_to_app_name_mapping()

expected = {
"org_mozilla_firefox": "fenix",
Expand Down
Loading