diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 00000000..1e3bbea2
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,3 @@
+pattern:
+  - changed-files:
+      - any-glob-to-any-file: 'patterns/community/**/*.json'
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 00000000..0d552523
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,23 @@
+name: Auto-label PRs
+
+# pull_request_target runs in the context of the BASE branch and has write
+# access to the PR. The actions/labeler action only reads the file diff and
+# applies labels — no untrusted PR input flows into shell commands here.
+
+on:
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  label:
+    name: Apply path-based labels
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/labeler@v5
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          configuration-path: .github/labeler.yml
diff --git a/.github/workflows/regenerate-manifest.yml b/.github/workflows/regenerate-manifest.yml
new file mode 100644
index 00000000..b6789dcd
--- /dev/null
+++ b/.github/workflows/regenerate-manifest.yml
@@ -0,0 +1,74 @@
+name: Regenerate community pattern manifest
+
+# Triggers when a push to main changes anything under patterns/community/
+# (typically a merged PR that added, removed, or updated a pattern file).
+# The workflow rebuilds patterns/community/index.json from the source pattern
+# files and commits the updated manifest back to main.
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'patterns/community/**.json'
+      - '!patterns/community/index.json'  # avoid recursive triggers from our own commits
+
+# Prevent overlapping runs. If two pattern PRs merge back-to-back, the second
+# run waits for the first so the manifest is consistent.
+concurrency:
+  group: regenerate-manifest
+  cancel-in-progress: false
+
+permissions:
+  contents: write  # required to commit the manifest back to main
+
+jobs:
+  regenerate:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # Use a PAT or GITHUB_TOKEN that can push back to main.
+          # Default GITHUB_TOKEN works for same-repo pushes that don't trigger
+          # further workflows; if branch protection requires PRs, swap for a
+          # PAT secret.
+          token: ${{ secrets.GITHUB_TOKEN }}
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Regenerate manifest
+        run: |
+          python -m src.tools.generate_manifest
+
+      - name: Check for manifest changes
+        id: check
+        run: |
+          if git diff --quiet patterns/community/index.json; then
+            echo "changed=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Commit and push updated manifest
+        if: steps.check.outputs.changed == 'true'
+        run: |
+          git config user.name "minuspod-bot"
+          git config user.email "minuspod-bot@users.noreply.github.com"
+          git add patterns/community/index.json
+          git commit -m "chore: regenerate community pattern manifest [skip ci]"
+          git push
+
+      - name: Report no-op
+        if: steps.check.outputs.changed == 'false'
+        run: echo "Manifest already up to date. Nothing to commit."
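For reference, the manifest step above boils down to something like the following. This is a sketch only: `src.tools.generate_manifest` itself is not part of this diff, so the field names and flattening behavior are assumptions inferred from the workflow and changelog.

```python
import json
from pathlib import Path

COMMUNITY_DIR = Path("patterns/community")

def build_manifest() -> dict:
    entries = []
    for path in sorted(COMMUNITY_DIR.glob("**/*.json")):
        if path.name == "index.json":  # the manifest itself is not a pattern
            continue
        payload = json.loads(path.read_text())
        # Submission bundles (format: minuspod-community-submission) flatten
        # to one manifest entry per inner pattern; single files map 1:1.
        inner = payload.get("patterns") if isinstance(payload, dict) else None
        entries.extend(inner if inner else [payload])
    return {"patterns": entries, "pattern_count": len(entries)}

if __name__ == "__main__":
    out = COMMUNITY_DIR / "index.json"
    out.write_text(json.dumps(build_manifest(), indent=2) + "\n")
```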
diff --git a/.github/workflows/validate-community-patterns.yml b/.github/workflows/validate-community-patterns.yml new file mode 100644 index 00000000..5226f784 --- /dev/null +++ b/.github/workflows/validate-community-patterns.yml @@ -0,0 +1,84 @@ +name: Community Pattern Validation + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - 'patterns/community/**' + +permissions: + contents: read + pull-requests: write + +jobs: + validate: + name: Validate community pattern submissions + runs-on: ubuntu-latest + steps: + - name: Checkout PR + uses: actions/checkout@v6 + with: + fetch-depth: 2 + + - name: List changed JSON files under patterns/community/ + id: changed + env: + BASE_SHA: ${{ github.event.pull_request.base.sha }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + run: | + set -e + changed=$(git diff --diff-filter=AM --name-only "$BASE_SHA" "$HEAD_SHA" \ + | grep -E '^patterns/community/.+\.json$' \ + | grep -v 'patterns/community/index.json' \ + | tr '\n' ' ' || true) + echo "files=$changed" >> "$GITHUB_OUTPUT" + echo "Changed files: $changed" + + - name: Setup Python + if: steps.changed.outputs.files != '' + uses: actions/setup-python@v6 + with: + python-version: '3.11' + cache: pip + + - name: Install lightweight deps + if: steps.changed.outputs.files != '' + run: | + python -m pip install --upgrade pip + # The validator imports only stdlib + utils/community_tags, which + # itself uses stdlib only. No requirements.txt install required. + + - name: Run validator + if: steps.changed.outputs.files != '' + id: validate + env: + CHANGED_FILES: ${{ steps.changed.outputs.files }} + run: | + set +e + python -m src.tools.community_pattern_validator \ + --pr-files $CHANGED_FILES \ + --comment-output /tmp/comment.md \ + --status-output /tmp/status.txt + echo "exit=$?" >> "$GITHUB_OUTPUT" + + - name: Post results comment + if: always() && steps.changed.outputs.files != '' + uses: actions/github-script@v8 + with: + script: | + const fs = require('fs'); + const path = '/tmp/comment.md'; + if (!fs.existsSync(path)) return; + const body = fs.readFileSync(path, 'utf8'); + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body, + }); + + - name: Fail job on rejection + if: steps.changed.outputs.files != '' && steps.validate.outputs.exit != '0' + run: | + echo "Community pattern validation rejected one or more files." + exit 1 diff --git a/CHANGELOG.md b/CHANGELOG.md index 40b3a51e..2953e5f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,108 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.4.6] - 2026-05-15 + +### Fixed + +- **intro/outro_variants no longer get double-JSON-encoded on auto-created patterns.** `text_pattern_matcher` was calling `db.create_ad_pattern(intro_variants=json.dumps([intro]))` while the DB layer also `json.dumps`'d its input, so the column stored `'"[\\"text\\"]"'`. Submitting any of those patterns through the community bundle pipeline exploded the value into a list of single characters (the user's first bundle had `intro_variants` of length 196 starting with `['[', '"', 'E', 'm', ...]`). Now the matcher passes a plain list and the DB layer encodes once. 
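+
+  A minimal sketch of the bug shape and the defensive re-parse described here and in the bullets below (the call boundaries are illustrative; the real code lives in `text_pattern_matcher`, the DB layer, and `community_export.py`):
+
+  ```python
+  import json
+
+  def create_ad_pattern(intro_variants):
+      # DB layer owns serialization: encode exactly once.
+      return {"intro_variants": json.dumps(intro_variants)}
+
+  # Before: the caller pre-encoded, so the column stored JSON of a JSON
+  # string. One json.loads() yields '["..."]' (a str), and iterating that
+  # str explodes into single characters -- the length-196 bundle value.
+  broken = create_ad_pattern(json.dumps(["Email us at example"]))
+  assert isinstance(json.loads(broken["intro_variants"]), str)
+
+  # After: pass the plain list; one decode round-trips cleanly.
+  fixed = create_ad_pattern(["Email us at example"])
+  assert json.loads(fixed["intro_variants"]) == ["Email us at example"]
+
+  def safe_parse_variants(raw):
+      # Shape of the defensive decode: retry when the first parse yields a
+      # string (a double-encoded row) instead of a list.
+      value = json.loads(raw)
+      if isinstance(value, str):
+          value = json.loads(value)
+      return value if isinstance(value, list) else []
+  ```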
+- **Migration repairs existing rows.** A one-shot `_repair_double_encoded_variants` migration re-encodes any `ad_patterns.intro_variants` / `outro_variants` whose stored value parses to a string (instead of a list) on the first `json.loads`. Idempotent; stamped via the `variant_reencode_revision` setting. Operators on 2.4.5 with broken rows will see them fix themselves on next container start. +- **Community export pipeline is defensive about the same bug.** `_safe_parse_variants` in `community_export.py` retries the decode when the first parse returns a string, so bundles built from a not-yet-migrated DB still produce clean output. + +### Changed + +- Dialog CLI snippet now hints `gh pr create --fill --label pattern` so the label gets requested directly. The labeler workflow still applies it automatically on path match; this is belt-and-suspenders. + +## [2.4.5] - 2026-05-15 + +### Changed + +- **Submit-to-community is now a bundle download, not N prefilled PR tabs.** Picking "Submit to community" in the Export dialog opens a preview that lists which patterns will pass quality gates and which will not (with reasons). Confirming downloads a single `minuspod-submission-.json` containing every passing pattern. You open one PR for the whole bundle in your fork. The old per-tab flow fell over at scale: 215 selected -> 8 tabs survived the popup blocker, 20 forced JSON downloads (each over the 7 KB URL limit), 187 silent 400s rendered as `[object Object]`. +- The PR-side validator and the manifest builder both handle the new bundle format (`format: minuspod-community-submission`) by flattening `patterns[]` into per-pattern validations / manifest entries. Existing per-file submissions still work. + +### Added + +- `POST /api/v1/patterns/preview-export` returning ready / rejected counts plus per-id rejection reasons. +- `POST /api/v1/patterns/submit-bundle` returning the downloadable bundle JSON. Includes `X-Bundle-Pattern-Count` and `X-Bundle-Rejected-Count` headers for the UI. + +### Fixed + +- **`POST /api/v1/community-patterns/sync` no longer returns 502 when the upstream manifest URL doesn't exist yet.** A 404 from `raw.githubusercontent.com` (e.g. the patterns feature is still on a branch and `main` doesn't have `patterns/community/index.json`) now returns 200 with `{status: "no_manifest_yet"}`. Other failures still surface as 502. Caught from the user's console showing six 502s. +- **`apiRequest` no longer stringifies error response bodies as `[object Object]`.** Backend 4xx responses can be `{error: {message: "...", reasons: [...]}}`; the old throw passed that object verbatim into `new Error(...)`. `extractErrorMessage` now prefers `error.error.message`, falls back to a stringified `error.error`, then the HTTP status. Affects every API call site. + +## [2.4.4] - 2026-05-15 + +### Added + +- **Bulk submit-to-community in the Export dialog.** The per-pattern "Submit to community" buttons on the Patterns page are gone. The Export button now opens one dialog with a destination radio: Download as JSON (the existing flow) or Submit to community (opens one prefilled PR per selected pattern). Patterns whose source is already `community` are filtered out automatically; round-tripping them is pointless. +- **Remove all community patterns.** Settings -> Community Patterns has a destructive action that wipes every `source='community'` row on this instance, including any you marked Protect from sync. Local and imported patterns are untouched. 
If sync is enabled, the next tick repopulates from the manifest. API: `DELETE /api/v1/community-patterns/all`, returns `{deleted: N}`. +- **Single-pattern check in the PR validator.** Community submissions must describe one ad. The validator now rejects a PR if `text_template` mentions any other seed sponsor by name or alias. Closes the gap on the import side; the export side already had the same check. See `patterns/CONTRIBUTING.md`. +- **Seeded `patterns/community/` with 12 initial patterns** (Capital One, Carvana x2, Instacart, Kayak, Mint Mobile, Monday.com, Progressive, SimpliSafe, Squarespace, ThreatLocker, Zyn). Pulled from a real instance export, cleaned up by hand, now in the published manifest. Earlier instances on this version pick them up on the next sync tick. + +## [2.4.3] - 2026-05-14 + +### Fixed + +- **Fingerprint slow-fallback no longer burns 10 minutes when the audio is bad for fpcalc.** When `_generate_full_fingerprint` fails (e.g. fpcalc rejects an MP3 with "Invalid data found when processing input"), the per-window fallback used to inherit the full 600-second timeout — every window scan also called fpcalc, almost always produced zero new matches, and ate the entire budget before timing out. The fallback now caps at 90 seconds via a new `FALLBACK_SLOW_TIMEOUT` constant. Stage 1 still tries; it just doesn't block Stage 2 + Stage 3 for ten minutes on broken audio. Caught from production logs (cordkillers-only-audio episode that took 14 min on pass 1 alone). +- **`processing_timeouts._resolve` was silently falling back to env / defaults every refresh tick.** The `from database import get_database` import is invalid — `get_database` lives in the `api` package — and the try/except swallowed the resulting ImportError. Fixed the import path. Effect: user-configured `processing_soft_timeout_seconds` / `processing_hard_timeout_seconds` are now actually read from the DB instead of being silently shadowed by env-var fallbacks. +- **`community_sync` no longer WARNs every 15 minutes when the manifest URL returns 404.** A 404 is the expected state when upstream hasn't published a manifest yet (e.g., the feature branch hasn't merged to main). Now logged at INFO level with an explanatory message. Non-404 fetch failures still log as WARNING so real problems stay visible. +- **`set_podcast_tags` short-circuits the episode-aggregation pass when the incoming RSS tags are already covered by the row's union.** Pre-fix: every feed refresh on a 300-episode podcast did one SELECT + 300 JSON parses across all episodes even when nothing was going to change. Now a single subset check before the heavy work. Materially lower SQLite write contention with the queue processor on instances with large feeds. + +## [2.4.2] - 2026-05-14 + +### Fixed + +- **Reviewer-trim now actually trims** instead of rebuilding the template from one episode's transcription. `rewrite_pattern_from_bounds` takes the original AND new bounds, computes the head/tail transcript slices, and splices them out of the existing `text_template` only when they appear at its start/end. The earlier behavior (Operation 2 "full replace within threshold") was a misnomer — it fit the template to a single episode and risked breaking matches on episodes that had captured the cleaner version. `intro_variants` / `outro_variants` get the same prefix/suffix trim so they stay aligned. Returns False when neither head nor tail slice matches the existing template, leaving the pattern untouched. 
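+
+  A sketch of the trim rule (simplified; the real `rewrite_pattern_from_bounds` also trims the variant lists and normalizes text):
+
+  ```python
+  def trim_template(template: str, head_slice: str, tail_slice: str):
+      """Splice the head/tail transcript slices out of the existing template,
+      but only where they appear at its start/end. Returns None when neither
+      slice matches, so the caller leaves the pattern untouched."""
+      trimmed, matched = template, False
+      if head_slice and trimmed.startswith(head_slice):
+          trimmed, matched = trimmed[len(head_slice):].lstrip(), True
+      if tail_slice and trimmed.endswith(tail_slice):
+          trimmed, matched = trimmed[:-len(tail_slice)].rstrip(), True
+      return trimmed if matched else None
+  ```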
+- **`community_sync.apply_manifest` now reads `manifest['vocabulary_version']`.** When the manifest carries a vocabulary newer than the app's, the sync writes a warning to the log and a `vocabulary_warning` field into `community_sync_last_summary` so the operator can spot a stale image. Non-integer values are caught and logged cleanly instead of crashing the sync. +- **Hardcoded `ttlequals0/MinusPod` is now a single constant.** `GITHUB_REPO` and `COMMUNITY_MANIFEST_URL` live in `src/utils/community_tags.py`; the export pipeline and sync job both import from there. One source of truth for the upstream identity. +- **`MANIFEST_VERSION` and `VOCABULARY_VERSION` moved out of `src/tools/generate_manifest.py`** (a build-time CLI) into `src/utils/community_tags.py` next to the vocabulary loader. Both the generator and the sync job import from there. Removes a wrong-direction runtime → build-time layering import. +- **`pattern_service.py` no longer imports from `api/`.** Switched the transcript helper to `from utils.text import extract_text_in_range` directly. Service layer importing from API layer was the wrong dependency direction and would have become a circular import. + +### Added + +- **`.github/workflows/labeler.yml`** wiring `actions/labeler@v5` to apply `.github/labeler.yml` on PRs. The path-glob for `pattern` existed since 2.4.0 but nothing invoked it — community-pattern PRs will now get the label automatically. + +## [2.4.1] - 2026-05-14 + +### Added + +- **Per-feed tag editor.** New "Tags" card on every feed-detail page. Shows the effective tag set grouped by source (RSS / episode / user), with an "+ Add tag" button that opens a grouped picker of the remaining vocabulary tags. User-added tags carry an inline X to remove them, and saves are auto-applied via `PUT /api/v1/feeds/{slug}/tags`. This was the missing companion UI to the backend endpoint shipped in 2.4.0. +- **Multi-select pattern export.** The Patterns page **Export** button now opens a dialog with a checkbox per pattern (Select-all, optional include-disabled / include-corrections flags) instead of dumping the whole local pattern DB. `GET /api/v1/patterns/export` gained an `?ids=1,2,3` filter to support this. +- **Per-row Submit-to-community / Protect-from-sync buttons in the desktop pattern table.** Previously only the mobile card layout had these — desktop reviewers had to switch viewports. New 9th "Actions" column hosts them. +- **`GET /api/v1/tags/vocabulary`** — returns the canonical 49-tag vocabulary plus per-tag descriptions, grouped into podcast_genres / sponsor_industries / special_tags. Used by the new tag picker. + +### Changed + +- Tag vocabulary loading is now cached (`utils.community_tags.vocabulary_payload` with `@lru_cache(maxsize=1)`); the CSV is parsed once at process start instead of per request. Frontend cache: `staleTime: Infinity` since vocabulary ships with the app image. +- `/tags/vocabulary` lives at `src/api/tags.py` (was inline in `sponsors.py`); no behavior change, better discoverability. +- TypeScript now exports `PATTERN_SOURCES` and `PatternSource` from `frontend/src/api/patterns.ts`, mirroring the Python `PATTERN_SOURCES` frozenset — frontend and backend can no longer drift on source-discriminator spellings. + +### Fixed + +- **`community_sync.apply_manifest` version stamping**: was using `dict.setdefault('version', manifest_version)` which silently kept a stale `version` carried in the inner `data` dict. 
Now assigns unconditionally so the manifest's version is authoritative for the `import_community_pattern` version gate.
+- **CodeQL py/reflective-xss in bulk-op endpoints**: hardened `_resolve_bulk_target` so `ids` and `expected_count` are coerced to integers up front. Non-integer payloads return 400 with a clean message instead of being f-stringed into the response body.
+- **Inline CREATE TABLE shape drift**: `_create_new_tables_only` definitions for `ad_patterns` and `known_sponsors` were missing the 2.4.0 columns; brought back into sync with `SCHEMA_SQL`. End state was already correct via the ALTER TABLE migrations, but the "must match SCHEMA_SQL exactly" comment invariant is now true again.
+- **Sponsor reseed migration order**: moved `_reseed_known_sponsors` to run AFTER `_migrate_sponsor_fk` and the Zyn cleanups so it operates on the post-dedup canonical state. Previously a v2.1.x → 2.4.x jump could let dedup discard the freshly-tagged row.
+
+## [2.4.0] - 2026-05-14
+
+### Added
+
+- **Community ad patterns.** Patterns can now be shared via the `patterns/community/` directory in the GitHub repository. A new "Submit to community" button on each local pattern row runs an export pipeline (quality gates, PII strip, sponsor classification, metadata strip) and opens a prefilled GitHub PR. A new GitHub Action validates incoming PRs against the same gates and a three-tier dedupe (95%+ duplicate, 75–95% variant, <75% distinct) before the maintainer reviews them.
+- **49-tag vocabulary + tagging system.** Sponsors and podcasts now carry tags from a 48-entry vocabulary (`src/seed_data/tag_vocabulary.csv`) plus a special `universal` flag for sponsors with broad appeal, 49 tags in all. Community patterns only enter the text-matching loop when their sponsor's tags overlap the podcast's tags, when the sponsor is `universal`, or when either side has no tags. Local patterns bypass tag filtering entirely.
+- **Authoritative sponsor seed.** A new schema migration loads 255 sponsors (with aliases and tags) from `src/seed_data/sponsors_final.csv`. Migration semantics: UPDATE on name match (preserves `ad_patterns.sponsor_id` FKs), INSERT new, soft-delete (`is_active=0`) any pre-existing sponsor whose name is not in the seed.
+- **Auto-pull / sync.** Opt-in: when enabled, the server polls `https://raw.githubusercontent.com/ttlequals0/MinusPod/main/patterns/community/index.json` on a configurable cron (default Sunday 3am UTC) and applies INSERT / UPDATE / DELETE against community patterns. The new "Protect from sync" toggle on each community pattern row pins it so a future manifest can't overwrite or delete it.
+- **Reviewer-trim auto-rewrite.** When a reviewer narrows an ad's bounds by more than the configurable trim threshold (default 20 s), the local pattern's `text_template` and intro/outro variants are re-extracted from the new transcript bounds. Off by default for community patterns; toggleable in the new **Ad Reviewer** settings panel.
+- **iTunes category parsing.** RSS feed refresh now extracts `<itunes:category>` at both podcast and episode level and maps it through `src/seed_data/itunes_category_map.json` to the vocabulary tags above.
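+
+  Taken together, the category mapping and the tag gate reduce to roughly the following (function names are illustrative, not the actual ones):
+
+  ```python
+  import json
+
+  def tags_from_itunes(categories: list[str]) -> set[str]:
+      # itunes_category_map.json maps e.g. "True Crime" -> "true_crime".
+      with open("src/seed_data/itunes_category_map.json") as f:
+          mapping = json.load(f)
+      return {mapping[c] for c in categories if c in mapping}
+
+  def enters_matching_loop(source: str, sponsor_tags: set[str],
+                           sponsor_universal: bool,
+                           podcast_tags: set[str]) -> bool:
+      if source != "community":
+          return True  # non-community patterns bypass tag filtering
+      if sponsor_universal or not sponsor_tags or not podcast_tags:
+          return True  # universal sponsor, or nothing to filter on
+      return bool(sponsor_tags & podcast_tags)
+  ```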
+- **API additions** under `/api/v1/`: `POST /patterns/bulk-delete`, `POST /patterns/bulk-disable` (both guarded by `confirm: true` + `expected_count`), `POST /patterns/{id}/submit-to-community`, `POST | DELETE /patterns/{id}/protect`, `GET | PUT /feeds/{slug}/tags`, `PUT /sponsors/{id}/tags`, `GET | PUT /settings/reviewer`, `GET | PUT /settings/community-sync`, `POST /community-patterns/sync`, `GET /community-patterns/sync-status`. Documented in `openapi.yaml` (now at version 2.4.0).
+- **Frontend additions.** Patterns page gains an Import/Export header pair, a Source filter (Local / Community / Imported), a community badge on each community-sourced row, per-row Submit-to-community / Protect-from-sync buttons, and a last-synced indicator with manual refresh. Settings page gains two new sections: **Ad Reviewer** (toggle + threshold) and **Community Patterns** (enable, cron, Sync Now, last-sync display).
+- **GitHub workflow + path labeler** for community PRs (`.github/workflows/validate-community-patterns.yml`, `.github/labeler.yml`). Validator is also a CLI: `python -m src.tools.community_pattern_validator --pr-files X.json Y.json --comment-output /tmp/comment.md`.
+
+### Changed
+
+- `known_sponsors`, `podcasts`, `episodes` now carry a JSON `tags` column; `ad_patterns` carries `source`, `community_id`, `version`, `submitted_app_version`, `protected_from_sync`. Migration is additive and idempotent.
+- Editing a community pattern in the UI now auto-sets `protected_from_sync=1` so the next sync run doesn't clobber the edit.
+
 ## [2.3.4] - 2026-05-13
 
 ### Fixed
diff --git a/README.md b/README.md
index 70b56519..8a21030b 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ MinusPod is a self-hosted server that removes ads before you ever hit play. It t
 - [Screenshots](#screenshots)
 - [Configuration](#configuration)
 - [Experiments](#experiments)
+- [Community Patterns (Optional)](#community-patterns-optional)
 - [Finding Podcast RSS Feeds](#finding-podcast-rss-feeds)
 - [Usage](#usage)
 - [Audiobookshelf](#audiobookshelf)
@@ -487,6 +488,41 @@ Detection, verification, and reviewer prompts use explicit placeholder substitut
 
 If you customized your system or verification prompt before this release, the upgrade automatically appends `{sponsor_database}` to your prompt so behavior is preserved. The migration is idempotent and runs once.
 
+## Community Patterns (Optional)
+
+MinusPod can share and receive ad patterns from a community-maintained seed list. Patterns describe recognized ad reads (sponsor scripts, host-read pre-rolls, etc.) so new MinusPod instances skip the LLM detection step for ads that have already been identified elsewhere.
+
+The feature is **opt-in** and **off by default**. When enabled, your MinusPod instance pulls a manifest of community patterns from this repo on a schedule you control. To submit your own patterns back, open the Patterns page Export dialog and pick **Submit to community**: the app runs quality gates over your selection, shows what will pass, and downloads a single bundle file. Drop it into your fork of `patterns/community/` and open one PR.
+
+### What you get when enabled
+
+- Faster ad detection for sponsors other MinusPod users have already identified
+- New patterns appear automatically as the community contributes them
+- Local patterns you build stay private unless you choose to submit them
+
+### What you control
+
+- **Sync schedule** - cron expression in Settings (default: weekly, Sunday 3am UTC)
+- **Manual sync** - "Sync now" button in Settings
+- **Per-pattern protection** - pin any community pattern with **Protect from sync** to prevent automatic updates or deletion
+- **Disable at any time** - flipping the toggle stops sync; existing community patterns remain unless you delete them
+- **Remove all at once** - "Remove all community patterns" in Settings wipes every community pattern (including any you marked Protect from sync). Useful for a clean reset before re-enabling sync.
+
+### What is shared if you submit
+
+Submitting a pattern is a separate action you trigger from the Export dialog and never automatic. Before submission, the app:
+
+- Strips local identifiers (which podcast, which network, your match counts, your timestamps)
+- Strips PII from pattern text (consumer email addresses, non-toll-free phone numbers)
+- Validates the pattern meets quality thresholds
+- Bundles every passing pattern into a single JSON file you download and commit to your fork of `patterns/community/` to open one PR
+
+You retain everything locally. Submission is a copy, not a move.
+
+### Full details
+
+See [`patterns/README.md`](patterns/README.md) for the technical reference (sync mechanics, file formats, tag vocabulary) and [`patterns/CONTRIBUTING.md`](patterns/CONTRIBUTING.md) for what happens when you submit a pattern.
+
 ## Finding Podcast RSS Feeds
 
 MinusPod includes a built-in podcast search powered by [PodcastIndex.org](https://podcastindex.org). Search by name directly from the Add Feed page. To enable search, get free API credentials at [api.podcastindex.org/signup](https://api.podcastindex.org/signup) and add them in Settings > Podcast Search.
diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts
index 1661405a..1711d4c2 100644
--- a/frontend/src/api/client.ts
+++ b/frontend/src/api/client.ts
@@ -60,6 +60,27 @@ function sleep(ms: number, signal?: AbortSignal): Promise<void> {
   });
 }
 
+// Turn whatever the backend returned in its error body into a usable string,
+// so `new Error(...).message` never ends up as "[object Object]". Backend
+// 4xx bodies vary: sometimes `{error: 'string'}`, sometimes
+// `{error: {message: 'x', reasons: [...]}}` (e.g. submit-to-community 400).
+export function extractErrorMessage(body: unknown, status: number): string {
+  if (body && typeof body === 'object') {
+    const err = (body as { error?: unknown }).error;
+    if (typeof err === 'string' && err) return err;
+    if (err && typeof err === 'object') {
+      const msg = (err as { message?: unknown }).message;
+      if (typeof msg === 'string' && msg) return msg;
+      try {
+        return JSON.stringify(err);
+      } catch {
+        // fall through to status code
+      }
+    }
+  }
+  return `HTTP ${status}`;
+}
+
 export async function apiRequest<T>(endpoint: string, options: RequestOptions = {}): Promise<T> {
   const { method = 'GET', body, skipAuthRedirect = false, signal, skipRetry = false } = options;
   const maxAttempts = skipRetry ?
1 : RETRY_DELAYS.length + 1; @@ -96,7 +117,7 @@ export async function apiRequest(endpoint: string, options: RequestOptions = continue; } const error = await response.json().catch(() => ({ error: 'Request failed' })); - throw new Error(error.error || `HTTP ${response.status}`); + throw new Error(extractErrorMessage(error, response.status)); } const contentType = response.headers.get('content-type'); diff --git a/frontend/src/api/community.ts b/frontend/src/api/community.ts new file mode 100644 index 00000000..c1cd92f3 --- /dev/null +++ b/frontend/src/api/community.ts @@ -0,0 +1,109 @@ +import { apiRequest } from './client'; + +export interface CommunitySyncSettings { + enabled: boolean; + cron: string; + lastRun: string | null; + lastError: string | null; + manifestVersion: string | null; + lastSummary: string | null; +} + +export interface CommunitySyncSummary { + inserted: number; + updated: number; + deleted: number; + skipped: number; + errors: number; + manifest_version?: number | string | null; + fetched_at?: string; +} + +export async function getCommunitySyncSettings(): Promise { + return apiRequest('/settings/community-sync'); +} + +export async function updateCommunitySyncSettings(args: { + enabled?: boolean; + cron?: string; +}): Promise { + return apiRequest('/settings/community-sync', { + method: 'PUT', + body: args, + }); +} + +export async function triggerCommunitySync(): Promise { + return apiRequest('/community-patterns/sync', { + method: 'POST', + }); +} + +export async function getCommunitySyncStatus(): Promise { + return apiRequest('/community-patterns/sync-status'); +} + +export async function purgeAllCommunityPatterns(): Promise<{ deleted: number }> { + return apiRequest<{ deleted: number }>('/community-patterns/all', { + method: 'DELETE', + body: { confirm: true }, + }); +} + +export interface ReviewerSettings { + updatePatternsFromReviewerAdjustments: boolean; + minTrimThreshold: number; +} + +export async function getReviewerSettings(): Promise { + return apiRequest('/settings/reviewer'); +} + +export async function updateReviewerSettings(args: Partial): Promise { + return apiRequest('/settings/reviewer', { + method: 'PUT', + body: args, + }); +} + +export interface FeedTagBreakdown { + effective: string[]; + rss: string[]; + episode: string[]; + user: string[]; +} + +export async function getFeedTags(slug: string): Promise { + return apiRequest(`/feeds/${encodeURIComponent(slug)}/tags`); +} + +export async function setFeedUserTags(slug: string, userTags: string[]): Promise { + return apiRequest(`/feeds/${encodeURIComponent(slug)}/tags`, { + method: 'PUT', + body: { user_tags: userTags }, + }); +} + +export async function updateSponsorTags(sponsorId: number, tags: string[]): Promise<{ sponsor_id: number; tags: string[] }> { + return apiRequest(`/sponsors/${sponsorId}/tags`, { + method: 'PUT', + body: { tags }, + }); +} + +export interface TagVocabularyEntry { + tag: string; + description: string; +} + +export interface TagVocabulary { + vocabulary_version: number; + all_tags: string[]; + podcast_genres: TagVocabularyEntry[]; + sponsor_industries: TagVocabularyEntry[]; + special_tags: TagVocabularyEntry[]; +} + +export async function getTagVocabulary(): Promise { + return apiRequest('/tags/vocabulary'); +} diff --git a/frontend/src/api/patterns.ts b/frontend/src/api/patterns.ts index f6fcdf0a..ea274b0e 100644 --- a/frontend/src/api/patterns.ts +++ b/frontend/src/api/patterns.ts @@ -1,4 +1,17 @@ -import { apiRequest, buildQueryString } from './client'; +import { 
apiRequest, buildQueryString, csrfHeaders, extractErrorMessage } from './client'; +import { downloadBlob } from './history'; + +// Mirrors src/utils/community_tags.py:PATTERN_SOURCES so the frontend +// and backend can't drift on the source-discriminator string spellings. +export const PATTERN_SOURCE_LOCAL = 'local'; +export const PATTERN_SOURCE_COMMUNITY = 'community'; +export const PATTERN_SOURCE_IMPORTED = 'imported'; +export const PATTERN_SOURCES = [ + PATTERN_SOURCE_LOCAL, + PATTERN_SOURCE_COMMUNITY, + PATTERN_SOURCE_IMPORTED, +] as const; +export type PatternSource = typeof PATTERN_SOURCES[number]; export interface AdPattern { id: number; @@ -21,6 +34,11 @@ export interface AdPattern { disabled_at: string | null; disabled_reason: string | null; created_by?: string | null; + source?: PatternSource; + community_id?: string | null; + version?: number; + submitted_app_version?: string | null; + protected_from_sync?: number; } export interface PatternCorrection { @@ -92,12 +110,14 @@ export async function getPatterns(params?: { podcast_id?: string; network_id?: string; active?: boolean; + source?: PatternSource; }): Promise { const qs = buildQueryString({ scope: params?.scope, podcast_id: params?.podcast_id, network_id: params?.network_id, active: params?.active, + source: params?.source, }); const response = await apiRequest<{ patterns: AdPattern[] }>(`/patterns${qs}`); @@ -144,3 +164,101 @@ export async function submitCorrection( body: correction, }); } + +// Bulk + community-pattern API + +export interface BulkPatternResult { + deleted?: number; + disabled?: number; + ids: number[]; +} + +export async function bulkDeletePatterns(args: { + ids?: number[]; + source?: 'local' | 'community' | 'imported'; + expected_count: number; +}): Promise { + return apiRequest(`/patterns/bulk-delete`, { + method: 'POST', + body: { ...args, confirm: true }, + }); +} + +export async function bulkDisablePatterns(args: { + ids?: number[]; + source?: 'local' | 'community' | 'imported'; + expected_count: number; +}): Promise { + return apiRequest(`/patterns/bulk-disable`, { + method: 'POST', + body: { ...args, confirm: true }, + }); +} + +export interface CommunityExportResult { + payload: Record; + filename: string; + pr_url: string; + too_large: boolean; + sponsor_match: 'exact' | 'alias' | 'fuzzy' | 'unknown'; +} + +export async function submitPatternToCommunity(id: number): Promise { + return apiRequest(`/patterns/${id}/submit-to-community`, { + method: 'POST', + }); +} + +export interface BundlePreviewRejection { + id: number; + sponsor: string | null; + reasons: string[]; +} + +export interface BundlePreview { + ready: number[]; + rejected: BundlePreviewRejection[]; + ready_count: number; + rejected_count: number; + pattern_count: number; +} + +export async function previewExportBundle(ids: number[]): Promise { + return apiRequest('/patterns/preview-export', { + method: 'POST', + body: { ids }, + }); +} + +// apiRequest assumes JSON responses; the bundle endpoint streams a file, +// so fall back to a raw fetch. CSRF + error-stringification helpers are +// reused from client.ts. The actual browser download happens here so +// callers only deal with the resulting filename. 
+export async function downloadCommunityBundle(ids: number[]): Promise<{ filename: string }> { + const response = await fetch('/api/v1/patterns/submit-bundle', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...csrfHeaders('POST'), + }, + body: JSON.stringify({ ids }), + }); + if (!response.ok) { + const error = await response.json().catch(() => ({ error: 'Download failed' })); + throw new Error(extractErrorMessage(error, response.status)); + } + const blob = await response.blob(); + const cd = response.headers.get('Content-Disposition') || ''; + const match = cd.match(/filename="?([^"]+)"?/); + const filename = match?.[1] || 'minuspod-community-submission.json'; + downloadBlob(blob, filename); + return { filename }; +} + +export async function protectPattern(id: number): Promise { + await apiRequest(`/patterns/${id}/protect`, { method: 'POST' }); +} + +export async function unprotectPattern(id: number): Promise { + await apiRequest(`/patterns/${id}/protect`, { method: 'DELETE' }); +} diff --git a/frontend/src/components/CommunityBadge.tsx b/frontend/src/components/CommunityBadge.tsx new file mode 100644 index 00000000..8f205fd5 --- /dev/null +++ b/frontend/src/components/CommunityBadge.tsx @@ -0,0 +1,26 @@ +import { useState } from 'react'; + +interface Props { + communityId: string; + version?: number; + protected?: boolean; +} + +export function CommunityBadge({ communityId, version, protected: isProtected }: Props) { + const [expanded, setExpanded] = useState(false); + const short = communityId.split('-')[0]; + return ( + + ); +} diff --git a/frontend/src/components/FeedTagsEditor.tsx b/frontend/src/components/FeedTagsEditor.tsx new file mode 100644 index 00000000..014df763 --- /dev/null +++ b/frontend/src/components/FeedTagsEditor.tsx @@ -0,0 +1,180 @@ +import { useState } from 'react'; +import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; +import { + getFeedTags, + setFeedUserTags, + getTagVocabulary, + type FeedTagBreakdown, +} from '../api/community'; +import { TagChips } from './TagChips'; +import LoadingSpinner from './LoadingSpinner'; + +interface Props { + slug: string; +} + +export function FeedTagsEditor({ slug }: Props) { + const qc = useQueryClient(); + const [adding, setAdding] = useState(false); + + const { data: tags, isLoading } = useQuery({ + queryKey: ['feedTags', slug], + queryFn: () => getFeedTags(slug), + }); + + const { data: vocab } = useQuery({ + queryKey: ['tagVocabulary'], + queryFn: getTagVocabulary, + // Vocabulary ships with the app image; a runtime change would require a + // restart, which blows the React Query cache anyway. Infinity = one fetch + // per page load, no refresh-on-focus. + staleTime: Infinity, + gcTime: Infinity, + }); + + const save = useMutation({ + mutationFn: (userTags: string[]) => setFeedUserTags(slug, userTags), + onSuccess: (next) => { + qc.setQueryData(['feedTags', slug], next); + }, + }); + + if (isLoading || !tags) { + return ( +
+

Tags

+ +
+ ); + } + + const userSet = new Set(tags.user); + const rssSet = new Set(tags.rss); + const episodeSet = new Set(tags.episode); + const effective = tags.effective; + + const remainingVocab = (vocab?.all_tags || []).filter((t) => !userSet.has(t) && !rssSet.has(t) && !episodeSet.has(t)); + + function addTag(tag: string) { + if (!tags) return; + save.mutate([...tags.user, tag].filter((t, i, a) => a.indexOf(t) === i)); + setAdding(false); + } + + function removeTag(tag: string) { + if (!tags) return; + save.mutate(tags.user.filter((t) => t !== tag)); + } + + return ( +
+
+

Tags

+ + Used to filter community ad patterns. Tags from RSS metadata and episodes are automatic; you can also add or remove your own below. + +
+ + {effective.length === 0 && ( +

+ No tags yet. The next RSS refresh will populate iTunes categories automatically, or add one below. +

+ )} + + {effective.length > 0 && ( +
+ {tags.rss.length > 0 && ( +
+ From RSS: + +
+ )} + {tags.episode.length > 0 && ( +
+ Episodes: + +
+ )} + {tags.user.length > 0 && ( +
+ Yours: +
+ {tags.user.map((t) => ( + + {t} + + + ))} +
+
+ )} +
+ )} + +
+ {!adding ? ( + + ) : ( + <> + + + + )} + {save.isError && ( + + {(save.error as Error)?.message || 'Save failed'} + + )} +
+
+ ); +} diff --git a/frontend/src/components/PatternExportDialog.tsx b/frontend/src/components/PatternExportDialog.tsx new file mode 100644 index 00000000..14674c54 --- /dev/null +++ b/frontend/src/components/PatternExportDialog.tsx @@ -0,0 +1,421 @@ +import { useMemo, useState } from 'react'; +import type { AdPattern, BundlePreview } from '../api/patterns'; +import { + PATTERN_SOURCE_COMMUNITY, + previewExportBundle, + downloadCommunityBundle, +} from '../api/patterns'; + +interface Props { + open: boolean; + patterns: AdPattern[]; + onClose: () => void; +} + +type Destination = 'download' | 'community'; + +function PatternExportDialogImpl({ patterns, onClose }: Omit) { + const [destination, setDestination] = useState('download'); + // Initial selection = every pattern in the current filter. The parent + // remounts this component each open (see key= below), so useState + // initializer runs fresh and stays in sync with the filter. + const [selected, setSelected] = useState>( + () => new Set(patterns.map((p) => p.id)), + ); + const [includeDisabled, setIncludeDisabled] = useState(false); + const [includeCorrections, setIncludeCorrections] = useState(false); + const [busy, setBusy] = useState(false); + const [preview, setPreview] = useState(null); + const [downloadedFilename, setDownloadedFilename] = useState(null); + const [error, setError] = useState(null); + // Stage is fully derivable from the two artifacts; making it a separate + // useState would let it drift out of sync (preview=null but stage='preview'). + const stage: 'pick' | 'preview' | 'done' = downloadedFilename + ? 'done' + : preview + ? 'preview' + : 'pick'; + + // Re-sharing community-sourced patterns just round-trips them; filter them + // out of the eligible set. Local and imported are both eligible. + const visiblePatterns = useMemo( + () => destination === 'community' + ? 
patterns.filter((p) => p.source !== PATTERN_SOURCE_COMMUNITY) + : patterns, + [patterns, destination], + ); + + const effectiveSelection = useMemo(() => { + const visibleIds = new Set(visiblePatterns.map((p) => p.id)); + return new Set(Array.from(selected).filter((id) => visibleIds.has(id))); + }, [selected, visiblePatterns]); + + const allSelected = useMemo( + () => visiblePatterns.length > 0 + && visiblePatterns.every((p) => effectiveSelection.has(p.id)), + [visiblePatterns, effectiveSelection], + ); + + function toggleAll() { + if (allSelected) { + const visibleIds = new Set(visiblePatterns.map((p) => p.id)); + setSelected((prev) => new Set(Array.from(prev).filter((id) => !visibleIds.has(id)))); + } else { + setSelected((prev) => { + const next = new Set(prev); + for (const p of visiblePatterns) next.add(p.id); + return next; + }); + } + } + + function toggleOne(id: number) { + setSelected((prev) => { + const next = new Set(prev); + if (next.has(id)) next.delete(id); + else next.add(id); + return next; + }); + } + + function handleClose() { + if (busy) return; + onClose(); + } + + function changeDestination(d: Destination) { + setDestination(d); + setPreview(null); + setDownloadedFilename(null); + setError(null); + } + + function downloadSelected() { + if (effectiveSelection.size === 0) return; + const params = new URLSearchParams(); + params.set('ids', Array.from(effectiveSelection).join(',')); + if (includeDisabled) params.set('include_disabled', 'true'); + if (includeCorrections) params.set('include_corrections', 'true'); + const url = `/api/v1/patterns/export?${params.toString()}`; + const a = document.createElement('a'); + a.href = url; + a.download = 'minuspod-patterns.json'; + a.click(); + onClose(); + } + + async function runPreview() { + if (effectiveSelection.size === 0 || busy) return; + setBusy(true); + setError(null); + try { + const result = await previewExportBundle(Array.from(effectiveSelection)); + setPreview(result); + } catch (e) { + setError(e instanceof Error ? e.message : 'Preview failed'); + } finally { + setBusy(false); + } + } + + async function downloadBundle() { + if (!preview || preview.ready_count === 0 || busy) return; + setBusy(true); + setError(null); + try { + const { filename } = await downloadCommunityBundle(preview.ready); + setDownloadedFilename(filename); + } catch (e) { + setError(e instanceof Error ? e.message : 'Download failed'); + } finally { + setBusy(false); + } + } + + const totalEligible = visiblePatterns.length; + + return ( +
+
e.stopPropagation()} + > +
+

Export patterns

+

+ Pick the patterns to include, then choose what to do with them. +

+
+ +
+ + +
+ + {destination === 'community' && stage === 'preview' && preview && ( + setPreview(null)} + onDownload={downloadBundle} + busy={busy} + /> + )} + + {destination === 'community' && stage === 'done' && downloadedFilename && ( + + )} + + {(destination === 'download' || stage === 'pick') && ( + <> +
+ + + {effectiveSelection.size} of {totalEligible} selected + +
+ +
+ {visiblePatterns.length === 0 && ( +

+ {destination === 'community' + ? 'Nothing to submit. Community patterns are excluded; only local or imported patterns can be shared.' + : 'No patterns match the current filters.'} +

+ )} +
    + {visiblePatterns.map((p) => ( +
  • + +
  • + ))} +
+
+ +
+ {destination === 'download' && ( +
+ + +
+ )} + + {error && ( +

{error}

+ )} + +
+ + {destination === 'download' ? ( + + ) : ( + + )} +
+
+ + )} +
+
+ ); +} + +function CommunityPreview({ + preview, onBack, onDownload, busy, +}: { + preview: BundlePreview; + onBack: () => void; + onDownload: () => void; + busy: boolean; +}) { + const { ready_count, rejected_count, rejected } = preview; + return ( + <> +
+

+ {ready_count} ready to submit,{' '} + {rejected_count} will be rejected. +

+
+
+ {rejected.length === 0 ? ( +

+ Every selected pattern passes the quality gates. +

+ ) : ( +
+ + Rejected patterns ({rejected.length}) + +
    + {rejected.map((r) => ( +
  • +
    + #{r.id} + {r.sponsor || '(unknown sponsor)'} +
    +
      + {r.reasons.map((reason, i) =>
    • {reason}
    • )} +
    +
  • + ))} +
+
+ )} +
+
+ + +
+ + ); +} + +function CommunityDone({ filename, onClose }: { filename: string; onClose: () => void }) { + const snippet = [ + '# 1. Fork ttlequals0/MinusPod and clone your fork', + '# 2. Drop the file into patterns/community/, then:', + `mv ~/Downloads/${filename} patterns/community/`, + 'git checkout -b community-submission', + `git add patterns/community/${filename}`, + 'git commit -m "Submit community ad patterns"', + 'git push -u origin community-submission', + 'gh pr create --fill --label pattern', + ].join('\n'); + return ( + <> +
+

+ Bundle downloaded as {filename}. Open a PR with it + via your usual git flow, or copy the commands below: +

+
+
+
{snippet}
+
+
+ +
+ + ); +} + +export function PatternExportDialog({ open, patterns, onClose }: Props) { + if (!open) return null; + return ; +} diff --git a/frontend/src/components/PatternImportDialog.tsx b/frontend/src/components/PatternImportDialog.tsx new file mode 100644 index 00000000..f8dfeab0 --- /dev/null +++ b/frontend/src/components/PatternImportDialog.tsx @@ -0,0 +1,122 @@ +import { useRef, useState } from 'react'; +import { apiRequest } from '../api/client'; + +interface Props { + open: boolean; + onClose: () => void; + onComplete: () => void; +} + +type ImportMode = 'merge' | 'replace' | 'supplement'; + +export function PatternImportDialog({ open, onClose, onComplete }: Props) { + const fileRef = useRef(null); + const [mode, setMode] = useState('supplement'); + const [busy, setBusy] = useState(false); + const [result, setResult] = useState<{ + importedCount?: number; + updatedCount?: number; + skippedCount?: number; + error?: string; + } | null>(null); + + if (!open) return null; + + async function handleImport() { + setResult(null); + const file = fileRef.current?.files?.[0]; + if (!file) { + setResult({ error: 'Please pick a JSON file first.' }); + return; + } + setBusy(true); + try { + const text = await file.text(); + const parsed = JSON.parse(text); + const body = + Array.isArray(parsed) + ? { patterns: parsed, mode } + : Array.isArray(parsed?.patterns) + ? { ...parsed, mode } + : { patterns: [parsed], mode }; + const res = await apiRequest<{ + importedCount: number; + updatedCount: number; + skippedCount: number; + }>('/patterns/import', { method: 'POST', body }); + setResult(res); + onComplete(); + } catch (e) { + setResult({ error: e instanceof Error ? e.message : 'Import failed' }); + } finally { + setBusy(false); + } + } + + return ( +
+
e.stopPropagation()} + > +

Import patterns

+

+ Upload a JSON file exported from MinusPod, or a single community-pattern JSON. +

+ + + +
+ + +
+ + {result && ( +
+ {result.error ? ( +

{result.error}

+ ) : ( +

+ Imported {result.importedCount ?? 0}, updated {result.updatedCount ?? 0}, + skipped {result.skippedCount ?? 0}. +

+ )} +
+ )} + +
+ + +
+
+
+ ); +} diff --git a/frontend/src/components/TagChips.tsx b/frontend/src/components/TagChips.tsx new file mode 100644 index 00000000..47cbf0e9 --- /dev/null +++ b/frontend/src/components/TagChips.tsx @@ -0,0 +1,48 @@ +import { memo } from 'react'; + +interface Props { + tags: string[]; + variant?: 'sponsor' | 'podcast'; + className?: string; +} + +const ACCENT_BY_TAG: Record = { + // Podcast genres + news: 'bg-blue-500/15 text-blue-700 dark:text-blue-400', + politics: 'bg-blue-600/15 text-blue-700 dark:text-blue-400', + business: 'bg-amber-500/15 text-amber-700 dark:text-amber-400', + technology: 'bg-violet-500/15 text-violet-700 dark:text-violet-400', + comedy: 'bg-pink-500/15 text-pink-700 dark:text-pink-400', + true_crime: 'bg-red-500/15 text-red-700 dark:text-red-400', + sports: 'bg-emerald-500/15 text-emerald-700 dark:text-emerald-400', + science: 'bg-cyan-500/15 text-cyan-700 dark:text-cyan-400', + health: 'bg-rose-500/15 text-rose-700 dark:text-rose-400', + mental_health: 'bg-rose-600/15 text-rose-700 dark:text-rose-400', +}; + +const DEFAULT_ACCENT = 'bg-slate-500/15 text-slate-700 dark:text-slate-300'; +const UNIVERSAL_ACCENT = 'bg-indigo-500/20 text-indigo-700 dark:text-indigo-400 border border-indigo-500/40'; + +function tagClass(tag: string): string { + if (tag === 'universal') return UNIVERSAL_ACCENT; + return ACCENT_BY_TAG[tag] || DEFAULT_ACCENT; +} + +function TagChipsImpl({ tags, className = '' }: Props) { + if (!tags || tags.length === 0) return null; + return ( +
+ {tags.map((tag) => ( + + {tag === 'universal' ? '★ universal' : tag} + + ))} +
+ ); +} + +export const TagChips = memo(TagChipsImpl); diff --git a/frontend/src/pages/FeedDetail.tsx b/frontend/src/pages/FeedDetail.tsx index 65eb8a0c..b2cd5e53 100644 --- a/frontend/src/pages/FeedDetail.tsx +++ b/frontend/src/pages/FeedDetail.tsx @@ -8,6 +8,7 @@ import DropdownMenu from '../components/DropdownMenu'; import EpisodeList from '../components/EpisodeList'; import LoadingSpinner from '../components/LoadingSpinner'; import TriStateSelect from '../components/TriStateSelect'; +import { FeedTagsEditor } from '../components/FeedTagsEditor'; import { formatStorage } from './settings/settingsUtils'; function FeedDetail() { @@ -414,6 +415,8 @@ function FeedDetail() { + {slug && } + {/* Episodes header with status filter */}

diff --git a/frontend/src/pages/PatternsPage.tsx b/frontend/src/pages/PatternsPage.tsx index bf5b7494..426cbe8f 100644 --- a/frontend/src/pages/PatternsPage.tsx +++ b/frontend/src/pages/PatternsPage.tsx @@ -1,12 +1,22 @@ import { useState, useEffect } from 'react'; import { useQuery } from '@tanstack/react-query'; import { useSearchParams } from 'react-router-dom'; -import { getPatterns, getPatternStats, AdPattern } from '../api/patterns'; +import { + getPatterns, getPatternStats, AdPattern, + protectPattern, unprotectPattern, PATTERN_SOURCE_COMMUNITY, +} from '../api/patterns'; +import { + triggerCommunitySync, getCommunitySyncStatus, +} from '../api/community'; import PatternDetailModal from '../components/PatternDetailModal'; import LoadingSpinner from '../components/LoadingSpinner'; +import { CommunityBadge } from '../components/CommunityBadge'; +import { PatternImportDialog } from '../components/PatternImportDialog'; +import { PatternExportDialog } from '../components/PatternExportDialog'; type ScopeFilter = 'all' | 'global' | 'network' | 'podcast'; type OriginFilter = 'all' | 'auto' | 'user'; +type SourceFilter = 'all' | 'local' | 'community' | 'imported'; type SortDirection = 'asc' | 'desc'; function SortHeader({ @@ -42,23 +52,57 @@ function SortHeader({ function PatternsPage() { const [scopeFilter, setScopeFilter] = useState('all'); const [originFilter, setOriginFilter] = useState('all'); + const [sourceFilter, setSourceFilter] = useState('all'); const [searchQuery, setSearchQuery] = useState(''); const [showInactive, setShowInactive] = useState(false); const [selectedPattern, setSelectedPattern] = useState(null); const [sortField, setSortField] = useState('created_at'); const [sortDirection, setSortDirection] = useState('desc'); const [page, setPage] = useState(1); + const [importOpen, setImportOpen] = useState(false); + const [exportOpen, setExportOpen] = useState(false); const limit = 20; const [searchParams, setSearchParams] = useSearchParams(); const { data: patterns, isLoading, error, refetch } = useQuery({ - queryKey: ['patterns', scopeFilter, showInactive], + queryKey: ['patterns', scopeFilter, showInactive, sourceFilter], queryFn: () => getPatterns({ scope: scopeFilter === 'all' ? undefined : scopeFilter, active: showInactive ? undefined : true, + source: sourceFilter === 'all' ? undefined : sourceFilter, }), }); + const { data: syncStatus, refetch: refetchSyncStatus } = useQuery({ + queryKey: ['communitySyncStatus'], + queryFn: getCommunitySyncStatus, + refetchInterval: 60_000, + }); + + async function handleSyncNow() { + try { + await triggerCommunitySync(); + refetchSyncStatus(); + refetch(); + } catch (e) { + // Errors surface via /community-patterns/sync-status lastError. + console.error('Sync failed', e); + } + } + + async function handleToggleProtect(pattern: AdPattern) { + try { + if (pattern.protected_from_sync) { + await unprotectPattern(pattern.id); + } else { + await protectPattern(pattern.id); + } + refetch(); + } catch (e) { + console.error('Protect toggle failed', e); + } + } + const { data: stats } = useQuery({ queryKey: ['patternStats'], queryFn: getPatternStats, @@ -196,10 +240,46 @@ function PatternsPage() {

Ad Patterns

-
- {sortedPatterns?.length || 0} patterns +
+ + {sortedPatterns?.length || 0} patterns + + {syncStatus?.lastRun && ( + + )} + +
+ setImportOpen(false)} + onComplete={() => refetch()} + /> + setExportOpen(false)} + /> {/* Stats Summary */} {stats && ( @@ -280,6 +360,24 @@ function PatternsPage() {
+ {/* Source filter */} +
+ + +
+ {/* Search */}
setSelectedPattern(pattern)} > -
+
#{pattern.id} -
+
{getScopeBadge(pattern)} {pattern.created_by === 'user' && ( Manual )} + {pattern.source === PATTERN_SOURCE_COMMUNITY && pattern.community_id && ( + + )} {getStatusBadge(pattern.is_active)}
+ {pattern.source === PATTERN_SOURCE_COMMUNITY && ( +
+ +
+ )}
{pattern.sponsor || '(Unknown)'}
@@ -363,14 +479,15 @@ function PatternsPage() {
- - - - - - - + + + + + + + + @@ -384,6 +501,9 @@ function PatternsPage() { + @@ -397,13 +517,20 @@ function PatternsPage() { #{pattern.id} + ))} {paginatedPatterns?.length === 0 && ( - diff --git a/frontend/src/pages/Settings.tsx b/frontend/src/pages/Settings.tsx index ec7e5837..7cb738d0 100644 --- a/frontend/src/pages/Settings.tsx +++ b/frontend/src/pages/Settings.tsx @@ -33,6 +33,8 @@ import GlobalDefaultsSection from './settings/GlobalDefaultsSection'; import Podcasting20Section from './settings/Podcasting20Section'; import PromptsSection from './settings/PromptsSection'; import ExperimentsSection from './settings/ExperimentsSection'; +import AdReviewerSection from './settings/AdReviewerSection'; +import CommunityPatternsSection from './settings/CommunityPatternsSection'; import { formatModelLabel } from './settings/settingsUtils'; function SettingsGroupHeader({ title }: { title: string }) { @@ -520,6 +522,10 @@ function Settings() { resetIsPending={resetPromptsMutation.isPending} /> + + + + ({}); + const enabled = draft.enabled ?? data?.updatePatternsFromReviewerAdjustments ?? true; + const threshold = draft.threshold ?? data?.minTrimThreshold ?? 20; + + const save = useMutation({ + mutationFn: () => + updateReviewerSettings({ + updatePatternsFromReviewerAdjustments: enabled, + minTrimThreshold: threshold, + }), + onSuccess: () => { + setDraft({}); + qc.invalidateQueries({ queryKey: ['reviewerSettings'] }); + }, + }); + + return ( + + {isLoading ? ( +

Loading…

+ ) : ( +
+ +

+ When a reviewer narrows an ad's boundaries by more than the threshold + below, the matching local pattern's text is re-extracted from the + new bounds. Community patterns are never auto-rewritten. +

+ + {enabled && ( +
+ + + setDraft((d) => ({ ...d, threshold: parseFloat(e.target.value) || 0 })) + } + className="w-24 px-3 py-1.5 rounded-lg border border-input bg-background text-foreground" + /> + seconds +
+ )} + + + {save.isSuccess && ( + Saved + )} +
+ )} +
+ ); +} + +export default AdReviewerSection; diff --git a/frontend/src/pages/settings/CommunityPatternsSection.tsx b/frontend/src/pages/settings/CommunityPatternsSection.tsx new file mode 100644 index 00000000..98369f7e --- /dev/null +++ b/frontend/src/pages/settings/CommunityPatternsSection.tsx @@ -0,0 +1,222 @@ +import { useState } from 'react'; +import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; +import CollapsibleSection from '../../components/CollapsibleSection'; +import ToggleSwitch from '../../components/ToggleSwitch'; +import { + getCommunitySyncSettings, + updateCommunitySyncSettings, + triggerCommunitySync, + purgeAllCommunityPatterns, +} from '../../api/community'; + +interface Draft { + enabled?: boolean; + cron?: string; +} + +function CommunityPatternsSection() { + const qc = useQueryClient(); + const { data, isLoading } = useQuery({ + queryKey: ['communitySync'], + queryFn: getCommunitySyncSettings, + refetchInterval: 60_000, + }); + + const [draft, setDraft] = useState({}); + const [cronError, setCronError] = useState(null); + const [confirmPurge, setConfirmPurge] = useState(false); + const [purgeResult, setPurgeResult] = useState(null); + const enabled = draft.enabled ?? data?.enabled ?? false; + const cron = draft.cron ?? data?.cron ?? '0 3 * * 0'; + + const save = useMutation({ + mutationFn: () => updateCommunitySyncSettings({ enabled, cron }), + onSuccess: () => { + setCronError(null); + setDraft({}); + qc.invalidateQueries({ queryKey: ['communitySync'] }); + }, + onError: (e: unknown) => + setCronError(e instanceof Error ? e.message : 'Save failed'), + }); + + const syncNow = useMutation({ + mutationFn: triggerCommunitySync, + onSuccess: () => qc.invalidateQueries({ queryKey: ['communitySync'] }), + }); + + const purge = useMutation({ + mutationFn: purgeAllCommunityPatterns, + onSuccess: (res) => { + setPurgeResult(`Removed ${res.deleted} community pattern${res.deleted === 1 ? '' : 's'}.`); + setConfirmPurge(false); + qc.invalidateQueries({ queryKey: ['patterns'] }); + qc.invalidateQueries({ queryKey: ['patternStats'] }); + qc.invalidateQueries({ queryKey: ['communitySync'] }); + }, + onError: (e: unknown) => + setPurgeResult(e instanceof Error ? `Purge failed: ${e.message}` : 'Purge failed'), + }); + + // React Compiler memoizes this automatically; manual useMemo trips the + // preserve-memoization rule because the inferred dep is `data` (broader + // than `data?.lastSummary`). + const lastSummary = (() => { + if (!data?.lastSummary) return null; + try { + return JSON.parse(data.lastSummary) as { + inserted: number; + updated: number; + deleted: number; + skipped: number; + errors: number; + }; + } catch { + return null; + } + })(); + + return ( + + {isLoading ? ( +

Loading…

+ ) : ( +
+ +

+ Pulls a curated list of common-sponsor patterns from the MinusPod + GitHub repository so a fresh install gets coverage without having + to build a library from scratch. Off by default; opt in here. +

+ + {enabled && ( +
+ + setDraft((d) => ({ ...d, cron: e.target.value }))} + placeholder="0 3 * * 0" + className="w-40 px-3 py-1.5 rounded-lg border border-input bg-background text-foreground font-mono text-sm" + /> + UTC +
+ )} + + {cronError && ( +

{cronError}

+ )} + +
+ + + {save.isSuccess && ( + Saved + )} + {syncNow.isError && ( + + {(syncNow.error as Error)?.message || 'Sync failed'} + + )} +
+ +
+
+ Last sync:{' '} + {data?.lastRun ? new Date(data.lastRun).toLocaleString() : 'never'} +
+ {data?.manifestVersion && ( +
+ Manifest version:{' '} + {data.manifestVersion} +
+ )} + {lastSummary && ( +
+ Last result:{' '} + {lastSummary.inserted} added, {lastSummary.updated} updated,{' '} + {lastSummary.deleted} removed, {lastSummary.skipped} skipped,{' '} + {lastSummary.errors} errors. +
+ )} + {data?.lastError && ( +
+ Last error: {data.lastError} +
+ )} +
+ +
+

Remove all community patterns

+

+ Wipes every pattern with source=community from this instance, including any + you marked Protect from sync. Local and imported patterns are left alone. + If sync is on, the next tick repopulates. +

+ {confirmPurge ? ( +
+ + +
+ ) : ( + + )} + {purgeResult && ( +

+ {purgeResult} +

+ )} +
+
+ )} +
+ ); +} + +export default CommunityPatternsSection; diff --git a/openapi.yaml b/openapi.yaml index 575956e7..519518e5 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -12,7 +12,7 @@ info: - Monitor system status and trigger cleanup operations - Manage cross-episode ad patterns with network and podcast scope - Submit corrections to improve ad detection accuracy - version: 2.3.4 + version: 2.4.6 contact: name: MinusPod license: @@ -3908,6 +3908,415 @@ paths: type: integer description: Number of pending items removed + /patterns/bulk-delete: + post: + summary: Bulk-delete patterns + description: | + Hard-delete patterns by id or by source filter. Both `confirm: true` + and `expected_count: N` are required; the call is rejected if the + actual matched count does not equal `expected_count`. + tags: [Patterns] + requestBody: + required: true + content: + application/json: + schema: + type: object + required: [confirm, expected_count] + properties: + ids: + type: array + items: { type: integer } + source: + type: string + enum: [local, community, imported] + confirm: + type: boolean + expected_count: + type: integer + responses: + '200': + description: Deletion summary + content: + application/json: + schema: + type: object + properties: + deleted: { type: integer } + ids: + type: array + items: { type: integer } + '400': + description: Validation failure + + /patterns/bulk-disable: + post: + summary: Bulk-disable patterns + description: Same shape as bulk-delete; sets is_active=0 instead. + tags: [Patterns] + requestBody: + required: true + content: + application/json: + schema: { $ref: '#/components/schemas/BulkPatternOp' } + responses: + '200': + description: Disable summary + '400': + description: Validation failure + + /patterns/preview-export: + post: + summary: Dry-run the community export pipeline for a list of pattern ids + description: | + Returns which patterns would pass the quality gates and which would + be rejected, with per-id reasons. Used by the UI to show a preview + before the user builds the actual submission bundle. + tags: [Patterns] + requestBody: + required: true + content: + application/json: + schema: + type: object + required: [ids] + properties: + ids: + type: array + items: { type: integer } + minItems: 1 + responses: + '200': + description: Preview result + content: + application/json: + schema: + type: object + properties: + ready: + type: array + items: { type: integer } + rejected: + type: array + items: + type: object + properties: + id: { type: integer } + sponsor: { type: string, nullable: true } + reasons: + type: array + items: { type: string } + ready_count: { type: integer } + rejected_count: { type: integer } + pattern_count: { type: integer } + '400': + description: ids list missing or empty + + /patterns/submit-bundle: + post: + summary: Build a downloadable community-submission bundle + description: | + Runs the export pipeline on each id and returns a single JSON file + (Content-Disposition: attachment) containing every pattern that + passed quality gates. The file is the artifact the contributor + commits into `patterns/community/` to open one PR for all the + selected patterns. The PR-side validator and the manifest builder + both handle the bundle format natively, so the maintainer does + not need to split it after merge. 
+ tags: [Patterns] + requestBody: + required: true + content: + application/json: + schema: + type: object + required: [ids] + properties: + ids: + type: array + items: { type: integer } + minItems: 1 + responses: + '200': + description: Bundle JSON file (downloaded as attachment) + headers: + Content-Disposition: + schema: { type: string } + description: 'attachment; filename="minuspod-submission-XXXX.json"' + X-Bundle-Pattern-Count: + schema: { type: integer } + X-Bundle-Rejected-Count: + schema: { type: integer } + content: + application/json: + schema: + type: object + properties: + format: { type: string, example: minuspod-community-submission } + bundle_version: { type: integer } + submitted_at: { type: string, format: date-time } + submitted_app_version: { type: string } + pattern_count: { type: integer } + patterns: + type: array + items: { type: object } + '400': + description: ids missing, or every pattern failed quality gates + + /patterns/{pattern_id}/submit-to-community: + post: + summary: Submit a local pattern to the community repository + description: | + Runs the community export pipeline: quality gates, tag validation, + PII strip, metadata strip, sponsor classification, and returns a + prefilled GitHub PR URL plus the JSON payload. When `too_large` is + true the URL exceeds GitHub's limit and the client should offer + the JSON file as a download instead. + tags: [Patterns] + parameters: + - in: path + name: pattern_id + required: true + schema: { type: integer } + responses: + '200': + description: Export result + content: + application/json: + schema: + type: object + properties: + payload: { type: object } + filename: { type: string } + pr_url: { type: string } + too_large: { type: boolean } + sponsor_match: + type: string + enum: [exact, alias, fuzzy, unknown] + '400': + description: Pattern failed export gates + + /patterns/{pattern_id}/protect: + post: + summary: Mark a community pattern as protected from sync + tags: [Patterns] + parameters: + - in: path + name: pattern_id + required: true + schema: { type: integer } + responses: + '200': + description: Updated protection state + '400': + description: Pattern is not source=community + delete: + summary: Clear the protected_from_sync flag on a community pattern + tags: [Patterns] + parameters: + - in: path + name: pattern_id + required: true + schema: { type: integer } + responses: + '200': + description: Updated protection state + + /feeds/{slug}/tags: + get: + summary: Get a feed's tag breakdown (rss / episode / user / effective) + tags: [Feeds] + parameters: + - in: path + name: slug + required: true + schema: { type: string } + responses: + '200': + description: Tag breakdown + content: + application/json: + schema: + type: object + properties: + effective: { type: array, items: { type: string } } + rss: { type: array, items: { type: string } } + episode: { type: array, items: { type: string } } + user: { type: array, items: { type: string } } + '404': + description: Feed not found + put: + summary: Update a feed's user-added tags + tags: [Feeds] + parameters: + - in: path + name: slug + required: true + schema: { type: string } + requestBody: + required: true + content: + application/json: + schema: + type: object + required: [user_tags] + properties: + user_tags: + type: array + items: { type: string } + responses: + '200': + description: Updated tag breakdown + '400': + description: Unknown or invalid tag(s) + + /sponsors/{sponsor_id}/tags: + put: + summary: Update a sponsor's tags + tags: [Sponsors] + 
parameters: + - in: path + name: sponsor_id + required: true + schema: { type: integer } + requestBody: + required: true + content: + application/json: + schema: + type: object + required: [tags] + properties: + tags: + type: array + items: { type: string } + responses: + '200': + description: Updated sponsor tags + '404': + description: Sponsor not found + + /settings/reviewer: + get: + summary: Read ad-reviewer auto-update settings + tags: [Settings] + responses: + '200': + description: Current settings + content: + application/json: + schema: + type: object + properties: + updatePatternsFromReviewerAdjustments: { type: boolean } + minTrimThreshold: { type: number } + put: + summary: Update ad-reviewer auto-update settings + tags: [Settings] + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + updatePatternsFromReviewerAdjustments: { type: boolean } + minTrimThreshold: { type: number, minimum: 1, maximum: 120 } + responses: + '200': + description: Updated settings + + /settings/community-sync: + get: + summary: Read community-pattern sync settings + tags: [Settings] + responses: + '200': + description: Current settings + content: + application/json: + schema: { $ref: '#/components/schemas/CommunitySyncSettings' } + put: + summary: Update community-pattern sync settings + tags: [Settings] + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + enabled: { type: boolean } + cron: { type: string } + responses: + '200': + description: Updated settings + + /community-patterns/sync: + post: + summary: Force a community-pattern sync now + description: Rate limited to 6/hour. + tags: [CommunityPatterns] + responses: + '200': + description: Sync summary + content: + application/json: + schema: { $ref: '#/components/schemas/CommunitySyncSummary' } + '429': + description: Rate-limited + '502': + description: Manifest fetch or apply failed + + /community-patterns/sync-status: + get: + summary: Get community-pattern sync status + tags: [CommunityPatterns] + responses: + '200': + description: Status + content: + application/json: + schema: { $ref: '#/components/schemas/CommunitySyncSettings' } + + /community-patterns/all: + delete: + summary: Remove every community pattern on this instance + description: | + Hard-deletes every row where ``source='community'``. Ignores + ``protected_from_sync``; the flag guards sync reconciliation, + not an explicit operator purge. Local and imported patterns are + untouched. If sync is enabled, the next tick repopulates from + the manifest. + + Requires ``{"confirm": true}`` in the body as a fat-finger guard + (matches the ``/patterns/bulk-delete`` convention). 
+ tags: [CommunityPatterns] + requestBody: + required: true + content: + application/json: + schema: + type: object + required: [confirm] + properties: + confirm: + type: boolean + const: true + responses: + '200': + description: Deletion count + content: + application/json: + schema: + type: object + properties: + deleted: + type: integer + example: 47 + '400': + description: confirm flag missing or not true + security: - sessionCookie: [] @@ -3979,6 +4388,49 @@ components: example: 550e8400-e29b-41d4-a716-446655440000 schemas: + BulkPatternOp: + type: object + required: [confirm, expected_count] + properties: + ids: + type: array + items: { type: integer } + source: + type: string + enum: [local, community, imported] + confirm: { type: boolean } + expected_count: { type: integer } + + CommunitySyncSettings: + type: object + properties: + enabled: { type: boolean } + cron: { type: string } + lastRun: + type: string + nullable: true + lastError: + type: string + nullable: true + manifestVersion: + type: string + nullable: true + lastSummary: + type: string + nullable: true + description: JSON-encoded summary of the last sync (counts + manifest version) + + CommunitySyncSummary: + type: object + properties: + inserted: { type: integer } + updated: { type: integer } + deleted: { type: integer } + skipped: { type: integer } + errors: { type: integer } + manifest_version: { type: string } + fetched_at: { type: string } + Feed: type: object properties: diff --git a/patterns/CONTRIBUTING.md b/patterns/CONTRIBUTING.md new file mode 100644 index 00000000..30bf6cac --- /dev/null +++ b/patterns/CONTRIBUTING.md @@ -0,0 +1,175 @@ +# Contributing community patterns + +This explains what happens when you submit a pattern, what gets stripped before submission, and what the automated checks look for. + +----- + +## What is a community pattern + +A community pattern is an ad pattern from your local MinusPod instance that you've chosen to share. Once accepted, it ships to other MinusPod users via the periodic sync. You retain everything you had locally. Submission is a copy, not a move. + +----- + +## What gets submitted + +When you pick **Submit to community** in the Export dialog, the app runs quality gates over your selection, lets you preview the ready vs rejected split (with reasons), and downloads one bundle file. The bundle has this shape: + +```json +{ + "format": "minuspod-community-submission", + "bundle_version": 1, + "submitted_at": "...", + "submitted_app_version": "2.4.5", + "pattern_count": N, + "patterns": [ /* one entry per pattern that passed quality gates */ ] +} +``` + +Each entry in `patterns[]` includes: + +- Pattern text (`text_template`, `intro_variants`, `outro_variants`) +- Sponsor name and aliases +- Tags +- Pattern scope and matching parameters +- A fresh UUID (`community_id`) +- Version number (starts at 1) +- Submission timestamp +- App version that submitted the pattern + +The bundle does not include any data identifying you, your podcasts, or your listening habits. The PR-side validator handles bundle files natively (one validation per entry), and the manifest builder flattens them into per-pattern entries in `patterns/community/index.json`, so the maintainer does not have to split them on merge. 
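Since the bundle is plain JSON, you can sanity-check a download locally before committing it to your fork. A minimal sketch, assuming the per-pattern entries use the same field names as the files in `patterns/community/` (stdlib only; the filename is whatever your download was called):

```python
import json
import sys

# Inspect a downloaded submission bundle before opening a PR.
# Usage: python inspect_bundle.py minuspod-submission-XXXX.json
with open(sys.argv[1], encoding="utf-8") as f:
    bundle = json.load(f)

assert bundle["format"] == "minuspod-community-submission"
print(f"{bundle['pattern_count']} pattern(s), bundle_version {bundle['bundle_version']}")
for p in bundle["patterns"]:
    # Each entry is self-contained: sponsor, tags, text, and a fresh UUID.
    print(f"- {p['sponsor']} [{p['community_id'][:8]}] "
          f"{len(p['text_template'])} chars, tags={p.get('sponsor_tags', [])}")
```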
+
+-----
+
+## What gets stripped before submission
+
+### Local identifiers
+
+The following fields are removed entirely:
+
+- Local pattern ID
+- Podcast ID and network ID
+- DAI platform
+- Created/updated timestamps
+- Match counts and last match timestamp
+- Confirmation count and false positive count
+- Local reviewer notes
+- The `protected_from_sync` flag
+- The `source` field (set fresh by the importer)
+
+These reveal which podcasts you listen to.
+
+### PII in the pattern text
+
+Pattern text is scanned and the following are stripped:
+
+**Email addresses with consumer domains.** Emails at these domains are removed: `gmail.com`, `yahoo.com`, `aol.com`, `hotmail.com`, `outlook.com`, `icloud.com`, `me.com`, `mac.com`, `protonmail.com`, `proton.me`, `mail.com`, `gmx.com`, `gmx.net`, `yandex.com`, `yandex.ru`, `qq.com`, `163.com`, `live.com`, `msn.com`, `hey.com`, `fastmail.com`, `tutanota.com`. Business emails like `support@nordvpn.com` are kept because they are part of the sponsor's actual ad copy.
+
+**Phone numbers that are not toll-free.** Toll-free numbers are kept (US/CA `800`, `833`, `844`, `855`, `866`, `877`, `888`; UK `0800`, `0808`; AU `1800`; international `+800`). Anything else matching a phone pattern is removed.
+
+The PII strip list is best-effort and tunable. Open an issue if you find a gap.
+
+-----
+
+## Quality checks before submission
+
+The app refuses to submit if any of these are true:
+
+- Pattern text is shorter than 50 characters
+- Pattern text is longer than 3500 characters
+- Pattern duration is longer than 120 seconds
+- You have not confirmed the pattern at least once locally
+- Your false positive count exceeds your confirmation count
+- The pattern is not tied to a single sponsor
+- The assigned sponsor's name (or any known alias) does not appear in the pattern text
+- A different sponsor's name appears in the pattern text (multi-sponsor contamination)
+- Any tag on the pattern is not in the canonical vocabulary
+
+When a check fails, the app shows which one and does not generate a submission.
+
+-----
+
+## What happens after you click submit
+
+1. The app runs the quality gates and shows a preview: how many patterns will pass, and the reasons for any rejections.
+1. You confirm; `minuspod-submission-XXXX.json` (where `XXXX` is a generated suffix) downloads to your machine.
+1. You fork `ttlequals0/MinusPod`, drop the file into `patterns/community/` on a new branch, commit, push, and open a PR. A CLI snippet is shown right after the download.
+1. The GitHub Action validates the bundle (one validation per entry).
+1. A maintainer reviews the PR.
+1. If accepted, every pattern in the bundle joins the next published manifest and reaches other instances on their next sync.
+
+You need git installed (or the `gh` CLI) for the last step. The app does not push anything on your behalf.
+
+-----
+
+## What the automated PR checks do
+
+As a safety net, the GitHub Action re-runs the same quality checks as the in-app submission, plus dedupe.
+
+### Re-validation
+
+All quality checks run again. If something was missed (or someone hand-edited the JSON), the action catches it.
+
+### Single-pattern check
+
+Each submitted file must describe exactly one ad. The action scans `text_template` for the names (and aliases) of any other seed sponsor; if any match on a word boundary, the PR is rejected with the list of foreign sponsors found. Usually that means a multi-sponsor ad block got pasted in. Trim the text to one sponsor and resubmit.
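At its core, the check is one case-insensitive word-boundary search per seed sponsor. A rough sketch of the idea, not the validator's actual code (the real check lives in `src.tools.community_pattern_validator` and reads the seed list from `src/seed_data/sponsors_final.csv`):

```python
import re

def foreign_sponsors(text: str, own_sponsor: str,
                     seed: dict[str, list[str]]) -> list[str]:
    """Seed sponsors, other than the pattern's own, whose name or any
    alias appears in the pattern text on a word boundary."""
    hits = []
    for name, aliases in seed.items():
        if name.lower() == own_sponsor.lower():
            continue  # the pattern's own sponsor is expected to appear
        if any(re.search(rf"\b{re.escape(c)}\b", text, re.IGNORECASE)
               for c in (name, *aliases)):
            hits.append(name)
    return hits
```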
Reviewers should eyeball this too: automation catches obvious stitches but not edge cases like a sponsor that isn't in the seed list yet.
+
+### Tag validation
+
+Every tag is checked against the canonical vocabulary. Unknown tags fail the check.
+
+### Sponsor validation
+
+The sponsor name is looked up in the seed list. An exact or alias match passes silently. A fuzzy match passes but the action flags it in a comment for the reviewer to confirm. An unknown sponsor also passes (a new sponsor isn't a rejection) but the action flags it in a comment for maintainer triage.
+
+### Dedupe
+
+The action canonicalizes the new pattern's text (lowercase, strip punctuation, remove stopwords, dates, day names) and compares it against every existing community pattern for the same sponsor:
+
+- **95% or higher similarity** → DUPLICATE. PR rejected. Comment points to the existing pattern.
+- **75% to 94% similarity** → VARIANT. PR passes. Comment suggests merging the new text into the existing pattern's variants list. The maintainer decides during review.
+- **Less than 75% similarity** → DISTINCT. Accepted as a new pattern.
+
+Genuinely different ads from the same sponsor are expected. NordVPN has had many ad scripts; each one is a distinct pattern.
+
+-----
+
+## What the maintainer does
+
+The maintainer reviews the PR and either:
+
+- Merges as-is
+- Asks for revisions
+- Merges after manually applying a variant merge
+- Closes as duplicate
+- Triages a flagged sponsor before merging
+
+Approved patterns land in the next published manifest and reach opted-in instances on their next sync.
+
+This process is new and not set in stone. Open an issue if you have ideas to improve it.
+
+-----
+
+## What does NOT get shared
+
+- Your username, email, IP, or any account identifier (you do not have one)
+- Names of podcasts you listen to
+- Episode information
+- Any metric about your listening habits
+- Any local config from your MinusPod instance
+- Anything that could deanonymize you across submissions
+
+The only personally-attributable trace is the GitHub PR itself, opened from your GitHub account. For anonymity at the GitHub level, use an account that is not tied to your identity. MinusPod does not handle this.
+
+-----
+
+## Questions or issues
+
+Open an issue on the main MinusPod repo if:
+
+- A PII pattern is being missed by the strip rules
+- A tag in the vocabulary needs a change
+- A sponsor needs to be added to the seed list
+- The PR validator gives an unclear error
+- A pattern was incorrectly rejected as a duplicate
+
+This document evolves with the project.
\ No newline at end of file
diff --git a/patterns/README.md b/patterns/README.md
new file mode 100644
index 00000000..afc308d3
--- /dev/null
+++ b/patterns/README.md
@@ -0,0 +1,219 @@
+# MinusPod community patterns
+
+This directory holds the crowdsourced ad pattern set. Each file is one pattern. Other MinusPod instances pull the manifest from this directory on a schedule (opt-in) and import patterns so a fresh install benefits from coverage built up elsewhere.
+
+-----
+
+## Directory structure
+
+```
+patterns/
+├── README.md ← this file
+├── CONTRIBUTING.md ← submitter-facing PR explainer
+├── community/
+│ ├── index.json ← manifest published to clients
+│ ├── <sponsor>-<id>.json ← one file per pattern
+│ └── ...
+└── vocabulary.json ← reference copy of the canonical tag list (canonical version lives in the app code) +``` + +----- + +## How sync works + +Opted-in MinusPod instances fetch: + +``` +https://raw.githubusercontent.com/ttlequals0/MinusPod/main/patterns/community/index.json +``` + +on a configurable cron schedule (default weekly). The manifest lists every published pattern with its `community_id` and `version`. The client: + +- Inserts patterns it has not seen before +- Updates patterns whose `version` is higher than the local copy +- Deletes patterns that are no longer in the manifest + +Patterns the user has edited locally or pinned with **Protect from sync** are skipped on update and delete. + +Failed or partial fetches make zero changes. Deletion only happens on a fully successful pull. + +----- + +## Manifest format + +```json +{ + "manifest_version": 1, + "published_at": "2026-05-14T00:00:00Z", + "vocabulary_version": 1, + "patterns": [ + { + "community_id": "uuid-string", + "version": 1, + "data": { ... full pattern JSON inline ... } + } + ] +} +``` + +Patterns are embedded inline so the client fetches everything in a single request. The individual files in `community/` are the source of truth for review and curation; the manifest is regenerated from them on publish. + +`vocabulary_version` lets clients detect when the tag vocabulary has changed and refresh accordingly. + +----- + +## Pattern file format + +```json +{ + "community_id": "uuid-string", + "version": 1, + "submitted_at": "2026-05-14T00:00:00Z", + "submitted_app_version": "2.3.1", + "sponsor": { + "name": "NordVPN", + "aliases": ["Nord VPN"], + "tags": ["vpn", "tech", "security", "universal"] + }, + "sponsor_match": "exact", + "scope": "global", + "text_template": "...", + "intro_variants": ["...", "..."], + "outro_variants": ["...", "..."], + "matching_params": { + "confidence_threshold": 0.85, + "min_text_length": 50 + } +} +``` + +Fields: + +- `community_id` - stable identifier across all instances +- `version` - increments when the pattern is updated upstream +- `submitted_at` - ISO 8601 timestamp of the original submission +- `submitted_app_version` - version of the submitting MinusPod app, used for triage +- `sponsor.name` / `sponsor.aliases` - sponsor identity, looked up against the seed list on import +- `sponsor.tags` - multi-tag classification, all values must exist in the vocabulary +- `sponsor_match` - set by the app on submission: `exact`, `alias`, `fuzzy`, or `unknown` +- `scope` - always `global` for community patterns +- `text_template` / `intro_variants` / `outro_variants` - the actual matched text +- `matching_params` - pattern-level matching configuration + +----- + +## Tag vocabulary + +49 tags in a flat namespace. The canonical source is `src/seed_data/tag_vocabulary.csv` in the MinusPod app code (read by `src/utils/community_tags.py`). A reference copy lives in `vocabulary.json` alongside the patterns for human readability. 
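Checking a pattern's tags against the vocabulary is plain set membership. A sketch, assuming the reference copy is a flat JSON array of tag strings (an assumption about this file's shape; the canonical loader is `src/utils/community_tags.py`):

```python
import json

# Assumption: vocabulary.json holds a flat JSON array of tag strings.
with open("patterns/vocabulary.json", encoding="utf-8") as f:
    vocab = set(json.load(f))

def unknown_tags(pattern: dict) -> set[str]:
    """Tags on a community pattern file that are not in the vocabulary."""
    return set(pattern.get("sponsor_tags", [])) - vocab
```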
+
+Tag categories (informal grouping for documentation only; they are all in one namespace):
+
+- **Podcast genres** (26): `arts`, `books`, `business`, `comedy`, `education`, `language_learning`, `self_improvement`, `fiction`, `history`, `health`, `mental_health`, `kids_family`, `leisure`, `gaming`, `automotive`, `music`, `news`, `politics`, `religion`, `science`, `society_culture`, `travel`, `sports`, `technology`, `true_crime`, `tv_film`
+- **Sponsor industries** (22): `tech`, `saas`, `vpn`, `security`, `finance`, `insurance`, `food`, `meal_kit`, `beverage`, `supplements`, `apparel`, `home_goods`, `mattress`, `home_security`, `personal_care`, `auto`, `telecom`, `jobs`, `streaming`, `gambling`, `nicotine`, `dtc`
+- **Special** (1): `universal` (sponsor advertises across all podcast genres)
+
+### Matching rule
+
+A community pattern is eligible for a podcast when ANY of the following is true:
+
+1. The sponsor has the `universal` tag
+1. The sponsor's tag set and the podcast's tag set share at least one tag
+1. The sponsor has no tags (fallback)
+1. The podcast has no tags (fallback)
+
+This filter runs before any fuzzy text matching. Patterns filtered out by tags never enter the matching loop, which is how the system stays fast as the community set grows.
+
+### Podcast tagging
+
+Podcasts carry their own tag set from the same vocabulary. The effective tag set is the union of three sources. The RSS category element is parsed when the podcast is added and mapped to vocabulary tags via a fixed table (e.g. `Technology` → `technology`, `True Crime` → `true_crime`). Episode-level RSS category tags add to the effective set for that episode. User-added tags entered through the UI are useful when the RSS metadata is missing or wrong, or when a specific episode departs from the podcast's normal genre.
+
+User-added tags never override or replace RSS-derived tags; they only add to them. A podcast with no tags from any source falls back to matching all community patterns (matching rule 4).
+
+API:
+
+- `GET /api/feeds/{slug}/tags` - returns the effective tag union with source breakdown
+- `PUT /api/feeds/{slug}/tags` - updates the user-added tag layer (the only mutable layer)
+
+-----
+
+## How to submit a pattern
+
+Open the Export dialog on the Patterns page and pick the **Submit to community** destination. The app runs the quality gates, shows which patterns will pass and which will be rejected (with reasons), and downloads one bundle file with every passing pattern. Drop the file into your fork of `patterns/community/` and open one PR.
+
+See `CONTRIBUTING.md` for the full explainer on what gets submitted, what gets stripped, and what the automated checks look for.
+
+Manual submission (without the app) is possible but discouraged. You would need to:
+
+1. Hand-craft a pattern JSON file matching the format above
+1. Open a PR adding the file to `patterns/community/`
+1. The GitHub Action will validate; expect the same checks as automatic submission
+
+-----
+
+## How to add a sponsor to the seed list
+
+The authoritative seed list lives in `src/seed_data/sponsors_final.csv` and is loaded into `known_sponsors` by the v2.4.0 schema migration. Adding a sponsor is a PR to that CSV; the next instance startup picks it up via the reseed migration step.
+
+When a community pattern is submitted with an unknown sponsor, the GitHub Action flags it.
A maintainer then either: + +- Opens a follow-up PR adding the sponsor to the seed list, then merges the original pattern PR +- Decides the submitted sponsor is actually an alias of an existing one and updates the alias list +- Closes the PR if the sponsor is not appropriate for community sharing + +----- + +## Reviewer workflow + +Maintainer responsibilities for incoming PRs: + +1. Check the Action result on the PR. Red checks must be resolved before review proceeds. +1. Read the Action's comment for context (variant suggestions, sponsor flags, dedupe notes). +1. Verify the pattern looks reasonable: real ad copy, no obvious junk, sponsor identification correct. +1. If the Action flagged a variant suggestion and the suggestion is correct, manually edit the existing pattern file in the PR to add the new text to its `intro_variants` or `outro_variants` array, then close the new-file PR. (Auto-apply is deferred to a later release.) +1. If a sponsor is flagged as unknown, decide whether to add it to the seed list (separate PR) or treat it as an alias. +1. Approve and merge when satisfied. + +----- + +## Operational notes + +### Publishing a new manifest + +The manifest at `community/index.json` is regenerated automatically on every push to `main` that touches `patterns/community/`. The `regenerate-manifest` GitHub Action scans `patterns/community/*.json`, bumps `published_at`, embeds all patterns inline, and commits the updated `index.json` back to `main`. + +Maintainers do not need to run anything manually. Merge the pattern PR and the manifest updates within a minute. + +For local testing or recovery from a workflow failure, the same logic can be invoked manually: + +``` +python -m src.tools.generate_manifest +``` + +This rewrites `index.json` in place. Commit it like any other file. + +### Vocabulary changes + +When new tags are added or removed from the canonical vocabulary: + +1. Update `src/seed_data/tag_vocabulary.csv` in app code (requires app release). +1. Update `vocabulary.json` in this directory to match. +1. Bump `vocabulary_version` in the manifest (handled automatically by the regeneration workflow when the source files change). +1. Clients refresh their reference on the next sync. + +### Removing a bad pattern + +Open a PR removing the JSON file from `patterns/community/`. On merge, the regeneration workflow rebuilds the manifest without the removed pattern. On the next client sync, instances delete the pattern (unless the user has pinned it locally). + +----- + +## Privacy + +No personal information is captured in any pattern file. See `CONTRIBUTING.md` for the full list of what gets stripped before submission. + +The GitHub PR author identity is visible because PRs are public. This is a property of GitHub, not MinusPod. Submitters who want anonymity at that level should use a separate GitHub account. + +----- + +## Questions + +Open an issue on the main MinusPod repo for anything not covered here. \ No newline at end of file diff --git a/patterns/community/capital-one-44026a0f.json b/patterns/community/capital-one-44026a0f.json new file mode 100644 index 00000000..6a88c61c --- /dev/null +++ b/patterns/community/capital-one-44026a0f.json @@ -0,0 +1,22 @@ +{ + "scope": "podcast", + "text_template": "brought to you by Capital One. Capital One's tech team isn't just talking about multi-agentic AI. They already deployed one. It's called Chat Concierge and it's simplifying car shopping. 
Using self-reflection and layered reasoning with live API checks, it doesn't just help buyers find a car they love. It helps schedule a test drive, get pre-approved for financing, and estimate trade-in value. Advanced, intuitive, and deployed. That's how they stack. That's technology at Capital One.", + "intro_variants": [ + "brought to you by Capital One. Capital One's tech team isn't just talking about multi-agentic AI. They already deployed one. It's called Chat Concierge and it's simplifying car shopping. Using self-reflection and layered reasoning with live API checks, it doesn't just help buyers find a car they love. It helps" + ], + "outro_variants": [ + "" + ], + "avg_duration": null, + "sponsor": "Capital One", + "sponsor_aliases": [], + "sponsor_tags": [ + "finance", + "universal" + ], + "community_id": "44026a0f-9d7a-4f39-815e-5b12575b8107", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.481669+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/carvana-a934bf9a.json b/patterns/community/carvana-a934bf9a.json new file mode 100644 index 00000000..8d5436d4 --- /dev/null +++ b/patterns/community/carvana-a934bf9a.json @@ -0,0 +1,22 @@ +{ + "scope": "podcast", + "text_template": " just sold my car online. Let's go, Grandpa. Wait, you did? Yep, on Carvana. Just put in the license plate, answered a few questions, got an offer in minutes. Easier than setting up that new digital picture frame. You don't say. Yeah, they're even picking it up tomorrow. Talk about fast. Wow, way to go. So, about that picture frame. Ah, forget about it. Until Carvana makes one, I'm not interested.", + "intro_variants": [ + "I just sold my car online. Let's go, Grandpa. Wait, you did? Yep, on Carvana. Just put in the license plate, answered a few questions, got an offer in minutes. Easier than setting up that new digital picture frame. You don't say. Yeah, they're even picking it up tomorrow." + ], + "outro_variants": [ + "Yeah, they're even picking it up tomorrow. Talk about fast. Wow, way to go. So, about that picture frame. Ah, forget about it. Until Carvana makes one, I'm not interested." + ], + "avg_duration": null, + "sponsor": "Carvana", + "sponsor_aliases": [], + "sponsor_tags": [ + "auto", + "universal" + ], + "community_id": "a934bf9a-ebcc-4d64-b734-b82a6de7f25a", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.295723+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/carvana-f41cf617.json b/patterns/community/carvana-f41cf617.json new file mode 100644 index 00000000..9135aba3 --- /dev/null +++ b/patterns/community/carvana-f41cf617.json @@ -0,0 +1,22 @@ +{ + "scope": "podcast", + "text_template": "Carvana's so easy, just a click and we've got ourselves a car. See? So many cars. That's a click-tastic inventory. And check out the financing options. Payments to fit our budget. I mean, that's... Clickonomics 101. Delivery to our door. Just a hop, skip, and a click away. And... bought. No better feeling than when everything just... clicks. Buy your car today on... Carvana. Delivery fees may apply.", + "intro_variants": [ + "Carvana's so easy, just a click and we've got ourselves a car. See? So many cars. That's a click-tastic inventory. And check out the financing options. Payments to fit our budget. I mean, that's... Clickonomics 101. Delivery to our door. Just a hop, skip, and a click away. And... bought." + ], + "outro_variants": [ + "our door. 
Just a hop, skip, and a click away. And... bought. No better feeling than when everything just... clicks. Buy your car today on... Carvana. Delivery fees may apply." + ], + "avg_duration": null, + "sponsor": "Carvana", + "sponsor_aliases": [], + "sponsor_tags": [ + "auto", + "universal" + ], + "community_id": "f41cf617-6ad0-4df5-a07a-a4daeb449697", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.293654+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/index.json b/patterns/community/index.json new file mode 100644 index 00000000..b00df5e1 --- /dev/null +++ b/patterns/community/index.json @@ -0,0 +1,329 @@ +{ + "manifest_version": 1, + "published_at": "2026-05-15T16:45:06Z", + "vocabulary_version": 1, + "patterns": [ + { + "community_id": "3c348177-fdf5-4f4f-a3de-25f4e65bf591", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "What are you reaching for? If you're a smoker or vaper, you could be reaching for so much more with Zyn Nicotine Pouches. When you reach for Zyn, you're reaching for 10 satisfying varieties and two strengths, for a smoke-free experience that lets you lean in, for chances to break free from your routine, and a unique nationwide community. Whatever you're reaching for, reach for it with America's number one nicotine pouch brand. Find your Zyn wherever nicotine products are sold near you. This product contains nicotine. Nicotine is an addictive chemical. ", + "intro_variants": [], + "outro_variants": [], + "avg_duration": null, + "sponsor": "Zyn", + "sponsor_aliases": [ + "ZYN", + "Zinn" + ], + "sponsor_tags": [ + "nicotine", + "universal" + ], + "community_id": "3c348177-fdf5-4f4f-a3de-25f4e65bf591", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.098726+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "6b1b16df-70f4-4cd2-9e3c-18d2559b3195", + "version": 1, + "data": { + "scope": "global", + "text_template": "Zerotrust is clearly the future as threats get faster, quieter, and harder to detect. But implementing it shouldn't disrupt the business. ThreatLocker enforces default deny and execution in a way that remains enterprise ready, scalable, and operationally clean. Unknown software is stopped cold, trusted apps stay contained, and drift is locked down across the environment. It's zero trust that works in real enterprises and prepares you for the threats ahead. See why CISOs are adopting it at .com forward slash ThreatLocker.", + "intro_variants": [ + "Zerotrust is clearly the future as threats get faster, quieter, and harder to detect. But implementing it shouldn't disrupt the business. ThreatLocker enforces default deny and execution in a way that" + ], + "outro_variants": [ + "see why CISOs are adopting it at .com forward slash ThreatLocker." + ], + "avg_duration": null, + "sponsor": "ThreatLocker", + "sponsor_aliases": [], + "sponsor_tags": [ + "tech", + "security", + "saas" + ], + "community_id": "6b1b16df-70f4-4cd2-9e3c-18d2559b3195", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.123354+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "3f2f65ff-c654-42ca-a724-d6dce1f45881", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "Kayak gets my flight, hotel, and rental car right, so I can tune out travel advice that's just plain wrong. Bro, Skycoin. 
way better than points never fly during a Scorpio full moon just tell the manager you'll sue instant room upgrade stop taking bad travel advice start comparing hundreds of sites with Kayak and get your trip right Kayak got that right", + "intro_variants": [ + "Kayak gets my flight, hotel, and rental car right, so I can tune out travel advice that's just plain wrong." + ], + "outro_variants": [ + "way better than points never fly during a Scorpio full moon just tell the manager you'll sue instant room upgrade stop taking bad travel advice start comparing hundreds of sites with Kayak and get your trip right Kayak got that right" + ], + "avg_duration": null, + "sponsor": "Kayak", + "sponsor_aliases": [], + "sponsor_tags": [ + "travel", + "universal" + ], + "community_id": "3f2f65ff-c654-42ca-a724-d6dce1f45881", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.173447+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "1c07273d-4e1c-41fd-93dd-0455904ada87", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "\brought to you by Progressive Insurance. Do you ever find yourself playing the budgeting game? Well, with the Name Your Price tool from Progressive, you can find options that fit your budget and potentially lower your bills. Try it at Progressive.com. Progressive casualty insurance company and affiliates. Price and coverage match limited by state law. Not available in all states.", + "intro_variants": [ + "brought to you by Progressive Insurance. Do you ever find yourself playing the budgeting game?" + ], + "outro_variants": [ + "Price and coverage match limited by state law. Not available in all states." + ], + "avg_duration": null, + "sponsor": "Progressive", + "sponsor_aliases": [], + "sponsor_tags": [ + "insurance", + "universal" + ], + "community_id": "1c07273d-4e1c-41fd-93dd-0455904ada87", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.266967+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "17b2b4b8-660e-4ab8-96c0-c51a0dc3163d", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "Ryan Reynolds here from Mint Mobile. I don't know if you knew this, but anyone can get the same premium wireless for $15 a month plan that I've been enjoying. It's not just for celebrities. So do like I did and have one of your assistant's assistants switch you to Mint Mobile today. I'm told it's super easy to do at mintmobile.com slash switch. Upfront payment of $45 for three-month plan equivalent to $15 per month required. Intro rate first three months only, then full price plan options available. Taxes and fees extra. See full terms at mintmobile.com.", + "intro_variants": [ + "Ryan Reynolds here from Mint Mobile. I don't know if you knew this, but anyone can get the same premium wireless for $15 a month plan that I've been enjoying. It's not just for celebrities. So do like I did and have one of your assistant's assistants switch you to" + ], + "outro_variants": [ + "for three-month plan equivalent to $15 per month required. Intro rate first three months only, then full price plan options available. Taxes and fees extra. See full terms at mintmobile.com." 
+ ], + "avg_duration": null, + "sponsor": "Mint Mobile", + "sponsor_aliases": [ + "MintMobile" + ], + "sponsor_tags": [ + "telecom", + "universal" + ], + "community_id": "17b2b4b8-660e-4ab8-96c0-c51a0dc3163d", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.269526+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "f41cf617-6ad0-4df5-a07a-a4daeb449697", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "Carvana's so easy, just a click and we've got ourselves a car. See? So many cars. That's a click-tastic inventory. And check out the financing options. Payments to fit our budget. I mean, that's... Clickonomics 101. Delivery to our door. Just a hop, skip, and a click away. And... bought. No better feeling than when everything just... clicks. Buy your car today on... Carvana. Delivery fees may apply.", + "intro_variants": [ + "Carvana's so easy, just a click and we've got ourselves a car. See? So many cars. That's a click-tastic inventory. And check out the financing options. Payments to fit our budget. I mean, that's... Clickonomics 101. Delivery to our door. Just a hop, skip, and a click away. And... bought." + ], + "outro_variants": [ + "our door. Just a hop, skip, and a click away. And... bought. No better feeling than when everything just... clicks. Buy your car today on... Carvana. Delivery fees may apply." + ], + "avg_duration": null, + "sponsor": "Carvana", + "sponsor_aliases": [], + "sponsor_tags": [ + "auto", + "universal" + ], + "community_id": "f41cf617-6ad0-4df5-a07a-a4daeb449697", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.293654+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "a934bf9a-ebcc-4d64-b734-b82a6de7f25a", + "version": 1, + "data": { + "scope": "podcast", + "text_template": " just sold my car online. Let's go, Grandpa. Wait, you did? Yep, on Carvana. Just put in the license plate, answered a few questions, got an offer in minutes. Easier than setting up that new digital picture frame. You don't say. Yeah, they're even picking it up tomorrow. Talk about fast. Wow, way to go. So, about that picture frame. Ah, forget about it. Until Carvana makes one, I'm not interested.", + "intro_variants": [ + "I just sold my car online. Let's go, Grandpa. Wait, you did? Yep, on Carvana. Just put in the license plate, answered a few questions, got an offer in minutes. Easier than setting up that new digital picture frame. You don't say. Yeah, they're even picking it up tomorrow." + ], + "outro_variants": [ + "Yeah, they're even picking it up tomorrow. Talk about fast. Wow, way to go. So, about that picture frame. Ah, forget about it. Until Carvana makes one, I'm not interested." + ], + "avg_duration": null, + "sponsor": "Carvana", + "sponsor_aliases": [], + "sponsor_tags": [ + "auto", + "universal" + ], + "community_id": "a934bf9a-ebcc-4d64-b734-b82a6de7f25a", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.295723+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "b77d02bc-5ccb-465d-8ecc-c0f3c53efa3d", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "We all prefer things a certain way, like groceries. If you want groceries just how you like them, you gotta try Instacart. They have a new preference picker that lets you pick how ripe or unripe you want your bananas. Shoppers can see your preferences up front, helping guide their choices. 
Because when it comes to groceries, the details matter. Instacart. Get groceries just how you like.", + "intro_variants": [ + "We all prefer things a certain way, like groceries. If you want groceries just how you like them, you gotta try Instacart. They have a new preference picker that lets you pick how ripe or unripe you want your bananas. Shoppers can see your preferences up front, helping guide their" + ], + "outro_variants": [ + "want your bananas. Shoppers can see your preferences up front, helping guide their choices. Because when it comes to groceries, the details matter. Instacart. Get groceries just how you like." + ], + "avg_duration": null, + "sponsor": "Instacart", + "sponsor_aliases": [], + "sponsor_tags": [ + "food", + "universal" + ], + "community_id": "b77d02bc-5ccb-465d-8ecc-c0f3c53efa3d", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.339888+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "b052ed12-557d-4cdb-b24a-28d547dbd9bd", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "brought to you by Squarespace. Squarespace makes building and managing your website ridiculously easy. They give you everything you need to showcase what you do and get paid all in one place. And with cutting-edge design tools, anyone can create a custom site that truly fits their brand. Head to squarespace.com slash rogan for a free trial. And when you're ready... Ready to launch? Use the offer code ROGAN to save 10% off your first purchase of a website or domain.", + "intro_variants": [ + "brought to you by Squarespace. Squarespace makes building and managing your website ridiculously easy. They give you everything you need to showcase what you do and get paid all in one place. And with cutting-edge design tools, anyone can create a custom site that truly fits their brand. Head to" + ], + "outro_variants": [ + "slash for a free trial. And when you're ready... Ready to launch? Use the offer code to save 10% off your first purchase of a website or domain." + ], + "avg_duration": null, + "sponsor": "Squarespace", + "sponsor_aliases": [ + "Square Space" + ], + "sponsor_tags": [ + "tech", + "saas", + "dtc", + "universal" + ], + "community_id": "b052ed12-557d-4cdb-b24a-28d547dbd9bd", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.382004+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "c114bd7f-39af-4100-826d-e9ed1346d76f", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "brought to you by SimpliSafe. The world can be scary sometimes. I mean, take the news for example. It feels like we hear about something outrageous happening Every week. Now, more than ever, it's more important to have a security system for your safety and peace of mind, and SimpliSafe is one of the best options out there, partly because of how proactive it is. It can help stop and prevent crime in real time. AI-powered cameras can detect suspicious activity and alert security agents who can immediately take action. They can speak to intruders, warn them away, flashlights and sirens and and Dispatch Police. With how great a job it does, no long-term contracts and no hidden fees, it's easy to see why SimpliSafe continues to be named best home security systems by U.S. News and World Report. Try it out right now. My listeners can get 50% off a SimpliSafe home security system at simplisafe.com slash . That's 50% off at simplisafe.com slash . 
There's no safe.", + "intro_variants": [ + "brought to you by SimpliSafe. The world can be scary sometimes. I mean, take the news for example. It feels like we hear about something outrageous happening Every week. Now, more than ever, it's more important to have a security system for your safety and peace of mind, and SimpliSafe" + ], + "outro_variants": [ + "Try it out right now. My listeners can get 50% off a SimpliSafe home security system at simplisafe.com slash . That's 50% off at simplisafe.com slash . There's no safe." + ], + "avg_duration": null, + "sponsor": "SimpliSafe", + "sponsor_aliases": [ + "Simpli Safe" + ], + "sponsor_tags": [ + "home_security", + "universal" + ], + "community_id": "c114bd7f-39af-4100-826d-e9ed1346d76f", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.401485+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "44026a0f-9d7a-4f39-815e-5b12575b8107", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "brought to you by Capital One. Capital One's tech team isn't just talking about multi-agentic AI. They already deployed one. It's called Chat Concierge and it's simplifying car shopping. Using self-reflection and layered reasoning with live API checks, it doesn't just help buyers find a car they love. It helps schedule a test drive, get pre-approved for financing, and estimate trade-in value. Advanced, intuitive, and deployed. That's how they stack. That's technology at Capital One.", + "intro_variants": [ + "brought to you by Capital One. Capital One's tech team isn't just talking about multi-agentic AI. They already deployed one. It's called Chat Concierge and it's simplifying car shopping. Using self-reflection and layered reasoning with live API checks, it doesn't just help buyers find a car they love. It helps" + ], + "outro_variants": [ + "" + ], + "avg_duration": null, + "sponsor": "Capital One", + "sponsor_aliases": [], + "sponsor_tags": [ + "finance", + "universal" + ], + "community_id": "44026a0f-9d7a-4f39-815e-5b12575b8107", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.481669+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + }, + { + "community_id": "9e83a5f6-097b-4378-8077-b9ccec49b389", + "version": 1, + "data": { + "scope": "podcast", + "text_template": "This is a Monday.com ad. The same Monday.com helping people worldwide getting work done faster and better. The same Monday.com designed for every team and every industry. The same Monday.com with built-in AI, scaling your work from day one. The same Monday.com that your team will actually love using. The same Monday.com with an easy and intuitive setup. Go to Monday.com and try it for free. Yes. TheSameMonday.com", + "intro_variants": [ + "This is a Monday.com ad. The same Monday.com helping people worldwide getting work done faster and better. The same Monday.com designed for every team and every industry. The same Monday.com with built-in AI, scaling your work from day one. The same Monday.com that your team will actually love using. The" + ], + "outro_variants": [ + "one. The same Monday.com that your team will actually love using. The same Monday.com with an easy and intuitive setup. Go to Monday.com and try it for free. Yes. 
TheSameMonday.com" + ], + "avg_duration": null, + "sponsor": "Monday.com", + "sponsor_aliases": [ + "Monday" + ], + "sponsor_tags": [ + "tech", + "saas" + ], + "community_id": "9e83a5f6-097b-4378-8077-b9ccec49b389", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.533543+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" + } + } + ] +} diff --git a/patterns/community/instacart-b77d02bc.json b/patterns/community/instacart-b77d02bc.json new file mode 100644 index 00000000..a71cc729 --- /dev/null +++ b/patterns/community/instacart-b77d02bc.json @@ -0,0 +1,22 @@ +{ + "scope": "podcast", + "text_template": "We all prefer things a certain way, like groceries. If you want groceries just how you like them, you gotta try Instacart. They have a new preference picker that lets you pick how ripe or unripe you want your bananas. Shoppers can see your preferences up front, helping guide their choices. Because when it comes to groceries, the details matter. Instacart. Get groceries just how you like.", + "intro_variants": [ + "We all prefer things a certain way, like groceries. If you want groceries just how you like them, you gotta try Instacart. They have a new preference picker that lets you pick how ripe or unripe you want your bananas. Shoppers can see your preferences up front, helping guide their" + ], + "outro_variants": [ + "want your bananas. Shoppers can see your preferences up front, helping guide their choices. Because when it comes to groceries, the details matter. Instacart. Get groceries just how you like." + ], + "avg_duration": null, + "sponsor": "Instacart", + "sponsor_aliases": [], + "sponsor_tags": [ + "food", + "universal" + ], + "community_id": "b77d02bc-5ccb-465d-8ecc-c0f3c53efa3d", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.339888+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/kayak-3f2f65ff.json b/patterns/community/kayak-3f2f65ff.json new file mode 100644 index 00000000..1683c572 --- /dev/null +++ b/patterns/community/kayak-3f2f65ff.json @@ -0,0 +1,22 @@ +{ + "scope": "podcast", + "text_template": "Kayak gets my flight, hotel, and rental car right, so I can tune out travel advice that's just plain wrong. Bro, Skycoin. way better than points never fly during a Scorpio full moon just tell the manager you'll sue instant room upgrade stop taking bad travel advice start comparing hundreds of sites with Kayak and get your trip right Kayak got that right", + "intro_variants": [ + "Kayak gets my flight, hotel, and rental car right, so I can tune out travel advice that's just plain wrong." + ], + "outro_variants": [ + "way better than points never fly during a Scorpio full moon just tell the manager you'll sue instant room upgrade stop taking bad travel advice start comparing hundreds of sites with Kayak and get your trip right Kayak got that right" + ], + "avg_duration": null, + "sponsor": "Kayak", + "sponsor_aliases": [], + "sponsor_tags": [ + "travel", + "universal" + ], + "community_id": "3f2f65ff-c654-42ca-a724-d6dce1f45881", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.173447+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/mint-mobile-17b2b4b8.json b/patterns/community/mint-mobile-17b2b4b8.json new file mode 100644 index 00000000..e0666f7d --- /dev/null +++ b/patterns/community/mint-mobile-17b2b4b8.json @@ -0,0 +1,24 @@ +{ + "scope": "podcast", + "text_template": "Ryan Reynolds here from Mint Mobile. 
I don't know if you knew this, but anyone can get the same premium wireless for $15 a month plan that I've been enjoying. It's not just for celebrities. So do like I did and have one of your assistant's assistants switch you to Mint Mobile today. I'm told it's super easy to do at mintmobile.com slash switch. Upfront payment of $45 for three-month plan equivalent to $15 per month required. Intro rate first three months only, then full price plan options available. Taxes and fees extra. See full terms at mintmobile.com.", + "intro_variants": [ + "Ryan Reynolds here from Mint Mobile. I don't know if you knew this, but anyone can get the same premium wireless for $15 a month plan that I've been enjoying. It's not just for celebrities. So do like I did and have one of your assistant's assistants switch you to" + ], + "outro_variants": [ + "for three-month plan equivalent to $15 per month required. Intro rate first three months only, then full price plan options available. Taxes and fees extra. See full terms at mintmobile.com." + ], + "avg_duration": null, + "sponsor": "Mint Mobile", + "sponsor_aliases": [ + "MintMobile" + ], + "sponsor_tags": [ + "telecom", + "universal" + ], + "community_id": "17b2b4b8-660e-4ab8-96c0-c51a0dc3163d", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.269526+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/monday-com-9e83a5f6.json b/patterns/community/monday-com-9e83a5f6.json new file mode 100644 index 00000000..620e47d3 --- /dev/null +++ b/patterns/community/monday-com-9e83a5f6.json @@ -0,0 +1,24 @@ +{ + "scope": "podcast", + "text_template": "This is a Monday.com ad. The same Monday.com helping people worldwide getting work done faster and better. The same Monday.com designed for every team and every industry. The same Monday.com with built-in AI, scaling your work from day one. The same Monday.com that your team will actually love using. The same Monday.com with an easy and intuitive setup. Go to Monday.com and try it for free. Yes. TheSameMonday.com", + "intro_variants": [ + "This is a Monday.com ad. The same Monday.com helping people worldwide getting work done faster and better. The same Monday.com designed for every team and every industry. The same Monday.com with built-in AI, scaling your work from day one. The same Monday.com that your team will actually love using. The" + ], + "outro_variants": [ + "one. The same Monday.com that your team will actually love using. The same Monday.com with an easy and intuitive setup. Go to Monday.com and try it for free. Yes. TheSameMonday.com" + ], + "avg_duration": null, + "sponsor": "Monday.com", + "sponsor_aliases": [ + "Monday" + ], + "sponsor_tags": [ + "tech", + "saas" + ], + "community_id": "9e83a5f6-097b-4378-8077-b9ccec49b389", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.533543+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/progressive-1c07273d.json b/patterns/community/progressive-1c07273d.json new file mode 100644 index 00000000..1da526a0 --- /dev/null +++ b/patterns/community/progressive-1c07273d.json @@ -0,0 +1,22 @@ +{ + "scope": "podcast", + "text_template": "\brought to you by Progressive Insurance. Do you ever find yourself playing the budgeting game? Well, with the Name Your Price tool from Progressive, you can find options that fit your budget and potentially lower your bills. Try it at Progressive.com. Progressive casualty insurance company and affiliates. 
Price and coverage match limited by state law. Not available in all states.", + "intro_variants": [ + "brought to you by Progressive Insurance. Do you ever find yourself playing the budgeting game?" + ], + "outro_variants": [ + "Price and coverage match limited by state law. Not available in all states." + ], + "avg_duration": null, + "sponsor": "Progressive", + "sponsor_aliases": [], + "sponsor_tags": [ + "insurance", + "universal" + ], + "community_id": "1c07273d-4e1c-41fd-93dd-0455904ada87", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.266967+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/simplisafe-c114bd7f.json b/patterns/community/simplisafe-c114bd7f.json new file mode 100644 index 00000000..23ce1a12 --- /dev/null +++ b/patterns/community/simplisafe-c114bd7f.json @@ -0,0 +1,24 @@ +{ + "scope": "podcast", + "text_template": "brought to you by SimpliSafe. The world can be scary sometimes. I mean, take the news for example. It feels like we hear about something outrageous happening Every week. Now, more than ever, it's more important to have a security system for your safety and peace of mind, and SimpliSafe is one of the best options out there, partly because of how proactive it is. It can help stop and prevent crime in real time. AI-powered cameras can detect suspicious activity and alert security agents who can immediately take action. They can speak to intruders, warn them away, flashlights and sirens and and Dispatch Police. With how great a job it does, no long-term contracts and no hidden fees, it's easy to see why SimpliSafe continues to be named best home security systems by U.S. News and World Report. Try it out right now. My listeners can get 50% off a SimpliSafe home security system at simplisafe.com slash . That's 50% off at simplisafe.com slash . There's no safe.", + "intro_variants": [ + "brought to you by SimpliSafe. The world can be scary sometimes. I mean, take the news for example. It feels like we hear about something outrageous happening Every week. Now, more than ever, it's more important to have a security system for your safety and peace of mind, and SimpliSafe" + ], + "outro_variants": [ + "Try it out right now. My listeners can get 50% off a SimpliSafe home security system at simplisafe.com slash . That's 50% off at simplisafe.com slash . There's no safe." + ], + "avg_duration": null, + "sponsor": "SimpliSafe", + "sponsor_aliases": [ + "Simpli Safe" + ], + "sponsor_tags": [ + "home_security", + "universal" + ], + "community_id": "c114bd7f-39af-4100-826d-e9ed1346d76f", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.401485+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/squarespace-b052ed12.json b/patterns/community/squarespace-b052ed12.json new file mode 100644 index 00000000..5aefb8ee --- /dev/null +++ b/patterns/community/squarespace-b052ed12.json @@ -0,0 +1,26 @@ +{ + "scope": "podcast", + "text_template": "brought to you by Squarespace. Squarespace makes building and managing your website ridiculously easy. They give you everything you need to showcase what you do and get paid all in one place. And with cutting-edge design tools, anyone can create a custom site that truly fits their brand. Head to squarespace.com slash rogan for a free trial. And when you're ready... Ready to launch? 
Use the offer code ROGAN to save 10% off your first purchase of a website or domain.", + "intro_variants": [ + "brought to you by Squarespace. Squarespace makes building and managing your website ridiculously easy. They give you everything you need to showcase what you do and get paid all in one place. And with cutting-edge design tools, anyone can create a custom site that truly fits their brand. Head to" + ], + "outro_variants": [ + "slash for a free trial. And when you're ready... Ready to launch? Use the offer code to save 10% off your first purchase of a website or domain." + ], + "avg_duration": null, + "sponsor": "Squarespace", + "sponsor_aliases": [ + "Square Space" + ], + "sponsor_tags": [ + "tech", + "saas", + "dtc", + "universal" + ], + "community_id": "b052ed12-557d-4cdb-b24a-28d547dbd9bd", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.382004+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/threatlocker-6b1b16df.json b/patterns/community/threatlocker-6b1b16df.json new file mode 100644 index 00000000..c28caad9 --- /dev/null +++ b/patterns/community/threatlocker-6b1b16df.json @@ -0,0 +1,23 @@ +{ + "scope": "global", + "text_template": "Zerotrust is clearly the future as threats get faster, quieter, and harder to detect. But implementing it shouldn't disrupt the business. ThreatLocker enforces default deny and execution in a way that remains enterprise ready, scalable, and operationally clean. Unknown software is stopped cold, trusted apps stay contained, and drift is locked down across the environment. It's zero trust that works in real enterprises and prepares you for the threats ahead. See why CISOs are adopting it at .com forward slash ThreatLocker.", + "intro_variants": [ + "Zerotrust is clearly the future as threats get faster, quieter, and harder to detect. But implementing it shouldn't disrupt the business. ThreatLocker enforces default deny and execution in a way that" + ], + "outro_variants": [ + "see why CISOs are adopting it at .com forward slash ThreatLocker." + ], + "avg_duration": null, + "sponsor": "ThreatLocker", + "sponsor_aliases": [], + "sponsor_tags": [ + "tech", + "security", + "saas" + ], + "community_id": "6b1b16df-70f4-4cd2-9e3c-18d2559b3195", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.123354+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/community/zyn-3c348177.json b/patterns/community/zyn-3c348177.json new file mode 100644 index 00000000..7748c40f --- /dev/null +++ b/patterns/community/zyn-3c348177.json @@ -0,0 +1,21 @@ +{ + "scope": "podcast", + "text_template": "What are you reaching for? If you're a smoker or vaper, you could be reaching for so much more with Zyn Nicotine Pouches. When you reach for Zyn, you're reaching for 10 satisfying varieties and two strengths, for a smoke-free experience that lets you lean in, for chances to break free from your routine, and a unique nationwide community. Whatever you're reaching for, reach for it with America's number one nicotine pouch brand. Find your Zyn wherever nicotine products are sold near you. This product contains nicotine. Nicotine is an addictive chemical. 
", + "intro_variants": [], + "outro_variants": [], + "avg_duration": null, + "sponsor": "Zyn", + "sponsor_aliases": [ + "ZYN", + "Zinn" + ], + "sponsor_tags": [ + "nicotine", + "universal" + ], + "community_id": "3c348177-fdf5-4f4f-a3de-25f4e65bf591", + "version": 1, + "submitted_at": "2026-05-15T13:44:56.098726+00:00", + "submitted_app_version": "unknown", + "sponsor_match": "exact" +} diff --git a/patterns/vocabulary.json b/patterns/vocabulary.json new file mode 100644 index 00000000..eea4b148 --- /dev/null +++ b/patterns/vocabulary.json @@ -0,0 +1,257 @@ +{ + "vocabulary_version": 1, + "description": "Canonical tag list for MinusPod community patterns. Authoritative source is the VALID_TAGS constant + tag_vocabulary.csv in app code. Regenerate this file when the source changes.", + "special_tags": [ + { + "tag": "universal", + "description": "Sponsor advertises broadly across all podcast genres. Bypasses tag-overlap matching." + } + ], + "podcast_genres": [ + { + "tag": "arts", + "description": "Arts, design, fashion, performing/visual arts" + }, + { + "tag": "books", + "description": "Books, audiobook-targeted shows" + }, + { + "tag": "business", + "description": "Business, careers, entrepreneurship, investing, management, marketing" + }, + { + "tag": "comedy", + "description": "Comedy, stand-up, improv" + }, + { + "tag": "education", + "description": "General education, courses, how-to" + }, + { + "tag": "language_learning", + "description": "Language learning specifically (Babbel/Rosetta target)" + }, + { + "tag": "self_improvement", + "description": "Self-improvement, productivity, life advice" + }, + { + "tag": "fiction", + "description": "Narrative fiction, audio drama" + }, + { + "tag": "history", + "description": "History, historical analysis" + }, + { + "tag": "health", + "description": "Health & fitness, alternative health, medicine, nutrition, sexuality" + }, + { + "tag": "mental_health", + "description": "Mental health, meditation, mindfulness" + }, + { + "tag": "kids_family", + "description": "Kids, family, parenting, pets" + }, + { + "tag": "leisure", + "description": "Leisure, crafts, hobbies, home & garden" + }, + { + "tag": "gaming", + "description": "Video games, board games, gaming culture" + }, + { + "tag": "automotive", + "description": "Cars, car culture, auto enthusiast" + }, + { + "tag": "music", + "description": "Music, music commentary, music interviews" + }, + { + "tag": "news", + "description": "General news, daily news, news commentary, business/tech news" + }, + { + "tag": "politics", + "description": "Politics, political commentary, government" + }, + { + "tag": "religion", + "description": "Religion, spirituality, faith" + }, + { + "tag": "science", + "description": "Science, astronomy, physics, chemistry, life sciences" + }, + { + "tag": "society_culture", + "description": "Society & culture, documentary, philosophy, relationships" + }, + { + "tag": "travel", + "description": "Places & travel" + }, + { + "tag": "sports", + "description": "Sports, all sports subcategories" + }, + { + "tag": "technology", + "description": "Technology shows specifically" + }, + { + "tag": "true_crime", + "description": "True crime" + }, + { + "tag": "tv_film", + "description": "TV & film, after shows, film reviews" + } + ], + "sponsor_industries": [ + { + "tag": "tech", + "description": "Software, apps, digital tools (broad)" + }, + { + "tag": "saas", + "description": "Subscription B2B/B2C software" + }, + { + "tag": "vpn", + "description": "VPN and privacy networking" + }, 
+ { + "tag": "security", + "description": "Password managers, identity protection, antivirus" + }, + { + "tag": "finance", + "description": "Banking, investing, lending, credit" + }, + { + "tag": "insurance", + "description": "Insurance products (auto, home, life, etc.)" + }, + { + "tag": "food", + "description": "Restaurants, grocery, delivery" + }, + { + "tag": "meal_kit", + "description": "Meal kit delivery (HelloFresh, Factor, etc.)" + }, + { + "tag": "beverage", + "description": "Drinks, alcohol, energy, hydration" + }, + { + "tag": "supplements", + "description": "Supplements, vitamins, nutrition powders" + }, + { + "tag": "apparel", + "description": "Clothing, shoes, accessories" + }, + { + "tag": "home_goods", + "description": "Furniture, decor, home goods (non-mattress)" + }, + { + "tag": "mattress", + "description": "Mattresses, bedding (Casper, Helix, etc.)" + }, + { + "tag": "home_security", + "description": "Alarms, cameras, smart locks" + }, + { + "tag": "personal_care", + "description": "Grooming, skincare, haircare, hygiene" + }, + { + "tag": "auto", + "description": "Cars, car buying, auto services" + }, + { + "tag": "telecom", + "description": "Mobile carriers, internet providers" + }, + { + "tag": "jobs", + "description": "Recruitment, hiring platforms" + }, + { + "tag": "streaming", + "description": "Streaming services, audiobooks (Audible, Netflix, etc.)" + }, + { + "tag": "gambling", + "description": "Sportsbooks, casinos, fantasy sports" + }, + { + "tag": "nicotine", + "description": "Tobacco, vape, nicotine pouches" + }, + { + "tag": "dtc", + "description": "Direct-to-consumer brand (cross-cutting tag)" + } + ], + "all_tags": [ + "apparel", + "arts", + "auto", + "automotive", + "beverage", + "books", + "business", + "comedy", + "dtc", + "education", + "fiction", + "finance", + "food", + "gambling", + "gaming", + "health", + "history", + "home_goods", + "home_security", + "insurance", + "jobs", + "kids_family", + "language_learning", + "leisure", + "mattress", + "meal_kit", + "mental_health", + "music", + "news", + "nicotine", + "personal_care", + "politics", + "religion", + "saas", + "science", + "security", + "self_improvement", + "society_culture", + "sports", + "streaming", + "supplements", + "tech", + "technology", + "telecom", + "travel", + "true_crime", + "tv_film", + "universal", + "vpn" + ] +} \ No newline at end of file diff --git a/src/ad_detector.py b/src/ad_detector.py index 11b08709..3be31b5a 100644 --- a/src/ad_detector.py +++ b/src/ad_detector.py @@ -1812,6 +1812,7 @@ def process_transcript(self, segments: List[Dict], podcast_name: str = "Unknown" podcast_id: str = None, network_id: str = None, skip_patterns: bool = False, podcast_description: str = None, + podcast_tags: Optional[set] = None, progress_callback=None, audio_analysis=None, cancel_event=None) -> Dict: @@ -1928,7 +1929,8 @@ def process_transcript(self, segments: List[Dict], podcast_name: str = "Unknown" text_matches = self.text_pattern_matcher.find_matches( segments, podcast_id=podcast_id, - network_id=network_id + network_id=network_id, + podcast_tags=podcast_tags, ) tp_added = 0 diff --git a/src/api/__init__.py b/src/api/__init__.py index 34072b6b..ec383a31 100644 --- a/src/api/__init__.py +++ b/src/api/__init__.py @@ -353,4 +353,4 @@ def _find_similar_pattern(db, pattern_data: dict) -> Optional[dict]: # Import all sub-modules to trigger route registration -from api import feeds, episodes, history, settings, system, patterns, sponsors, status, auth, search, podcast_search, stats, 
providers
+from api import feeds, episodes, history, settings, system, patterns, sponsors, status, auth, search, podcast_search, stats, providers, tags
diff --git a/src/api/episodes.py b/src/api/episodes.py
index b3a22880..61169af6 100644
--- a/src/api/episodes.py
+++ b/src/api/episodes.py
@@ -829,9 +829,17 @@ def retry_ad_detection(slug, episode_id):
         from ad_detector import AdDetector
         ad_detector = AdDetector()
         try:
+            # Load podcast tags for community-pattern eligibility.
+            try:
+                _pod_row = db.get_podcast_by_slug(slug)
+                _tags_json = _pod_row.get('tags') if _pod_row else None
+                podcast_tags = set(json.loads(_tags_json)) if _tags_json else None
+            except Exception:
+                podcast_tags = None
             ad_result = ad_detector.process_transcript(
                 segments, podcast_name, episode.get('title', 'Unknown'), slug, episode_id,
-                podcast_id=slug  # Pass slug as podcast_id for pattern matching
+                podcast_id=slug,  # Pass slug as podcast_id for pattern matching
+                podcast_tags=podcast_tags,
             )
         finally:
             token_totals = get_episode_token_totals()
diff --git a/src/api/feeds.py b/src/api/feeds.py
index 2b979998..20804a5e 100644
--- a/src/api/feeds.py
+++ b/src/api/feeds.py
@@ -650,3 +650,44 @@ def get_artwork(slug):
     response.headers['X-Content-Type-Options'] = 'nosniff'
     response.headers['Content-Security-Policy'] = "default-src 'none'"
     return response
+
+
+# ========== Tag endpoints ==========
+
+@api.route('/feeds/<slug>/tags', methods=['GET'])
+@log_request
+def get_feed_tags(slug):
+    """Return the source breakdown of a podcast's tags.
+
+    Output: {effective: [...], rss: [...], episode: [...], user: [...]}
+    """
+    db = get_database()
+    if not db.get_podcast_by_slug(slug):
+        return error_response('Feed not found', 404)
+    return json_response(db.get_podcast_tags(slug))
+
+
+@api.route('/feeds/<slug>/tags', methods=['PUT'])
+@log_request
+def update_feed_tags(slug):
+    """Update a podcast's user-added tags. Body: {user_tags: ['tag1', ...]}.
+
+    Validates each tag against VALID_TAGS. The denormalized `tags` field
+    on the row is rewritten as the union of (existing rss + new user + episode tags).
+    """
+    from utils.community_tags import valid_tags
+    db = get_database()
+    if not db.get_podcast_by_slug(slug):
+        return error_response('Feed not found', 404)
+    data = request.get_json() or {}
+    user_tags = data.get('user_tags')
+    if not isinstance(user_tags, list):
+        return error_response('user_tags must be a list of strings', 400)
+
+    vt = valid_tags()
+    bad = [t for t in user_tags if t not in vt]
+    if bad:
+        return error_response(f'unknown tags: {", ".join(bad)}', 400)
+
+    db.set_podcast_tags(slug, user_tags=user_tags)
+    return json_response(db.get_podcast_tags(slug))
diff --git a/src/api/patterns.py b/src/api/patterns.py
index 1904aefa..13c497d8 100644
--- a/src/api/patterns.py
+++ b/src/api/patterns.py
@@ -7,7 +7,7 @@
 from utils.time import utc_now_iso, parse_iso_datetime
 from sponsor_normalize import get_or_create_known_sponsor
-from flask import request
+from flask import Response, request
 from api import (
     api, limiter, log_request, json_response, error_response,
@@ -24,19 +24,29 @@
 @api.route('/patterns', methods=['GET'])
 @log_request
 def list_patterns():
-    """List all ad patterns with optional filtering."""
+    """List all ad patterns with optional filtering.
+ + Query params: + scope, podcast_id, network_id, active (bool, default true), + source (one of 'local', 'community', 'imported') + """ + from utils.community_tags import PATTERN_SOURCES db = get_database() scope = request.args.get('scope') podcast_id = request.args.get('podcast_id') network_id = request.args.get('network_id') active_only = request.args.get('active', 'true').lower() == 'true' + source = request.args.get('source') + if source and source not in PATTERN_SOURCES: + source = None # ignore garbage values rather than 400; preserves prior behavior patterns = db.get_ad_patterns( scope=scope, podcast_id=podcast_id, network_id=network_id, - active_only=active_only + active_only=active_only, + source=source, ) return json_response({'patterns': patterns}) @@ -287,6 +297,11 @@ def update_pattern(pattern_id): updates['sponsor_id'] = None if updates: + # Auto-protect community patterns from being clobbered by the next + # auto-sync when the user edits them in the UI. + from utils.community_tags import PATTERN_SOURCE_COMMUNITY, PATTERN_SOURCE_LOCAL + if (pattern.get('source') or PATTERN_SOURCE_LOCAL) == PATTERN_SOURCE_COMMUNITY: + updates.setdefault('protected_from_sync', 1) db.update_ad_pattern(pattern_id, **updates) return json_response({'message': 'Pattern updated'}) @@ -688,6 +703,39 @@ def submit_correction(slug, episode_id): pattern_service = PatternService(db) pattern_service.record_pattern_match(pattern_id, episode_id) logger.info(f"Recorded adjustment as confirmation for pattern {pattern_id}") + + # Reviewer-trim auto-update: when the reviewer narrows the bounds + # by at least `min_trim_threshold` seconds AND settings allow it, + # rewrite the pattern's text_template/variants from the new bounds. + # Community patterns are never auto-rewritten (handled in + # pattern_service.rewrite_pattern_from_bounds). + try: + narrowed = ( + adjusted_start >= original_start + and adjusted_end <= original_end + ) + trim_seconds = ( + (adjusted_start - original_start) + (original_end - adjusted_end) + ) + enabled = db.get_setting_bool( + 'update_patterns_from_reviewer_adjustments', default=True + ) + threshold = db.get_setting_float( + 'min_trim_threshold', default=20.0 + ) + if enabled and narrowed and trim_seconds >= threshold and transcript: + rewritten = pattern_service.rewrite_pattern_from_bounds( + pattern_id, transcript, + original_start, original_end, + adjusted_start, adjusted_end, + ) + if rewritten: + logger.info( + f"Pattern {pattern_id} auto-trimmed by {trim_seconds:.1f}s " + f"(threshold={threshold:.1f}s)" + ) + except Exception as e: + logger.warning(f"Reviewer-trim auto-update failed for pattern {pattern_id}: {e}") elif adjusted_text and len(adjusted_text) >= 50: # No pattern exists - create one from adjusted boundaries (like confirm does) podcast = db.get_podcast_by_slug(slug) @@ -747,15 +795,26 @@ def export_patterns(): Query params: - include_disabled: Include disabled patterns (default: false) - include_corrections: Include correction history (default: false) + - ids: Optional comma-separated pattern ids. If set, only those rows + are exported (intersected with the include_disabled filter). 
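+
+    A minimal client-side sketch of the ids filter (the URL is illustrative;
+    adjust host and prefix to wherever this blueprint is mounted, and the
+    ids are made up):
+
+        import requests
+        resp = requests.get(
+            'http://localhost:5000/api/patterns/export',
+            params={'ids': '12,34', 'include_disabled': 'true'},
+        )
+        bundle = resp.json()  # {'version': '1.0', ...}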
""" db = get_database() include_disabled = request.args.get('include_disabled', 'false').lower() == 'true' include_corrections = request.args.get('include_corrections', 'false').lower() == 'true' + ids_param = request.args.get('ids') # Get patterns patterns = db.get_ad_patterns(active_only=not include_disabled) + if ids_param: + try: + wanted = {int(x) for x in ids_param.split(',') if x.strip()} + except ValueError: + return error_response('ids must be a comma-separated list of integers', 400) + if wanted: + patterns = [p for p in patterns if int(p['id']) in wanted] + # Build export data export_data = { 'version': '1.0', @@ -999,3 +1058,194 @@ def backfill_false_positive_texts(): 'updated': updated, 'skipped': skipped }) + + +# ========== Bulk operations + community ========== + +def _resolve_bulk_target(db, data: dict, active_only_for_source: bool): + """Shared validation for bulk-delete + bulk-disable. + + Returns (ids, error_response). ids is None when error_response is set. + All user-supplied fields are coerced to their expected types before + being reflected in any response or used in a SQL query. + """ + from utils.community_tags import PATTERN_SOURCES + if not data.get('confirm'): + return None, error_response('confirm: true is required', 400) + try: + expected = int(data['expected_count']) + except (KeyError, TypeError, ValueError): + return None, error_response('expected_count must be an integer', 400) + + raw_ids = data.get('ids') + source = data.get('source') + if raw_ids is not None and not isinstance(raw_ids, list): + return None, error_response('ids must be a list of integers', 400) + if not raw_ids and source not in PATTERN_SOURCES: + return None, error_response('Provide either ids or a valid source', 400) + + if raw_ids: + try: + ids = [int(x) for x in raw_ids] + except (TypeError, ValueError): + return None, error_response('ids must contain only integers', 400) + else: + rows = db.get_patterns_by_source(source, active_only=active_only_for_source) + ids = [int(r['id']) for r in rows] + + if len(ids) != expected: + return None, error_response( + f'expected_count mismatch (expected {expected}, matched {len(ids)})', + 400, + ) + return ids, None + + +@api.route('/patterns/bulk-delete', methods=['POST']) +@log_request +def bulk_delete_patterns(): + """Hard-delete patterns. Body: {ids?, source?, confirm: true, expected_count: N}. + + Either `ids` or `source` must be provided. `expected_count` MUST match + the actual number of matched rows or the call is rejected with 400 — + this is the fat-finger guard from the plan. + """ + db = get_database() + ids, err = _resolve_bulk_target(db, request.get_json() or {}, active_only_for_source=False) + if err is not None: + return err + deleted = db.bulk_delete_patterns(ids) + return json_response({'deleted': deleted, 'ids': ids}) + + +@api.route('/patterns/bulk-disable', methods=['POST']) +@log_request +def bulk_disable_patterns(): + """Mark patterns is_active=0. Same shape and guards as bulk-delete.""" + db = get_database() + ids, err = _resolve_bulk_target(db, request.get_json() or {}, active_only_for_source=True) + if err is not None: + return err + disabled = db.bulk_disable_patterns(ids) + return json_response({'disabled': disabled, 'ids': ids}) + + +def _coerce_int_ids(raw) -> list: + """Coerce request body `ids` to a list of ints. 
Drops bad entries."""
+    if not isinstance(raw, list):
+        return []
+    out = []
+    for v in raw:
+        try:
+            out.append(int(v))
+        except (TypeError, ValueError):
+            continue
+    return out
+
+
+@api.route('/patterns/preview-export', methods=['POST'])
+@log_request
+def preview_export_to_community():
+    """Dry-run: report which of these pattern ids would pass quality gates.
+
+    Body: ``{"ids": [int, ...]}``. Returns
+    ``{ready, rejected, ready_count, rejected_count}`` where ``rejected``
+    is ``[{id, sponsor, reasons:[str]}]``.
+    """
+    from community_export import build_bundle
+    body = request.get_json(silent=True) or {}
+    ids = _coerce_int_ids(body.get('ids'))
+    if not ids:
+        return error_response('ids required (non-empty list of integers)', 400)
+    db = get_database()
+    bundle, rejected = build_bundle(ids, db)
+    rejected_id_set = {r['id'] for r in rejected}
+    ready_ids = [i for i in ids if i not in rejected_id_set]
+    return json_response({
+        'ready': ready_ids,
+        'rejected': rejected,
+        'ready_count': len(ready_ids),
+        'rejected_count': len(rejected),
+        'pattern_count': bundle['pattern_count'],
+    })
+
+
+@api.route('/patterns/submit-bundle', methods=['POST'])
+@log_request
+def submit_bundle_to_community():
+    """Build a single-file community submission bundle for download.
+
+    Body: ``{"ids": [int, ...]}``. Returns ``application/json`` with a
+    ``Content-Disposition: attachment`` header so the browser downloads
+    the file. The bundle is the artifact the user commits into
+    ``patterns/community/`` to open one PR for all selected patterns.
+    """
+    from community_export import build_bundle
+    body = request.get_json(silent=True) or {}
+    ids = _coerce_int_ids(body.get('ids'))
+    if not ids:
+        return error_response('ids required (non-empty list of integers)', 400)
+    db = get_database()
+    bundle, rejected = build_bundle(ids, db)
+    if bundle['pattern_count'] == 0:
+        return error_response({
+            'message': 'No patterns passed quality gates',
+            'rejected': rejected,
+        }, 400)
+    first_cid = bundle['patterns'][0]['community_id']
+    filename = f'minuspod-submission-{first_cid.split("-")[0]}.json'
+    body_text = json.dumps(bundle, indent=2, ensure_ascii=False)
+    return Response(
+        body_text,
+        mimetype='application/json',
+        headers={
+            'Content-Disposition': f'attachment; filename="{filename}"',
+            'X-Bundle-Rejected-Count': str(len(rejected)),
+            'X-Bundle-Pattern-Count': str(bundle['pattern_count']),
+        },
+    )
+
+
+@api.route('/patterns/<int:pattern_id>/submit-to-community', methods=['POST'])
+@log_request
+def submit_pattern_to_community(pattern_id: int):
+    """Run the community export pipeline for a single local pattern.
+
+    Returns the JSON payload + a prefilled GitHub PR URL. When the encoded
+    URL would exceed the GitHub limit (`too_large=True`), the frontend
+    falls back to offering the payload as a downloadable file.
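+
+    A sketch of the happy-path result shape (filename and sponsor_match
+    values are illustrative; the keys are the ones run_export_pipeline
+    returns):
+
+        {
+            'payload': {...},
+            'filename': 'mint-mobile-17b2b4b8.json',
+            'pr_url': 'https://github.com/.../new/main/patterns/community?...',
+            'too_large': False,
+            'sponsor_match': 'exact',
+        }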
+    """
+    from community_export import run_export_pipeline, ExportError
+    db = get_database()
+    try:
+        result = run_export_pipeline(pattern_id, db)
+    except ExportError as e:
+        return error_response({'message': 'Export failed', 'reasons': e.reasons}, 400)
+    return json_response(result)
+
+
+@api.route('/patterns/<int:pattern_id>/protect', methods=['POST'])
+@log_request
+def protect_pattern(pattern_id: int):
+    """Set protected_from_sync=1 on a community pattern."""
+    from utils.community_tags import PATTERN_SOURCE_COMMUNITY, PATTERN_SOURCE_LOCAL
+    db = get_database()
+    pattern = db.get_ad_pattern_by_id(pattern_id)
+    if not pattern:
+        return error_response('pattern not found', 404)
+    if (pattern.get('source') or PATTERN_SOURCE_LOCAL) != PATTERN_SOURCE_COMMUNITY:
+        return error_response('only community patterns can be protected', 400)
+    db.set_pattern_protected(pattern_id, True)
+    return json_response({'pattern_id': pattern_id, 'protected_from_sync': 1})
+
+
+@api.route('/patterns/<int:pattern_id>/protect', methods=['DELETE'])
+@log_request
+def unprotect_pattern(pattern_id: int):
+    """Set protected_from_sync=0 on a community pattern."""
+    db = get_database()
+    pattern = db.get_ad_pattern_by_id(pattern_id)
+    if not pattern:
+        return error_response('pattern not found', 404)
+    db.set_pattern_protected(pattern_id, False)
+    return json_response({'pattern_id': pattern_id, 'protected_from_sync': 0})
diff --git a/src/api/settings.py b/src/api/settings.py
index a0fd142d..f0eb1e14 100644
--- a/src/api/settings.py
+++ b/src/api/settings.py
@@ -1156,3 +1156,137 @@ def test_webhook(webhook_id):
         'success': False,
         'message': 'webhook test failed; see server logs for details',
     })
+
+
+# ========== Ad Reviewer settings ==========
+
+@api.route('/settings/reviewer', methods=['GET'])
+@log_request
+def get_reviewer_settings():
+    """Return the ad-reviewer auto-update settings."""
+    db = get_database()
+    return json_response({
+        'updatePatternsFromReviewerAdjustments': db.get_setting_bool(
+            'update_patterns_from_reviewer_adjustments', default=True
+        ),
+        'minTrimThreshold': db.get_setting_float('min_trim_threshold', default=20.0),
+    })
+
+
+@api.route('/settings/reviewer', methods=['PUT'])
+@log_request
+def update_reviewer_settings():
+    """Update the ad-reviewer auto-update settings.
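+    Both keys are optional; a key that is absent leaves the stored
+    setting untouched.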
+
+    Body: {updatePatternsFromReviewerAdjustments: bool, minTrimThreshold: float}
+    """
+    db = get_database()
+    data = request.get_json() or {}
+    if 'updatePatternsFromReviewerAdjustments' in data:
+        v = bool(data['updatePatternsFromReviewerAdjustments'])
+        db.set_setting('update_patterns_from_reviewer_adjustments', 'true' if v else 'false')
+    if 'minTrimThreshold' in data:
+        try:
+            v = float(data['minTrimThreshold'])
+        except (TypeError, ValueError):
+            return error_response('minTrimThreshold must be a number', 400)
+        if v <= 0 or v > 120:
+            return error_response('minTrimThreshold must be greater than 0 and at most 120', 400)
+        db.set_setting('min_trim_threshold', str(v))
+    return get_reviewer_settings()
+
+
+# ========== Community-pattern sync settings ==========
+
+@api.route('/settings/community-sync', methods=['GET'])
+@log_request
+def get_community_sync_settings():
+    """Return the community-pattern sync settings."""
+    from community_sync import DEFAULT_CRON
+    db = get_database()
+    return json_response({
+        'enabled': db.get_setting_bool('community_sync_enabled', default=False),
+        'cron': db.get_setting('community_sync_cron') or DEFAULT_CRON,
+        'lastRun': db.get_setting('community_sync_last_run') or None,
+        'lastError': db.get_setting('community_sync_last_error') or None,
+        'manifestVersion': db.get_setting('community_sync_manifest_version') or None,
+        'lastSummary': db.get_setting('community_sync_last_summary') or None,
+    })
+
+
+@api.route('/settings/community-sync', methods=['PUT'])
+@log_request
+def update_community_sync_settings():
+    """Update community-pattern sync settings.
+
+    Body: {enabled?: bool, cron?: str}. Cron expression is validated.
+    """
+    from utils.cron import is_valid_expression
+    db = get_database()
+    data = request.get_json() or {}
+    if 'enabled' in data:
+        db.set_setting('community_sync_enabled', 'true' if bool(data['enabled']) else 'false')
+    if 'cron' in data:
+        cron = (data['cron'] or '').strip()
+        if not is_valid_expression(cron):
+            return error_response(f'invalid cron expression: {cron}', 400)
+        db.set_setting('community_sync_cron', cron)
+    return get_community_sync_settings()
+
+
+# ========== Community-pattern sync triggers ==========
+
+@api.route('/community-patterns/sync', methods=['POST'])
+@limiter.limit('6/hour')
+@log_request
+def trigger_community_pattern_sync():
+    """Force a sync now. Rate-limited to 6 calls per hour.
+
+    A 404 from the upstream manifest URL is expected when the repo hasn't
+    published `patterns/community/index.json` to its default branch yet
+    (e.g. the feature is still on a feature branch). Surface that as a
+    soft 200 with ``status: no_manifest_yet`` rather than a 502, since
+    the local instance is healthy and there's nothing the user can do.
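+
+    The soft-200 body in that case is:
+
+        {"status": "no_manifest_yet",
+         "message": "Upstream has not published a manifest at this URL yet."}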
+    """
+    import requests
+    from community_sync import sync_now
+    db = get_database()
+    try:
+        summary = sync_now(db)
+    except requests.HTTPError as e:
+        resp = e.response
+        if resp is not None and resp.status_code == 404:
+            return json_response({
+                'status': 'no_manifest_yet',
+                'message': 'Upstream has not published a manifest at this URL yet.',
+            })
+        return error_response({'message': 'Sync failed', 'reason': str(e)}, 502)
+    except Exception as e:
+        return error_response({'message': 'Sync failed', 'reason': str(e)}, 502)
+    return json_response(summary)
+
+
+@api.route('/community-patterns/sync-status', methods=['GET'])
+@log_request
+def community_pattern_sync_status():
+    """Return last-sync metadata."""
+    return get_community_sync_settings()
+
+
+@api.route('/community-patterns/all', methods=['DELETE'])
+@log_request
+def delete_all_community_patterns():
+    """Hard-delete every community pattern on this instance.
+
+    Body must include ``{"confirm": true}`` as a fat-finger guard, matching
+    the ``/patterns/bulk-delete`` convention. The UI provides the confirm
+    step; this endpoint enforces it for any direct API caller too.
+    """
+    payload = request.get_json(silent=True) or {}
+    if payload.get('confirm') is not True:
+        return error_response({
+            'message': 'confirm: true required to purge all community patterns'
+        }, 400)
+    db = get_database()
+    deleted = db.delete_all_community_patterns()
+    return json_response({'deleted': deleted})
diff --git a/src/api/sponsors.py b/src/api/sponsors.py
index 74c62cd6..371e63d5 100644
--- a/src/api/sponsors.py
+++ b/src/api/sponsors.py
@@ -18,11 +18,15 @@
 @api.route('/sponsors', methods=['GET'])
 @log_request
 def list_sponsors():
-    """List all known sponsors."""
+    """List all known sponsors. Optional ?tag= filter."""
     service = get_sponsor_service()
     include_inactive = request.args.get('include_inactive', 'false').lower() == 'true'
+    tag = request.args.get('tag')
-    sponsors = service.db.get_known_sponsors(active_only=not include_inactive)
+    if tag:
+        sponsors = service.db.get_sponsors_by_tag(tag, active_only=not include_inactive)
+    else:
+        sponsors = service.db.get_known_sponsors(active_only=not include_inactive)
     # Parse JSON fields
     result = []
@@ -40,11 +44,41 @@
                 sponsor_data['common_ctas'] = json.loads(sponsor_data['common_ctas'])
             except json.JSONDecodeError:
                 sponsor_data['common_ctas'] = []
+        # Parse tags from JSON string
+        if isinstance(sponsor_data.get('tags'), str):
+            try:
+                sponsor_data['tags'] = json.loads(sponsor_data['tags'])
+            except json.JSONDecodeError:
+                sponsor_data['tags'] = []
         result.append(sponsor_data)
     return json_response({'sponsors': result})
+
+
+@api.route('/sponsors/<int:sponsor_id>/tags', methods=['PUT'])
+@log_request
+def update_sponsor_tags(sponsor_id):
+    """Update a sponsor's tags. Body: {tags: ['tag1', ...]}.
+
+    Each tag is validated against VALID_TAGS (which includes 'universal').
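+
+    Example body, built from tags in the shipped vocabulary:
+
+        {"tags": ["insurance", "universal"]}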
+ """ + from utils.community_tags import valid_tags + service = get_sponsor_service() + sponsor = service.db.get_known_sponsor_by_id(sponsor_id) + if not sponsor: + return error_response('Sponsor not found', 404) + data = request.get_json() or {} + tags = data.get('tags') + if not isinstance(tags, list): + return error_response('tags must be a list of strings', 400) + vt = valid_tags() + bad = [t for t in tags if t not in vt] + if bad: + return error_response(f'unknown tags: {", ".join(bad)}', 400) + service.db.update_known_sponsor(sponsor_id, tags=tags) + return json_response({'sponsor_id': sponsor_id, 'tags': tags}) + + @api.route('/sponsors', methods=['POST']) @log_request def add_sponsor(): @@ -265,3 +299,6 @@ def delete_normalization(norm_id): if success: return json_response({'message': 'Normalization deleted'}) return error_response('Normalization not found', 404) + + +# Tag vocabulary moved to src/api/tags.py diff --git a/src/api/tags.py b/src/api/tags.py new file mode 100644 index 00000000..59230561 --- /dev/null +++ b/src/api/tags.py @@ -0,0 +1,16 @@ +"""Tag routes: /tags/* endpoints for the community-pattern tag vocabulary.""" +import logging + +from api import api, log_request, json_response +from utils.community_tags import vocabulary_payload + +logger = logging.getLogger('podcast.api') + + +@api.route('/tags/vocabulary', methods=['GET']) +@log_request +def get_tag_vocabulary(): + """Return the canonical 49-tag vocabulary (cached) so the frontend can + render the grouped tag picker without re-parsing the seed CSV. + """ + return json_response(vocabulary_payload()) diff --git a/src/audio_fingerprinter.py b/src/audio_fingerprinter.py index 58b9af9c..e741b21d 100644 --- a/src/audio_fingerprinter.py +++ b/src/audio_fingerprinter.py @@ -41,6 +41,13 @@ # Step size for sliding window (seconds) SLIDING_STEP_SIZE = 2.0 +# Cap for the per-window slow scan when the full-file fast path fails. +# When fpcalc can't decode the audio end-to-end, the per-window scan uses +# the same fpcalc binary on each window and almost always produces zero +# new matches — the only realistic save is a single bad frame midway. 90s +# is enough to catch that case without burning the 10-minute upper bound. +FALLBACK_SLOW_TIMEOUT = 90 + @dataclass class FingerprintMatch: @@ -503,7 +510,14 @@ def find_matches( else: logger.warning("Full-file fingerprint failed, falling back to per-window scan") - # Slow fallback: per-window subprocess scanning + # Slow fallback: per-window subprocess scanning. + # Cap separately at FALLBACK_SLOW_TIMEOUT (much shorter than the + # full-file timeout). When the fast path fails because fpcalc can't + # decode the audio source, the per-window scan uses the same fpcalc + # and almost always produces zero new matches — burning the full + # 10-minute budget is wasted work. 90s is enough to catch the rare + # case where the failure was a single bad frame. + slow_timeout = min(timeout, FALLBACK_SLOW_TIMEOUT) scan_start_time = time.time() last_log_time = scan_start_time position = 0.0 @@ -512,7 +526,7 @@ def find_matches( elapsed = now - scan_start_time # Timeout check - if elapsed > timeout: + if elapsed > slow_timeout: logger.warning( f"Fingerprint scan timed out after {elapsed:.0f}s " f"at {position:.1f}s/{total_duration:.1f}s with {len(matches)} matches" diff --git a/src/community_export.py b/src/community_export.py new file mode 100644 index 00000000..17a775af --- /dev/null +++ b/src/community_export.py @@ -0,0 +1,434 @@ +"""Community pattern export pipeline. 
+ +Given a local pattern's id, runs quality gates, PII stripping, metadata +stripping, sponsor classification, and produces a JSON document suitable +for submission to the MinusPod patterns/community/ directory in the +upstream GitHub repository. + +Returns a structured dict the API layer can hand to the frontend. + +Pipeline (mirrors the plan, section 7): + +1. Quality gates +2. Tag validation +3. PII strip (consumer emails by domain whitelist; phone numbers, keep + toll-free, strip all else) +4. Metadata strip +5. Sponsor name classification (exact / alias / fuzzy / unknown) +6. Generate fresh fields (community_id, version=1, submitted_at, + submitted_app_version) +7. JSON output +8. Prefilled GitHub PR URL with a 7KB fallback to file-download +""" +from __future__ import annotations + +import json +import logging +import re +import urllib.parse +import uuid +from datetime import datetime, timezone +from typing import Dict, List, Optional, Tuple + +from utils.community_tags import ( + BUNDLE_FORMAT, + BUNDLE_VERSION, + CONSUMER_EMAIL_DOMAINS, + EMAIL_REGEX, + GITHUB_REPO, + PHONE_REGEX, + is_tollfree, + valid_tags, +) + +logger = logging.getLogger('podcast.community_export') + +MIN_TEXT_LEN = 50 +MAX_TEXT_LEN = 3500 +MAX_DURATION_SECONDS = 120 +URL_LENGTH_LIMIT_BYTES = 7 * 1024 # 7 KB + +PR_URL_TEMPLATE = ( + 'https://github.com/{repo}/new/main/patterns/community' + '?filename={filename}&value={value}' +) + + +class ExportError(Exception): + """Raised when a pattern fails the export pipeline.""" + + def __init__(self, reasons: List[str]): + super().__init__('; '.join(reasons)) + self.reasons = reasons + + +def _slugify(name: str) -> str: + """Lowercase, hyphenated, ASCII-safe slug.""" + s = re.sub(r'[^a-z0-9]+', '-', name.lower()) + return s.strip('-') or 'sponsor' + + +def _strip_emails(text: str) -> str: + """Strip consumer-domain emails from a text body. Returns the cleaned text. + + Business addresses (anything not in CONSUMER_EMAIL_DOMAINS) are kept. + """ + def _sub(m: re.Match) -> str: + domain = m.group(2).lower() + if domain in CONSUMER_EMAIL_DOMAINS: + return '[email]' + return m.group(0) + + return EMAIL_REGEX.sub(_sub, text) + + +def _strip_phones(text: str) -> str: + """Strip non-toll-free phone numbers from a text body.""" + def _sub(m: re.Match) -> str: + phone = m.group(0) + return phone if is_tollfree(phone) else '[phone]' + + return PHONE_REGEX.sub(_sub, text) + + +def strip_pii(text: str) -> str: + """Apply email + phone PII stripping in order.""" + if not text: + return text + return _strip_phones(_strip_emails(text)) + + +def _normalize_aliases(value) -> List[str]: + """Accept aliases as either a list (CSV-derived seed) or a JSON string + (DB rows). Return a plain list of non-empty strings.""" + if isinstance(value, list): + return [str(a) for a in value if a] + if isinstance(value, str): + try: + parsed = json.loads(value) + except (TypeError, ValueError): + return [] + if isinstance(parsed, list): + return [str(a) for a in parsed if a] + return [] + + +def find_foreign_sponsors( + text: str, + declared_names_lower: set, + sponsors: List[Dict], + *, + require_active: bool = False, +) -> List[str]: + """Return canonical names of sponsors whose name or any alias appears + in `text` on a word boundary, excluding any row whose own name/alias + matches `declared_names_lower`. Lookups are case-insensitive. + + `declared_names_lower` should already contain every lowercased name + you consider "us" -- the doc's declared sponsor plus its declared + aliases. 
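+    For the Mint Mobile pattern above, for instance, that set would be
+    {'mint mobile', 'mintmobile'}.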
The helper also skips any seed row that shares an + alias with the declared sponsor, so passing the canonical name alone + is enough when the seed list is authoritative. + """ + if not text: + return [] + text_l = text.lower() + foreign: List[str] = [] + for s in sponsors: + if require_active and not s.get('is_active'): + continue + name = s.get('name') or '' + name_l = name.lower() + if not name_l: + continue + candidates = [name_l] + [a.lower() for a in _normalize_aliases(s.get('aliases'))] + if any(c in declared_names_lower for c in candidates): + continue + for c in candidates: + if re.search(rf'\b{re.escape(c)}\b', text_l): + foreign.append(name) + break + return foreign + + +def _quality_gates(pattern: Dict, sponsors: List[Dict]) -> List[str]: + """Run quality gates. Returns a list of failure reasons (empty = pass).""" + reasons: List[str] = [] + text = pattern.get('text_template') or '' + + if len(text) < MIN_TEXT_LEN: + reasons.append(f'text_template too short ({len(text)} < {MIN_TEXT_LEN})') + if len(text) > MAX_TEXT_LEN: + reasons.append(f'text_template too long ({len(text)} > {MAX_TEXT_LEN})') + + duration = pattern.get('avg_duration') or 0 + if duration and duration > MAX_DURATION_SECONDS: + reasons.append(f'avg_duration too long ({duration:.0f}s > {MAX_DURATION_SECONDS}s)') + + if (pattern.get('confirmation_count') or 0) < 1: + reasons.append('confirmation_count must be >= 1') + + fp = pattern.get('false_positive_count') or 0 + cc = pattern.get('confirmation_count') or 0 + if fp > cc: + reasons.append(f'false_positive_count ({fp}) > confirmation_count ({cc})') + + sponsor_id = pattern.get('sponsor_id') + if not sponsor_id: + reasons.append('sponsor_id is required') + return reasons + + sponsor_row = next((s for s in sponsors if s['id'] == sponsor_id), None) + if not sponsor_row: + reasons.append('sponsor not found') + return reasons + + declared_aliases = _normalize_aliases(sponsor_row.get('aliases')) + sponsor_names = [sponsor_row['name']] + declared_aliases + + text_lower = text.lower() + name_present = any( + re.search(rf'\b{re.escape(n.lower())}\b', text_lower) + for n in sponsor_names if n + ) + if not name_present: + reasons.append('sponsor name (or any alias) does not appear in text_template') + + declared_lower = {n.lower() for n in sponsor_names if n} + foreign = find_foreign_sponsors(text, declared_lower, sponsors, require_active=True) + if foreign: + reasons.append(f'foreign sponsor names appear in text: {", ".join(foreign[:3])}') + + return reasons + + +def _validate_tags(pattern: Dict, sponsor_row: Dict) -> List[str]: + """Reject any tag not in VALID_TAGS.""" + bad: List[str] = [] + vt = valid_tags() + try: + tags = json.loads(sponsor_row.get('tags') or '[]') + except (TypeError, ValueError): + tags = [] + for t in tags or []: + if t not in vt: + bad.append(t) + return [f'unknown tag: {t}' for t in bad] + + +def _classify_sponsor(sponsor_name: str, sponsors: List[Dict]) -> str: + """Classify how the sponsor maps to the seed list: exact|alias|fuzzy|unknown.""" + if not sponsor_name: + return 'unknown' + lname = sponsor_name.lower() + for s in sponsors: + if s.get('name', '').lower() == lname: + return 'exact' + try: + aliases = json.loads(s.get('aliases') or '[]') + except (TypeError, ValueError): + aliases = [] + for a in aliases or []: + if a.lower() == lname: + return 'alias' + # Cheap fuzzy: substring match in either direction. 
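+    # e.g. a declared "Mint" with no exact or alias hit still matches the
+    # seed's "Mint Mobile" as a substring in one direction, so 'fuzzy'.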
+ for s in sponsors: + nm = s.get('name', '').lower() + if nm and (nm in lname or lname in nm): + return 'fuzzy' + return 'unknown' + + +def _safe_parse_variants(value) -> List[str]: + """Decode the JSON-encoded intro/outro_variants column into a list[str]. + + Older code paths in `text_pattern_matcher.py` pre-encoded the list with + `json.dumps` before passing it to `Database.create_ad_pattern`, which + `json.dumps`'d it again -- so existing rows in production hold a + double-encoded value. `json.loads` once on that returns a string, + which the prior comprehension then exploded character by character. + Detect that case by trying a second decode when the first returned + a string. Idempotent on correctly-encoded rows. + """ + if not value: + return [] + try: + parsed = json.loads(value) + except (TypeError, ValueError): + return [] + if isinstance(parsed, str): + try: + parsed = json.loads(parsed) + except (TypeError, ValueError): + return [] + if not isinstance(parsed, list): + return [] + return [v for v in parsed if isinstance(v, str)] + + +def _strip_metadata(pattern: Dict, sponsor_row: Dict) -> Dict: + """Build the export payload, omitting fields the plan lists as stripped.""" + intro_variants = _safe_parse_variants(pattern.get('intro_variants')) + outro_variants = _safe_parse_variants(pattern.get('outro_variants')) + try: + sponsor_tags = json.loads(sponsor_row.get('tags') or '[]') + except (TypeError, ValueError): + sponsor_tags = [] + try: + sponsor_aliases = json.loads(sponsor_row.get('aliases') or '[]') + except (TypeError, ValueError): + sponsor_aliases = [] + + text_template = strip_pii(pattern.get('text_template') or '') + intro_variants = [strip_pii(v) for v in intro_variants] + outro_variants = [strip_pii(v) for v in outro_variants] + + return { + 'scope': pattern.get('scope') or 'global', + 'text_template': text_template, + 'intro_variants': intro_variants, + 'outro_variants': outro_variants, + 'avg_duration': pattern.get('avg_duration'), + 'sponsor': sponsor_row.get('name'), + 'sponsor_aliases': sponsor_aliases, + 'sponsor_tags': sponsor_tags, + } + + +def _app_version() -> str: + try: + from version import __version__ + return __version__ + except Exception: + return 'unknown' + + +def build_export_payload(pattern: Dict, sponsors: List[Dict]) -> Dict: + """Run the full pipeline and return the JSON payload + sponsor classification.""" + sponsor_id = pattern.get('sponsor_id') + sponsor_row = next((s for s in sponsors if s['id'] == sponsor_id), None) + + failures = _quality_gates(pattern, sponsors) + if sponsor_row: + failures.extend(_validate_tags(pattern, sponsor_row)) + if failures: + raise ExportError(failures) + + payload = _strip_metadata(pattern, sponsor_row) + + sponsor_match = _classify_sponsor(payload['sponsor'], sponsors) + + payload.update({ + 'community_id': str(uuid.uuid4()), + 'version': 1, + 'submitted_at': datetime.now(timezone.utc).isoformat(), + 'submitted_app_version': _app_version(), + 'sponsor_match': sponsor_match, + }) + return payload + + +def build_pr_url(payload: Dict) -> Tuple[str, str, bool]: + """Build the prefilled GitHub PR URL for this payload. + + Returns (url, filename, too_large). When `too_large` is True the URL is + still returned but it should NOT be opened — the caller should offer the + JSON file as a download instead. 
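+
+    Filenames follow the convention visible under patterns/community/:
+    a "Mint Mobile" payload whose community_id starts with 17b2b4b8
+    slugifies to mint-mobile-17b2b4b8.json.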
+ """ + sponsor_slug = _slugify(payload.get('sponsor') or 'sponsor') + short_uuid = payload['community_id'].split('-')[0] + filename = f'{sponsor_slug}-{short_uuid}.json' + body = json.dumps(payload, indent=2, ensure_ascii=False) + encoded = urllib.parse.quote(body, safe='') + url = PR_URL_TEMPLATE.format( + repo=GITHUB_REPO, + filename=urllib.parse.quote(filename, safe=''), + value=encoded, + ) + too_large = len(url.encode('utf-8')) > URL_LENGTH_LIMIT_BYTES + return url, filename, too_large + + +def _sponsor_name_for(pattern: Dict, sponsors: List[Dict]) -> Optional[str]: + sponsor_id = pattern.get('sponsor_id') + if sponsor_id is None: + return None + row = next((s for s in sponsors if s['id'] == sponsor_id), None) + return row.get('name') if row else None + + +def build_bundle(pattern_ids: List[int], db) -> Tuple[Dict, List[Dict]]: + """Run the export pipeline on each id and produce one bundle JSON. + + Returns (bundle_payload, rejected). `rejected` is a list of + `{id, sponsor, reasons:[str]}` for patterns that failed pre-flight. + The bundle is the artifact that gets committed under + `patterns/community/` and consumed by the PR validator. + """ + sponsors = db.get_known_sponsors(active_only=False) + patterns_by_id = db.get_ad_patterns_by_ids(pattern_ids) + ready: List[Dict] = [] + rejected: List[Dict] = [] + for pid in pattern_ids: + pattern = patterns_by_id.get(pid) + if not pattern: + rejected.append({'id': pid, 'sponsor': None, 'reasons': ['pattern not found']}) + continue + if (pattern.get('source') or 'local') != 'local': + rejected.append({ + 'id': pid, + 'sponsor': _sponsor_name_for(pattern, sponsors), + 'reasons': [f"pattern source is '{pattern.get('source')}', only 'local' can be submitted"], + }) + continue + try: + payload = build_export_payload(pattern, sponsors) + except ExportError as e: + rejected.append({ + 'id': pid, + 'sponsor': _sponsor_name_for(pattern, sponsors), + 'reasons': list(e.reasons), + }) + continue + ready.append(payload) + bundle = { + 'format': BUNDLE_FORMAT, + 'bundle_version': BUNDLE_VERSION, + 'submitted_at': datetime.now(timezone.utc).isoformat(), + 'submitted_app_version': _app_version(), + 'pattern_count': len(ready), + 'patterns': ready, + } + return bundle, rejected + + +def run_export_pipeline(pattern_id: int, db) -> Dict: + """End-to-end: load pattern + sponsors, run pipeline, return result dict. + + Result dict shape: + { + 'payload': , + 'filename': '-.json', + 'pr_url': '', + 'too_large': bool, + 'sponsor_match': 'exact'|'alias'|'fuzzy'|'unknown', + } + + Raises ExportError on quality / tag failures (callers convert to 400). + """ + pattern = db.get_ad_pattern_by_id(pattern_id) + if not pattern: + raise ExportError([f'pattern {pattern_id} not found']) + if (pattern.get('source') or 'local') != 'local': + raise ExportError([f"pattern source is '{pattern.get('source')}', only 'local' can be submitted"]) + + sponsors = db.get_known_sponsors(active_only=False) + payload = build_export_payload(pattern, sponsors) + url, filename, too_large = build_pr_url(payload) + return { + 'payload': payload, + 'filename': filename, + 'pr_url': url, + 'too_large': too_large, + 'sponsor_match': payload['sponsor_match'], + } diff --git a/src/community_sync.py b/src/community_sync.py new file mode 100644 index 00000000..46a759c9 --- /dev/null +++ b/src/community_sync.py @@ -0,0 +1,235 @@ +"""Community pattern auto-pull / sync. + +Fetches https://raw.githubusercontent.com/ttlequals0/MinusPod/main/patterns/community/index.json +on a configurable cron schedule. 
Applies INSERT / UPDATE / DELETE semantics +against ad_patterns rows tagged `source='community'`, respecting the +`protected_from_sync` flag. + +Settings keys (in the `settings` table): + + - community_sync_enabled (bool, default false) + - community_sync_cron (str cron expression, default '0 3 * * 0') + - community_sync_last_run, community_sync_last_error, + community_sync_manifest_version, community_sync_last_summary +""" +from __future__ import annotations + +import json +import logging +from datetime import datetime, timezone +from typing import Any, Dict, Optional + +import requests + +from utils.community_tags import COMMUNITY_MANIFEST_URL, VOCABULARY_VERSION +from utils.cron import is_due +from utils.time import parse_iso_datetime, utc_now_iso + +logger = logging.getLogger('podcast.community_sync') +HTTP_TIMEOUT = 20 +DEFAULT_CRON = '0 3 * * 0' # Sunday 3am UTC + + +def _utc_now() -> datetime: + return datetime.now(timezone.utc) + + +def _parse_iso(value: Optional[str]) -> Optional[datetime]: + if not value: + return None + try: + dt = parse_iso_datetime(value) + except (TypeError, ValueError): + return None + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + + +def _fetch_manifest(url: str = COMMUNITY_MANIFEST_URL) -> Dict[str, Any]: + """Fetch the manifest. Raises requests.RequestException on failure.""" + resp = requests.get(url, timeout=HTTP_TIMEOUT) + resp.raise_for_status() + return resp.json() + + +def _validate_manifest(manifest: Dict[str, Any]) -> None: + if not isinstance(manifest, dict): + raise ValueError('manifest is not a JSON object') + if 'manifest_version' not in manifest: + raise ValueError('manifest_version missing') + if 'patterns' not in manifest or not isinstance(manifest['patterns'], list): + raise ValueError('patterns array missing') + + +def apply_manifest(db, manifest: Dict[str, Any]) -> Dict[str, int]: + """Apply manifest entries against ad_patterns. Returns summary counts. + + Semantics (plan section 9): + - new community_id -> INSERT (source=community, protected_from_sync=0) + - existing community_id, higher version -> UPDATE unless protected_from_sync=1 + - community_id missing from manifest -> DELETE unless protected_from_sync=1 + """ + from pattern_service import PatternService + + pattern_service = PatternService(db) + inserts = updates = deletes = skips = errors = 0 + + # Pre-collect manifest community_ids so we can batch the existence lookup + # instead of running one SELECT per entry. + incoming_ids: set = set() + valid_entries = [] + for entry in manifest['patterns']: + if not isinstance(entry, dict): + errors += 1 + continue + community_id = entry.get('community_id') + data = entry.get('data') + if not community_id or not data: + errors += 1 + continue + incoming_ids.add(community_id) + valid_entries.append((community_id, data, entry.get('version', 1))) + + existing_by_cid = db.find_patterns_by_community_ids(list(incoming_ids)) + + for community_id, data, manifest_version in valid_entries: + existing = existing_by_cid.get(community_id) + # Stamp version from manifest entry. The manifest's per-entry + # version is authoritative — overwrite anything carried in the + # inner `data` dict so version-gating in import_community_pattern + # compares the manifest's number, not the payload's stale one. 
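+        # Worked example: a manifest entry at version=3 against an
+        # unprotected stored row at version=2 takes the UPDATE branch
+        # below; 3 against 3 (or lower) counts as a skip.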
+ data_with_version = dict(data) + data_with_version['community_id'] = community_id + data_with_version['version'] = manifest_version + try: + if existing is None: + pattern_service.import_community_pattern(data_with_version) + inserts += 1 + else: + if existing.get('protected_from_sync'): + skips += 1 + continue + if int(data_with_version['version']) > int(existing.get('version') or 1): + pattern_service.import_community_pattern(data_with_version) + updates += 1 + else: + skips += 1 + except Exception as e: + errors += 1 + logger.warning(f"community_sync: failed to apply {community_id}: {e}") + + # Reconcile deletes: existing community patterns absent from the manifest + community_rows = db.get_patterns_by_source('community', active_only=False) + for row in community_rows: + cid = row.get('community_id') + if not cid: + continue + if cid in incoming_ids: + continue + if row.get('protected_from_sync'): + skips += 1 + continue + try: + db.delete_ad_pattern(row['id']) + deletes += 1 + except Exception as e: + errors += 1 + logger.warning(f"community_sync: failed to delete {cid}: {e}") + + return { + 'inserted': inserts, + 'updated': updates, + 'deleted': deletes, + 'skipped': skips, + 'errors': errors, + } + + +def sync_now(db, manifest_url: str = COMMUNITY_MANIFEST_URL) -> Dict[str, Any]: + """Force a sync regardless of schedule. Returns a summary dict. + + On any failure the function raises so the caller can surface the error + to the user. The settings table is updated either way to record the + attempt timestamp / last error. + """ + started_at = utc_now_iso() + db.set_setting('community_sync_last_run', started_at) + + try: + manifest = _fetch_manifest(manifest_url) + _validate_manifest(manifest) + except requests.HTTPError as e: + # 404 = upstream hasn't published a manifest yet (e.g. the feature + # branch hasn't been merged to main). Treat as a non-issue; log at + # info-level so the every-15-min tick doesn't spam WARN. + status = e.response.status_code if e.response is not None else None + msg = f'{status} fetching manifest' if status else str(e) + db.set_setting('community_sync_last_error', msg) + if status == 404: + logger.info( + f'community_sync: no manifest at {manifest_url} (404). ' + f'Either upstream has not published one yet or sync is ' + f'pointed at the wrong URL.' + ) + else: + logger.warning(f'community_sync: manifest fetch failed: {msg}') + raise + except Exception as e: + msg = str(e) + db.set_setting('community_sync_last_error', msg) + logger.warning(f'community_sync: manifest fetch/validate failed: {msg}') + raise + + summary = apply_manifest(db, manifest) + summary['manifest_version'] = manifest.get('manifest_version') + summary['fetched_at'] = started_at + + # Compare the manifest's vocabulary_version against the value this app + # was built with. A mismatch means the upstream patterns may carry tags + # the local validator doesn't know about — surface a warning so the + # operator knows their image is behind. The vocabulary itself stays + # baked into the app code, so this is informational only. 
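+    # e.g. a manifest carrying vocabulary_version=2 while this build has
+    # VOCABULARY_VERSION=1 attaches `vocabulary_warning` to the summary.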
+ manifest_vocab = manifest.get('vocabulary_version') + summary['vocabulary_version'] = manifest_vocab + if manifest_vocab is not None: + try: + if int(manifest_vocab) > VOCABULARY_VERSION: + warning = ( + f'manifest vocabulary_version={manifest_vocab} is newer ' + f'than this app (vocab={VOCABULARY_VERSION}); upgrade ' + f'to pick up new tags' + ) + logger.warning(f'community_sync: {warning}') + summary['vocabulary_warning'] = warning + except (TypeError, ValueError): + logger.warning( + f'community_sync: manifest vocabulary_version is not an int: ' + f'{manifest_vocab!r}' + ) + + db.set_setting('community_sync_last_error', '') + db.set_setting('community_sync_manifest_version', str(manifest.get('manifest_version'))) + db.set_setting('community_sync_last_summary', json.dumps(summary)) + logger.info(f'community_sync: {summary}') + return summary + + +def community_pattern_sync_tick(db, force: bool = False) -> Optional[Dict[str, Any]]: + """Run sync if due (or forced). Returns the summary dict, or None if skipped.""" + enabled = db.get_setting_bool('community_sync_enabled', default=False) + if not enabled and not force: + return None + + cron = db.get_setting('community_sync_cron') or DEFAULT_CRON + last_run = _parse_iso(db.get_setting('community_sync_last_run')) + now = _utc_now() + + if not force and last_run is not None and not is_due(cron, last_run, now): + return None + + try: + return sync_now(db) + except Exception: + # sync_now already logged + stamped settings. + return None diff --git a/src/database/episodes.py b/src/database/episodes.py index a3d3e9f7..04a3b0d1 100644 --- a/src/database/episodes.py +++ b/src/database/episodes.py @@ -206,6 +206,11 @@ def upsert_episode(self, slug: str, episode_id: str, **kwargs) -> int: 'published_at', 'episode_number'): fields.append(f"{key} = ?") values.append(value) + elif key == 'tags': + fields.append("tags = ?") + values.append( + json.dumps(value) if isinstance(value, list) else value + ) if fields: fields.append("updated_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now')") @@ -673,19 +678,21 @@ def bulk_upsert_discovered_episodes(self, slug: str, episodes: List[Dict]) -> in ) continue # Skip - episode already exists with different GUID + tags_json = json.dumps(ep.get('tags') or []) try: cursor = conn.execute( """INSERT INTO episodes (podcast_id, episode_id, original_url, title, description, - artwork_url, episode_number, published_at, status) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'discovered') + artwork_url, episode_number, published_at, tags, status) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'discovered') ON CONFLICT(podcast_id, episode_id) DO UPDATE SET episode_number = COALESCE(excluded.episode_number, episodes.episode_number), published_at = COALESCE(excluded.published_at, episodes.published_at), original_url = COALESCE(episodes.original_url, excluded.original_url), title = CASE WHEN COALESCE(episodes.title, '') = '' THEN excluded.title ELSE episodes.title END, description = CASE WHEN COALESCE(episodes.description, '') = '' THEN excluded.description ELSE episodes.description END, - artwork_url = COALESCE(episodes.artwork_url, excluded.artwork_url)""", + artwork_url = COALESCE(episodes.artwork_url, excluded.artwork_url), + tags = CASE WHEN COALESCE(episodes.tags, '[]') = '[]' THEN excluded.tags ELSE episodes.tags END""", ( podcast_id, ep['id'], @@ -695,6 +702,7 @@ def bulk_upsert_discovered_episodes(self, slug: str, episodes: List[Dict]) -> in ep.get('artwork_url'), ep.get('episode_number'), iso_published, + tags_json, ) ) if cursor.rowcount > 0: diff 
--git a/src/database/patterns.py b/src/database/patterns.py index f5f61d5f..12da0cd0 100644 --- a/src/database/patterns.py +++ b/src/database/patterns.py @@ -10,7 +10,8 @@ class PatternMixin: """Ad pattern and correction management methods.""" def get_ad_patterns(self, scope: str = None, podcast_id: str = None, - network_id: str = None, active_only: bool = True) -> List[Dict]: + network_id: str = None, active_only: bool = True, + source: str = None) -> List[Dict]: """Get ad patterns with optional filtering. Includes podcast_name when available.""" conn = self.get_connection() @@ -37,12 +38,30 @@ def get_ad_patterns(self, scope: str = None, podcast_id: str = None, if network_id: query += " AND ap.network_id = ?" params.append(network_id) + if source: + query += " AND ap.source = ?" + params.append(source) query += " ORDER BY ap.created_at DESC" cursor = conn.execute(query, params) return [dict(row) for row in cursor.fetchall()] + def find_patterns_by_community_ids(self, community_ids: List[str]) -> Dict[str, Dict]: + """Batch lookup: return {community_id: pattern_row} for the given ids. + + Avoids the N+1 pattern when applying a manifest. + """ + if not community_ids: + return {} + conn = self.get_connection() + placeholders = ','.join('?' * len(community_ids)) + cursor = conn.execute( + f"SELECT * FROM ad_patterns WHERE community_id IN ({placeholders})", + community_ids, + ) + return {row['community_id']: dict(row) for row in cursor.fetchall()} + def active_pattern_exists_for_sponsor(self, sponsor: str) -> bool: """Return True if any active ad_patterns row exists for this sponsor (case-insensitive).""" if not sponsor: @@ -85,6 +104,25 @@ def get_ad_pattern_by_id(self, pattern_id: int) -> Optional[Dict]: row = cursor.fetchone() return dict(row) if row else None + def get_ad_patterns_by_ids(self, pattern_ids: List[int]) -> Dict[int, Dict]: + """Batch-load ad patterns by id. Returns ``{id: row}`` for every + id that exists. Used by `build_bundle` to avoid 200+ single-row + SELECTs on the "select all" path.""" + if not pattern_ids: + return {} + placeholders = ','.join('?' * len(pattern_ids)) + conn = self.get_connection() + cursor = conn.execute( + f"""SELECT ap.*, ks.name AS sponsor, + p.title as podcast_name, p.slug as podcast_slug + FROM ad_patterns ap + LEFT JOIN podcasts p ON ap.podcast_id = p.slug + LEFT JOIN known_sponsors ks ON ap.sponsor_id = ks.id + WHERE ap.id IN ({placeholders})""", + tuple(pattern_ids), + ) + return {row['id']: dict(row) for row in cursor.fetchall()} + def find_pattern_by_text(self, text_template: str, podcast_id: str = None) -> Optional[Dict]: """Find an existing pattern with the same text_template (for deduplication).""" conn = self.get_connection() @@ -114,20 +152,27 @@ def create_ad_pattern(self, scope: str, text_template: str = None, outro_variants: List[str] = None, created_from_episode_id: str = None, duration: float = None, - created_by: str = 'auto') -> int: + created_by: str = 'auto', + source: str = 'local', + community_id: str = None, + version: int = 1, + submitted_app_version: str = None, + protected_from_sync: int = 0) -> int: """Create a new ad pattern. 
Returns pattern ID.""" conn = self.get_connection() cursor = conn.execute( """INSERT INTO ad_patterns (scope, text_template, sponsor_id, podcast_id, network_id, dai_platform, intro_variants, outro_variants, created_from_episode_id, - avg_duration, duration_samples, created_by) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + avg_duration, duration_samples, created_by, + source, community_id, version, submitted_app_version, protected_from_sync) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", (scope, text_template, sponsor_id, podcast_id, network_id, dai_platform, json.dumps(intro_variants or []), json.dumps(outro_variants or []), created_from_episode_id, duration, 1 if duration is not None else 0, - created_by) + created_by, + source, community_id, version, submitted_app_version, protected_from_sync) ) conn.commit() return cursor.lastrowid @@ -142,7 +187,9 @@ def update_ad_pattern(self, pattern_id: int, **kwargs) -> bool: if key in ('scope', 'text_template', 'sponsor_id', 'podcast_id', 'network_id', 'dai_platform', 'confirmation_count', 'false_positive_count', 'last_matched_at', 'is_active', 'disabled_at', 'disabled_reason', - 'avg_duration', 'duration_samples', 'created_by'): + 'avg_duration', 'duration_samples', 'created_by', + 'source', 'community_id', 'version', 'submitted_app_version', + 'protected_from_sync'): fields.append(f"{key} = ?") values.append(value) elif key in ('intro_variants', 'outro_variants'): @@ -160,6 +207,81 @@ def update_ad_pattern(self, pattern_id: int, **kwargs) -> bool: conn.commit() return True + def find_pattern_by_community_id(self, community_id: str) -> Optional[Dict]: + """Find a pattern by its community_id. Returns dict or None.""" + conn = self.get_connection() + cursor = conn.execute( + "SELECT * FROM ad_patterns WHERE community_id = ?", + (community_id,) + ) + row = cursor.fetchone() + return dict(row) if row else None + + def get_patterns_by_source(self, source: str, active_only: bool = True) -> List[Dict]: + """List ad patterns filtered by source ('local'|'community'|'imported').""" + conn = self.get_connection() + query = "SELECT * FROM ad_patterns WHERE source = ?" + params: List = [source] + if active_only: + query += " AND is_active = 1" + query += " ORDER BY id" + cursor = conn.execute(query, params) + return [dict(row) for row in cursor.fetchall()] + + def set_pattern_protected(self, pattern_id: int, protected: bool) -> bool: + """Toggle the protected_from_sync flag on a pattern.""" + conn = self.get_connection() + cursor = conn.execute( + "UPDATE ad_patterns SET protected_from_sync = ? WHERE id = ?", + (1 if protected else 0, pattern_id), + ) + conn.commit() + return cursor.rowcount > 0 + + def bulk_delete_patterns(self, ids: List[int]) -> int: + """Hard-delete patterns by id. Returns rows deleted.""" + if not ids: + return 0 + conn = self.get_connection() + placeholders = ','.join('?' * len(ids)) + cursor = conn.execute( + f"DELETE FROM ad_patterns WHERE id IN ({placeholders})", + ids, + ) + conn.commit() + return cursor.rowcount + + def bulk_disable_patterns(self, ids: List[int]) -> int: + """Set is_active=0 on patterns by id. Returns rows changed.""" + if not ids: + return 0 + conn = self.get_connection() + placeholders = ','.join('?' 
* len(ids)) + cursor = conn.execute( + f"UPDATE ad_patterns SET is_active = 0, " + f"disabled_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now'), " + f"disabled_reason = COALESCE(disabled_reason, 'bulk-disable') " + f"WHERE id IN ({placeholders})", + ids, + ) + conn.commit() + return cursor.rowcount + + def get_sponsor_tags_map(self) -> Dict[int, List[str]]: + """Return {sponsor_id: [tags]} for all active sponsors. Used by matcher.""" + conn = self.get_connection() + cursor = conn.execute( + "SELECT id, tags FROM known_sponsors WHERE is_active = 1" + ) + result: Dict[int, List[str]] = {} + for row in cursor.fetchall(): + try: + tags = json.loads(row['tags'] or '[]') + except (ValueError, TypeError): + tags = [] + result[row['id']] = tags if isinstance(tags, list) else [] + return result + def increment_pattern_match(self, pattern_id: int): """Increment pattern confirmation count and update last_matched_at.""" conn = self.get_connection() @@ -206,6 +328,28 @@ def delete_ad_pattern(self, pattern_id: int) -> bool: conn.commit() return cursor.rowcount > 0 + def delete_all_community_patterns(self) -> int: + """Hard-delete every pattern with source='community'. Returns the + rowcount. `protected_from_sync` is ignored; the flag guards sync + reconciliation, not an explicit operator purge. + + Also deletes audio_fingerprints rows that point at the doomed + patterns; the schema does not declare an ON DELETE CASCADE on + `pattern_id`, so we clean up explicitly the way + `cleanup_service._purge_pattern` does for the single-row case. + """ + conn = self.get_connection() + conn.execute( + "DELETE FROM audio_fingerprints WHERE pattern_id IN (" + "SELECT id FROM ad_patterns WHERE source = 'community'" + ")" + ) + cursor = conn.execute( + "DELETE FROM ad_patterns WHERE source = 'community'" + ) + conn.commit() + return cursor.rowcount + # ========== Pattern Corrections Methods ========== def create_pattern_correction(self, correction_type: str, pattern_id: int = None, diff --git a/src/database/podcasts.py b/src/database/podcasts.py index daf9cc6a..74dc5fa2 100644 --- a/src/database/podcasts.py +++ b/src/database/podcasts.py @@ -1,4 +1,5 @@ """Podcast CRUD mixin for MinusPod database.""" +import json import logging from typing import Optional, Dict, List @@ -67,6 +68,9 @@ def update_podcast(self, slug: str, **kwargs) -> bool: 'only_expose_processed_episodes'): fields.append(f"{key} = ?") values.append(value) + elif key in ('tags', 'user_tags'): + fields.append(f"{key} = ?") + values.append(json.dumps(value) if isinstance(value, list) else value) if not fields: return False @@ -81,6 +85,125 @@ def update_podcast(self, slug: str, **kwargs) -> bool: conn.commit() return True + def get_podcast_tags(self, slug: str) -> Dict[str, List[str]]: + """Return the source breakdown of a podcast's tags. + + Returns {'effective': [...], 'rss': [...], 'episode': [...], 'user': [...]}. + `rss` is derived as (tags - user_tags - episode_tags) and may include + any RSS-extracted tag from past parses. 
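+
+        Illustrative: a podcast whose feed maps to ['technology'], whose
+        episodes contribute ['news'], and where the user pinned ['comedy']
+        returns {'effective': ['comedy', 'news', 'technology'],
+        'rss': ['technology'], 'episode': ['news'], 'user': ['comedy']}.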
+ """ + conn = self.get_connection() + row = conn.execute( + "SELECT id, tags, user_tags FROM podcasts WHERE slug = ?", (slug,) + ).fetchone() + if not row: + return {'effective': [], 'rss': [], 'episode': [], 'user': []} + try: + effective = json.loads(row['tags'] or '[]') or [] + except (ValueError, TypeError): + effective = [] + try: + user = json.loads(row['user_tags'] or '[]') or [] + except (ValueError, TypeError): + user = [] + # episode-level tags: union across episodes of this podcast + episode_tags: set = set() + cur = conn.execute( + "SELECT tags FROM episodes WHERE podcast_id = ?", (row['id'],) + ) + for ep in cur.fetchall(): + try: + tags = json.loads(ep['tags'] or '[]') or [] + except (ValueError, TypeError): + tags = [] + if isinstance(tags, list): + episode_tags.update(tags) + + user_set = set(user) + ep_set = set(episode_tags) + rss = [t for t in effective if t not in user_set and t not in ep_set] + return { + 'effective': effective, + 'rss': rss, + 'episode': sorted(episode_tags), + 'user': user, + } + + def set_podcast_tags(self, slug: str, *, rss_tags: List[str] = None, + user_tags: List[str] = None) -> bool: + """Recompute and persist podcasts.tags as union of provided + episode tags. + + Pass `rss_tags` to update the RSS-derived layer (caller decides which + subset is RSS-only), or `user_tags` for the user-mutable layer. The + denormalized `tags` field is rewritten to the union of (rss_tags, the + existing user_tags or the override, and all episodes.tags for this podcast). + + Fast pre-check: when `rss_tags` is provided and is already a subset of + the row's current `tags` AND `user_tags` is not changing, skip the + episode-aggregation pass entirely. This is the dominant case on the + feed-refresh hot path — a 300-episode podcast paid one SELECT + 300 + JSON parses every 15 minutes for nothing. + """ + conn = self.get_connection() + row = conn.execute( + "SELECT id, tags, user_tags FROM podcasts WHERE slug = ?", (slug,) + ).fetchone() + if not row: + return False + + try: + current_user = json.loads(row['user_tags'] or '[]') or [] + except (ValueError, TypeError): + current_user = [] + try: + current_all = set(json.loads(row['tags'] or '[]') or []) + except (ValueError, TypeError): + current_all = set() + + # Fast path: RSS-only update where the incoming set is already covered + # by the existing union and the user layer isn't being touched. The + # episode-level aggregation can only grow the union, so if the RSS + # tags are already present we know the final union won't shrink. + if ( + user_tags is None + and rss_tags is not None + and set(rss_tags).issubset(current_all) + ): + return True + + effective_user = list(user_tags) if user_tags is not None else current_user + + # Pull episode-level tags. Done as one SELECT + JSON parse per row; + # podcasts with hundreds of episodes pay this cost only when we + # actually need to recompute the denormalized union. 
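+        # Illustrative recompute: rss_tags={'news'}, user layer ['comedy'],
+        # and episodes contributing {'technology'} persist tags as the
+        # sorted union ['comedy', 'news', 'technology'].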
+ episode_tags: set = set() + cur = conn.execute( + "SELECT tags FROM episodes WHERE podcast_id = ?", (row['id'],) + ) + for ep in cur.fetchall(): + try: + tags = json.loads(ep['tags'] or '[]') or [] + except (ValueError, TypeError): + tags = [] + if isinstance(tags, list): + episode_tags.update(tags) + + if rss_tags is not None: + effective_rss = set(rss_tags) + else: + effective_rss = current_all - set(current_user) - episode_tags + + union = sorted(set(effective_user) | effective_rss | episode_tags) + if set(union) == current_all and effective_user == current_user: + return True + conn.execute( + "UPDATE podcasts SET tags = ?, user_tags = ?, " + "updated_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now') WHERE slug = ?", + (json.dumps(union), json.dumps(effective_user), slug), + ) + conn.commit() + return True + def delete_podcast(self, slug: str) -> bool: """Delete podcast and all associated data.""" conn = self.get_connection() diff --git a/src/database/schema.py b/src/database/schema.py index e2b7a9a0..31a9aea8 100644 --- a/src/database/schema.py +++ b/src/database/schema.py @@ -33,6 +33,8 @@ skip_second_pass INTEGER DEFAULT 0, max_episodes INTEGER, only_expose_processed_episodes INTEGER, + tags TEXT NOT NULL DEFAULT '[]', + user_tags TEXT NOT NULL DEFAULT '[]', created_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')), updated_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')) ); @@ -60,6 +62,7 @@ ad_detection_status TEXT DEFAULT NULL CHECK(ad_detection_status IN (NULL, 'success', 'failed')), artwork_url TEXT, episode_number INTEGER, + tags TEXT NOT NULL DEFAULT '[]', created_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')), updated_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')), FOREIGN KEY (podcast_id) REFERENCES podcasts(id) ON DELETE CASCADE, @@ -129,7 +132,12 @@ disabled_reason TEXT, avg_duration REAL, duration_samples INTEGER DEFAULT 0, - created_by TEXT DEFAULT 'auto' + created_by TEXT DEFAULT 'auto', + source TEXT NOT NULL DEFAULT 'local' CHECK(source IN ('local', 'community', 'imported')), + community_id TEXT, + version INTEGER NOT NULL DEFAULT 1, + submitted_app_version TEXT, + protected_from_sync INTEGER NOT NULL DEFAULT 0 ); -- pattern_corrections table (user corrections; conflicting entries cleaned up on reversal) @@ -167,6 +175,7 @@ category TEXT, common_ctas TEXT DEFAULT '[]', is_active INTEGER DEFAULT 1, + tags TEXT NOT NULL DEFAULT '[]', created_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')) ); @@ -236,6 +245,8 @@ -- Cross-episode training indexes (indexes on new columns created in migrations) CREATE INDEX IF NOT EXISTS idx_patterns_sponsor_id ON ad_patterns(sponsor_id) WHERE is_active = 1; +CREATE INDEX IF NOT EXISTS idx_patterns_source ON ad_patterns(source, is_active); +CREATE INDEX IF NOT EXISTS idx_patterns_community_id ON ad_patterns(community_id) WHERE community_id IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_fingerprints_pattern ON audio_fingerprints(pattern_id); CREATE INDEX IF NOT EXISTS idx_corrections_pattern ON pattern_corrections(pattern_id); CREATE INDEX IF NOT EXISTS idx_sponsors_name ON known_sponsors(name) WHERE is_active = 1; @@ -365,7 +376,12 @@ def _create_new_tables_only(self, conn): is_active INTEGER DEFAULT 1, disabled_at TEXT, disabled_reason TEXT, - created_by TEXT DEFAULT 'auto' + created_by TEXT DEFAULT 'auto', + source TEXT NOT NULL DEFAULT 'local' CHECK(source IN ('local', 'community', 'imported')), + community_id TEXT, + version INTEGER NOT NULL DEFAULT 1, + submitted_app_version TEXT, + protected_from_sync 
INTEGER NOT NULL DEFAULT 0
             )
         """)
@@ -400,7 +416,7 @@ def _create_new_tables_only(self, conn):
             )
         """)
 
-        # Create known_sponsors table if not exists
+        # Create known_sponsors table if not exists (must match SCHEMA_SQL exactly)
         conn.execute("""
             CREATE TABLE IF NOT EXISTS known_sponsors (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -409,6 +425,7 @@ def _create_new_tables_only(self, conn):
                 category TEXT,
                 common_ctas TEXT DEFAULT '[]',
                 is_active INTEGER DEFAULT 1,
+                tags TEXT NOT NULL DEFAULT '[]',
                 created_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
             )
         """)
@@ -681,6 +698,49 @@ def _run_schema_migrations(self):
         self._add_column_if_missing(conn, 'ad_patterns', 'avg_duration', 'REAL', ap_cols)
         self._add_column_if_missing(conn, 'ad_patterns', 'duration_samples', 'INTEGER DEFAULT 0', ap_cols)
 
+        # Community-pattern columns (2.4.0). SCHEMA_SQL declares a CHECK on
+        # `source`, but this ADD COLUMN path does not re-declare it, so the
+        # constraint only exists on fresh databases created from SCHEMA_SQL;
+        # migrated rows simply default to 'local'.
+        ap_cols = self._get_table_columns(conn, 'ad_patterns')
+        self._add_column_if_missing(conn, 'ad_patterns', 'source', "TEXT NOT NULL DEFAULT 'local'", ap_cols)
+        self._add_column_if_missing(conn, 'ad_patterns', 'community_id', 'TEXT', ap_cols)
+        self._add_column_if_missing(conn, 'ad_patterns', 'version', 'INTEGER NOT NULL DEFAULT 1', ap_cols)
+        self._add_column_if_missing(conn, 'ad_patterns', 'submitted_app_version', 'TEXT', ap_cols)
+        self._add_column_if_missing(
+            conn, 'ad_patterns', 'protected_from_sync',
+            'INTEGER NOT NULL DEFAULT 0', ap_cols,
+        )
+
+        # Indexes for source filtering and community_id lookup (idempotent)
+        try:
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_patterns_source "
+                "ON ad_patterns(source, is_active)"
+            )
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_patterns_community_id "
+                "ON ad_patterns(community_id) WHERE community_id IS NOT NULL"
+            )
+            conn.commit()
+        except Exception as e:
+            logger.warning(f"Community pattern index creation: {e}")
+
+        # known_sponsors.tags (2.4.0)
+        ks_cols = self._get_table_columns(conn, 'known_sponsors')
+        self._add_column_if_missing(conn, 'known_sponsors', 'tags', "TEXT NOT NULL DEFAULT '[]'", ks_cols)
+
+        # podcasts.tags and podcasts.user_tags (2.4.0)
+        pod_cols = self._get_table_columns(conn, 'podcasts')
+        self._add_column_if_missing(conn, 'podcasts', 'tags', "TEXT NOT NULL DEFAULT '[]'", pod_cols)
+        self._add_column_if_missing(conn, 'podcasts', 'user_tags', "TEXT NOT NULL DEFAULT '[]'", pod_cols)
+
+        # episodes.tags (2.4.0)
+        ep_cols = self._get_table_columns(conn, 'episodes')
+        self._add_column_if_missing(conn, 'episodes', 'tags', "TEXT NOT NULL DEFAULT '[]'", ep_cols)
+
+        # Sponsor reseed runs at the END of this migration (see below), after
+        # `_migrate_sponsor_fk` so it operates on dedup'd rows.
+
        # Migration: Update episodes status CHECK constraint to include 'permanently_failed'
        # SQLite doesn't support ALTER TABLE to modify constraints, so we recreate the table
        try:
@@ -1243,6 +1303,161 @@ def _run_schema_migrations(self):
            # doesn't update what the editor displays for already-detected ads.
            self._cleanup_zyn_ad_markers(conn)
 
+        # Sponsor seed reseed (2.4.0): CSV is authoritative. Runs LAST so
+        # `_migrate_sponsor_fk` has already deduped case-variants from
+        # legacy v2.1.x rows; the reseed then operates on the canonical
+        # post-FK-migration state. UPDATE on name match preserves `id` for
+        # any existing `ad_patterns.sponsor_id` foreign keys; orphans are
+        # soft-deleted (is_active=0) rather than dropped.
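+        # Illustrative: an existing row named 'betterhelp' matches the
+        # seed's 'BetterHelp' case-insensitively and is updated in place,
+        # keeping its id valid for any ad_patterns.sponsor_id FKs; a row
+        # whose name is absent from the CSV flips to is_active=0 instead.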
+ try: + self._reseed_known_sponsors(conn) + except Exception as e: + logger.error(f"Sponsor reseed failed: {e}") + + # One-shot repair: patterns created before 2.4.6 by + # text_pattern_matcher have `intro_variants` / `outro_variants` + # double-JSON-encoded (caller json.dumps'd, then create_ad_pattern + # json.dumps'd again). The community export pipeline exploded the + # result into a list of single characters. Idempotent: rows that + # parse to a list on the first decode are skipped. + try: + self._repair_double_encoded_variants(conn) + except Exception as e: + logger.error(f"Variant re-encode repair failed: {e}") + + def _repair_double_encoded_variants(self, conn): + """Re-encode any ad_patterns.intro_variants / outro_variants column + whose stored value parses (via json.loads) to a string rather than + a list. Stamps `variant_reencode_revision` so this only runs once + per database.""" + import json + cursor = conn.execute( + "SELECT value FROM settings WHERE key = 'variant_reencode_revision'" + ) + row = cursor.fetchone() + if row and row['value'] == '1': + return + + repaired = 0 + rows = conn.execute( + "SELECT id, intro_variants, outro_variants FROM ad_patterns" + ).fetchall() + for r in rows: + updates = {} + for col in ('intro_variants', 'outro_variants'): + raw = r[col] + if not raw: + continue + try: + parsed = json.loads(raw) + except (TypeError, ValueError): + continue + if not isinstance(parsed, str): + continue # already a list, nothing to do + try: + inner = json.loads(parsed) + except (TypeError, ValueError): + continue + if not isinstance(inner, list): + continue + updates[col] = json.dumps(inner) + if updates: + fields = ', '.join(f'{k} = ?' for k in updates) + conn.execute( + f"UPDATE ad_patterns SET {fields} WHERE id = ?", + list(updates.values()) + [r['id']], + ) + repaired += 1 + + conn.execute( + "INSERT OR REPLACE INTO settings (key, value, updated_at) " + "VALUES ('variant_reencode_revision', '1', strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))" + ) + conn.commit() + if repaired: + logger.info(f"Re-encoded intro/outro_variants on {repaired} ad_patterns rows") + + def _reseed_known_sponsors(self, conn): + """Apply the authoritative sponsor seed list (src/seed_data/sponsors_final.csv). + + UPDATE on name match (case-insensitive) to preserve `id` for any + existing ad_patterns.sponsor_id foreign keys. INSERT new rows. + Soft-delete (is_active=0) any existing sponsor whose name is not in + the CSV. Idempotent: re-running yields the same end state. + + Stamps a settings flag (`sponsor_seed_revision`) on success so we + only do meaningful work once per app version that ships a new seed. + """ + from utils.community_tags import sponsor_seed + + # Bump this when the seed CSV is replaced so the migration re-runs. + SEED_REVISION = '2.4.0' + try: + current = conn.execute( + "SELECT value FROM settings WHERE key = 'sponsor_seed_revision'" + ).fetchone() + if current and current['value'] == SEED_REVISION: + return + except Exception: + # Settings table may not exist yet on a fresh-create path; carry on. 
+ pass + + seed = sponsor_seed() + seed_names_lower = {row['name'].lower() for row in seed} + + # Build existing-name -> id map (case-insensitive) + existing = conn.execute( + "SELECT id, name FROM known_sponsors" + ).fetchall() + existing_by_lower = {row['name'].lower(): row['id'] for row in existing} + + updated = 0 + inserted = 0 + for row in seed: + name = row['name'] + aliases_json = json.dumps(row['aliases']) + tags_json = json.dumps(row['tags']) + existing_id = existing_by_lower.get(name.lower()) + if existing_id is not None: + conn.execute( + "UPDATE known_sponsors SET aliases = ?, tags = ?, is_active = 1 " + "WHERE id = ?", + (aliases_json, tags_json, existing_id), + ) + updated += 1 + else: + conn.execute( + "INSERT INTO known_sponsors (name, aliases, tags, is_active) " + "VALUES (?, ?, ?, 1)", + (name, aliases_json, tags_json), + ) + inserted += 1 + + # Soft-delete orphans: existing sponsors not present in the seed. + orphans = [ + row_id for lower, row_id in existing_by_lower.items() + if lower not in seed_names_lower + ] + deactivated = 0 + for row_id in orphans: + conn.execute( + "UPDATE known_sponsors SET is_active = 0 WHERE id = ?", + (row_id,), + ) + deactivated += 1 + + # Record the revision so this migration is a no-op next boot. + conn.execute( + "INSERT INTO settings (key, value, is_default) VALUES (?, ?, 0) " + "ON CONFLICT(key) DO UPDATE SET value = excluded.value", + ('sponsor_seed_revision', SEED_REVISION), + ) + conn.commit() + logger.info( + f"Migration: sponsor seed v{SEED_REVISION} applied " + f"({inserted} inserted, {updated} updated, {deactivated} deactivated)" + ) + def _cleanup_zyn_cascade(self, conn): try: zyn_row = conn.execute( diff --git a/src/database/settings.py b/src/database/settings.py index 59e638f1..08bffef4 100644 --- a/src/database/settings.py +++ b/src/database/settings.py @@ -33,6 +33,33 @@ def get_setting(self, key: str) -> Optional[str]: row = cursor.fetchone() return row['value'] if row else None + def get_setting_bool(self, key: str, default: bool = False) -> bool: + """Get a setting as bool. 'true'/'1'/'yes' -> True (case-insensitive).""" + v = self.get_setting(key) + if v is None: + return default + return str(v).strip().lower() in ('true', '1', 'yes', 'on') + + def get_setting_float(self, key: str, default: float = 0.0) -> float: + """Get a setting as float, returning `default` on missing/invalid values.""" + v = self.get_setting(key) + if v is None: + return default + try: + return float(v) + except (TypeError, ValueError): + return default + + def get_setting_int(self, key: str, default: int = 0) -> int: + """Get a setting as int, returning `default` on missing/invalid values.""" + v = self.get_setting(key) + if v is None: + return default + try: + return int(float(v)) + except (TypeError, ValueError): + return default + def get_all_settings(self) -> Dict[str, Any]: """Get all settings as a dictionary.""" conn = self.get_connection() diff --git a/src/database/sponsors.py b/src/database/sponsors.py index 483a9f11..e54a126a 100644 --- a/src/database/sponsors.py +++ b/src/database/sponsors.py @@ -38,13 +38,15 @@ def get_known_sponsor_by_name(self, name: str) -> Optional[Dict]: return dict(row) if row else None def create_known_sponsor(self, name: str, aliases: List[str] = None, - category: str = None, common_ctas: List[str] = None) -> int: + category: str = None, common_ctas: List[str] = None, + tags: List[str] = None) -> int: """Create a known sponsor. 
Returns sponsor ID.""" conn = self.get_connection() cursor = conn.execute( - """INSERT INTO known_sponsors (name, aliases, category, common_ctas) - VALUES (?, ?, ?, ?)""", - (name, json.dumps(aliases or []), category, json.dumps(common_ctas or [])) + """INSERT INTO known_sponsors (name, aliases, category, common_ctas, tags) + VALUES (?, ?, ?, ?, ?)""", + (name, json.dumps(aliases or []), category, + json.dumps(common_ctas or []), json.dumps(tags or [])) ) conn.commit() return cursor.lastrowid @@ -59,7 +61,7 @@ def update_known_sponsor(self, sponsor_id: int, **kwargs) -> bool: if key in ('name', 'category', 'is_active'): fields.append(f"{key} = ?") values.append(value) - elif key in ('aliases', 'common_ctas'): + elif key in ('aliases', 'common_ctas', 'tags'): fields.append(f"{key} = ?") values.append(json.dumps(value) if isinstance(value, list) else value) @@ -74,6 +76,23 @@ def update_known_sponsor(self, sponsor_id: int, **kwargs) -> bool: conn.commit() return True + def get_sponsors_by_tag(self, tag: str, active_only: bool = True) -> List[Dict]: + """Return sponsors whose tags JSON array contains the given tag. + + SQLite json_each is used to avoid loading every row into Python. + """ + conn = self.get_connection() + query = ( + "SELECT s.* FROM known_sponsors s, json_each(s.tags) j " + "WHERE j.value = ?" + ) + params: List = [tag] + if active_only: + query += " AND s.is_active = 1" + query += " ORDER BY s.name" + cursor = conn.execute(query, params) + return [dict(row) for row in cursor.fetchall()] + def delete_known_sponsor(self, sponsor_id: int) -> bool: """Delete a known sponsor (or set inactive).""" conn = self.get_connection() diff --git a/src/main_app/background.py b/src/main_app/background.py index ff32ea14..86bc3340 100644 --- a/src/main_app/background.py +++ b/src/main_app/background.py @@ -59,11 +59,18 @@ def background_rss_refresh(): """ from main_app.feeds import refresh_all_feeds from pricing_fetcher import refresh_pricing_if_stale - _, _, shutdown_event, _, _ = _get_components() + from community_sync import community_pattern_sync_tick + db, _, shutdown_event, _, _ = _get_components() while not shutdown_event.is_set(): refresh_all_feeds() run_cleanup() refresh_pricing_if_stale() # TTL-gated, fetches once per 24h + # Community pattern sync — gated by settings.community_sync_enabled + # and the cron schedule; safe to call every tick. + try: + community_pattern_sync_tick(db) + except Exception as e: + refresh_logger.warning(f"community_pattern_sync_tick failed: {e}") # Wait 15 minutes, but allow early exit on shutdown shutdown_event.wait(timeout=900) diff --git a/src/main_app/feeds.py b/src/main_app/feeds.py index 3750318e..7efdd744 100644 --- a/src/main_app/feeds.py +++ b/src/main_app/feeds.py @@ -174,6 +174,20 @@ def refresh_rss_feed(slug: str, feed_url: str, force: bool = False): update_kwargs['last_modified_header'] = new_last_modified db.update_podcast(slug, **update_kwargs) + # Map iTunes categories to MinusPod vocabulary tags, then refresh the + # RSS layer of the podcast's tags. set_podcast_tags also folds in + # episode-level tags and the user_tags layer. 
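+    # Illustrative mappings from src/seed_data/itunes_category_map.json:
+    # 'Tech News' -> 'news', 'Society & Culture' -> 'society_culture';
+    # unmapped categories contribute no tag.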
+ try: + from utils.community_tags import map_itunes_category + raw_cats = rss_parser.extract_podcast_categories(parsed_feed) + rss_tags = sorted({ + tag for cat in raw_cats + if (tag := map_itunes_category(cat)) + }) + db.set_podcast_tags(slug, rss_tags=rss_tags) + except Exception as e: + refresh_logger.warning(f"[{slug}] iTunes category mapping failed: {e}") + # Detect DAI platform and network from feed metadata feed_author = parsed_feed.feed.get('author', '') network_info = pattern_service.update_podcast_metadata( diff --git a/src/main_app/processing.py b/src/main_app/processing.py index b4a497cb..89bc5f8c 100644 --- a/src/main_app/processing.py +++ b/src/main_app/processing.py @@ -301,12 +301,23 @@ def _detect_ads_first_pass(slug, episode_id, segments, audio_path, db, storage, _, ad_detector, _, _, _, status_service, _, _ = _get_components() status_service.update_job_stage("pass1:detecting", 50) clear_fallback(episode_id, PASS_AD_DETECTION_1) + # Load podcast tags once for the matcher's community-pattern eligibility check. + podcast_tags = None + try: + podcast_row = db.get_podcast_by_slug(slug) + if podcast_row and podcast_row.get('tags'): + import json as _json + podcast_tags = set(_json.loads(podcast_row['tags'])) + except Exception: + podcast_tags = None + ad_result = ad_detector.process_transcript( segments, podcast_name, episode_title, slug, episode_id, episode_description, audio_path=audio_path, podcast_id=slug, skip_patterns=skip_patterns, podcast_description=podcast_description, + podcast_tags=podcast_tags, progress_callback=progress_callback, audio_analysis=audio_analysis_result, cancel_event=cancel_event diff --git a/src/pattern_service.py b/src/pattern_service.py index 20b56de4..ee481cbb 100644 --- a/src/pattern_service.py +++ b/src/pattern_service.py @@ -25,6 +25,31 @@ logger = logging.getLogger('podcast.patterns') + +def _splice_prefix(text: str, prefix: str) -> Tuple[str, bool]: + """Return (text-without-prefix, applied?). Whitespace- and + case-insensitive: a leading space on either side or a case mismatch + doesn't block the splice. When `prefix` is empty or doesn't actually + start `text`, the original `text` is returned with `applied=False`. + """ + if not prefix: + return text, False + stripped = text.lstrip() + if not stripped.lower().startswith(prefix.lower()): + return text, False + offset = len(text) - len(stripped) + return (text[:offset] + stripped[len(prefix):]).lstrip(), True + + +def _splice_suffix(text: str, suffix: str) -> Tuple[str, bool]: + """Mirror of `_splice_prefix` for the tail end.""" + if not suffix: + return text, False + stripped = text.rstrip() + if not stripped.lower().endswith(suffix.lower()): + return text, False + return stripped[:-len(suffix)].rstrip(), True + # Known DAI platforms and their RSS signatures DAI_PLATFORMS = { 'megaphone': [ @@ -812,3 +837,167 @@ def _get_text_pattern_matcher(self) -> TextPatternMatcher: if getattr(self, '_text_pattern_matcher', None) is None: self._text_pattern_matcher = TextPatternMatcher(db=self.db) return self._text_pattern_matcher + + def rewrite_pattern_from_bounds( + self, + pattern_id: int, + transcript: str, + original_start: float, + original_end: float, + new_start: float, + new_end: float, + ) -> bool: + """Trim a pattern's text_template by the slice that fell outside the new bounds. + + Computes the head slice [original_start, new_start) and tail slice + (new_end, original_end] from the transcript, then splices them out of + the existing `text_template` if they appear at its start/end. 
This is + Operation 1 from the plan ("trim-only updates") — explicitly NOT a + full re-extract from the new bounds, which would fit the template to + one episode's transcription and risk breaking matches on episodes + that captured the cleaner version. + + intro_variants / outro_variants get the same head/tail prefix/suffix + treatment so they stay aligned with the new template. + + Community patterns are never auto-rewritten by this method. + Returns True when the pattern was actually changed; False otherwise. + """ + from utils.text import extract_text_in_range + + if not self.db or not transcript: + return False + + pattern = self.db.get_ad_pattern_by_id(pattern_id) + if not pattern: + logger.warning(f"rewrite_pattern_from_bounds: pattern {pattern_id} not found") + return False + if (pattern.get('source') or 'local') != 'local': + logger.info( + f"rewrite_pattern_from_bounds: skipping non-local pattern " + f"{pattern_id} (source={pattern.get('source')})" + ) + return False + + old_text = pattern.get('text_template') or '' + if not old_text: + return False + + head_trim = (extract_text_in_range(transcript, original_start, new_start) or '').strip() + tail_trim = (extract_text_in_range(transcript, new_end, original_end) or '').strip() + + new_template, head_applied = _splice_prefix(old_text, head_trim) + new_template, tail_applied = _splice_suffix(new_template, tail_trim) + + if not (head_applied or tail_applied): + logger.info( + f"rewrite_pattern_from_bounds: pattern {pattern_id} trim slices " + f"do not match the existing template (head={len(head_trim)} " + f"chars, tail={len(tail_trim)} chars); skipping rewrite" + ) + return False + + if len(new_template) < 50: + logger.info( + f"rewrite_pattern_from_bounds: pattern {pattern_id} trimmed " + f"template too short ({len(new_template)} chars); skipping rewrite" + ) + return False + + try: + intro_variants = json.loads(pattern.get('intro_variants') or '[]') or [] + except (TypeError, ValueError): + intro_variants = [] + try: + outro_variants = json.loads(pattern.get('outro_variants') or '[]') or [] + except (TypeError, ValueError): + outro_variants = [] + + # Mirror the head/tail trim onto the variant arrays. Variants that + # don't share the trimmed prefix/suffix were independent samples; + # leave them alone. + if head_applied and head_trim: + intro_variants = [_splice_prefix(v, head_trim)[0] for v in intro_variants] + if tail_applied and tail_trim: + outro_variants = [_splice_suffix(v, tail_trim)[0] for v in outro_variants] + + self.db.update_ad_pattern( + pattern_id, + text_template=new_template, + intro_variants=intro_variants, + outro_variants=outro_variants, + ) + logger.info( + f"rewrite_pattern_from_bounds: pattern {pattern_id} trimmed " + f"(head={len(head_trim) if head_applied else 0} chars, " + f"tail={len(tail_trim) if tail_applied else 0} chars); " + f"template now {len(new_template)} chars" + ) + # Invalidate the cached matcher so it reloads on next match call. + self._text_pattern_matcher = None + return True + + def import_community_pattern(self, data: Dict) -> int: + """Insert or update an ad pattern carrying a community_id. + + New community_id -> INSERT with source='community', protected_from_sync=0. + Existing community_id with higher `version` -> UPDATE in place, + unless the row has `protected_from_sync = 1` (then skip). + + Returns the pattern id (new or existing). Raises ValueError when + required fields are missing. 
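+
+        Illustrative payload (values are made up):
+
+            {'community_id': 'cp-123', 'scope': 'global',
+             'text_template': 'This episode is brought to you by ...',
+             'sponsor': 'BetterHelp', 'version': 2,
+             'intro_variants': [], 'outro_variants': []}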
+ """ + if not self.db: + raise ValueError("import_community_pattern requires a database") + + required = ('community_id', 'text_template', 'sponsor', 'scope') + missing = [k for k in required if not data.get(k)] + if missing: + raise ValueError(f"import_community_pattern: missing fields {missing}") + + community_id = data['community_id'] + version = int(data.get('version') or 1) + + sponsor_name = data['sponsor'] + sponsor_id = get_or_create_known_sponsor(self.db, sponsor_name) + + existing = self.db.find_pattern_by_community_id(community_id) + if existing: + if existing.get('protected_from_sync'): + logger.info( + f"import_community_pattern: community_id={community_id} " + f"is protected; skipping" + ) + return existing['id'] + if version <= int(existing.get('version') or 1): + # Same or older version — no-op. + return existing['id'] + + self.db.update_ad_pattern( + existing['id'], + text_template=data['text_template'], + intro_variants=data.get('intro_variants') or [], + outro_variants=data.get('outro_variants') or [], + sponsor_id=sponsor_id, + version=version, + submitted_app_version=data.get('submitted_app_version'), + ) + return existing['id'] + + pattern_id = self.db.create_ad_pattern( + scope=data['scope'], + text_template=data['text_template'], + sponsor_id=sponsor_id, + podcast_id=data.get('podcast_id'), + network_id=data.get('network_id'), + dai_platform=data.get('dai_platform'), + intro_variants=data.get('intro_variants') or [], + outro_variants=data.get('outro_variants') or [], + created_by='community', + source='community', + community_id=community_id, + version=version, + submitted_app_version=data.get('submitted_app_version'), + protected_from_sync=0, + ) + return pattern_id diff --git a/src/processing_timeouts.py b/src/processing_timeouts.py index f39f576c..906d8a6f 100644 --- a/src/processing_timeouts.py +++ b/src/processing_timeouts.py @@ -47,7 +47,11 @@ def _resolve(key: str, env_name: str, default: int) -> int: value: Optional[int] = None try: - from database import get_database + # `get_database` lives in the api package, not the database package. + # The pre-2.4.x code imported from `database` and silently swallowed + # an ImportError on every refresh tick — env-var / default fallback + # picked up the slack so the bug went unnoticed. + from api import get_database raw = get_database().get_setting(key) if raw is not None and str(raw).strip(): try: diff --git a/src/rss_parser.py b/src/rss_parser.py index f4684a79..67adff07 100644 --- a/src/rss_parser.py +++ b/src/rss_parser.py @@ -317,6 +317,59 @@ def extract_podcast_artwork_url(parsed_feed) -> Optional[str]: return feed.itunes_image.get('href') return None + @staticmethod + def extract_podcast_categories(parsed_feed) -> List[str]: + """Extract iTunes category strings (top-level + subcategory) from a parsed feed. + + Returns the raw category labels exactly as they appear in the feed. + Callers map them through `utils.community_tags.map_itunes_category`. + """ + if not parsed_feed or not parsed_feed.feed: + return [] + labels: List[str] = [] + feed = parsed_feed.feed + # feedparser exposes RSS-level categories on .tags as a list of dicts. + tags = feed.get('tags', []) if hasattr(feed, 'get') else getattr(feed, 'tags', []) + for t in tags or []: + label = None + if isinstance(t, dict): + label = t.get('term') or t.get('label') + else: + label = getattr(t, 'term', None) or getattr(t, 'label', None) + if label and isinstance(label, str): + labels.append(label.strip()) + # Dedup while preserving order. 
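+        # e.g. ['News', 'Tech News', 'News'] -> ['News', 'Tech News']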
+ seen = set() + out: List[str] = [] + for lab in labels: + if lab not in seen: + seen.add(lab) + out.append(lab) + return out + + @staticmethod + def extract_episode_categories(entry) -> List[str]: + """Extract iTunes category strings from a single feedparser entry.""" + if entry is None: + return [] + tags = entry.get('tags', []) if hasattr(entry, 'get') else getattr(entry, 'tags', []) + labels: List[str] = [] + for t in tags or []: + label = None + if isinstance(t, dict): + label = t.get('term') or t.get('label') + else: + label = getattr(t, 'term', None) or getattr(t, 'label', None) + if label and isinstance(label, str): + labels.append(label.strip()) + seen = set() + out: List[str] = [] + for lab in labels: + if lab not in seen: + seen.add(lab) + out.append(lab) + return out + def generate_episode_id(self, episode_url: str, guid: str = None) -> str: """Generate consistent episode ID from GUID or URL. @@ -666,6 +719,20 @@ def extract_episodes(self, feed_content: str) -> List[Dict]: except (ValueError, TypeError): pass + # Map per-episode iTunes categories to vocabulary tags. + ep_tags: List[str] = [] + try: + from utils.community_tags import map_itunes_category + raw_cats = self.extract_episode_categories(entry) + seen = set() + for cat in raw_cats: + tag = map_itunes_category(cat) + if tag and tag not in seen: + seen.add(tag) + ep_tags.append(tag) + except Exception as e: + logger.warning(f"Episode iTunes category mapping failed: {e}") + episodes.append({ 'id': self.generate_episode_id(episode_url, entry.get('id', '')), 'url': episode_url, @@ -674,6 +741,7 @@ def extract_episodes(self, feed_content: str) -> List[Dict]: 'description': self._get_episode_description(entry), 'artwork_url': artwork_url, 'episode_number': episode_number, + 'tags': ep_tags, }) # De-duplicate episodes (keep latest when multiple versions exist) diff --git a/src/seed_data/itunes_category_map.json b/src/seed_data/itunes_category_map.json new file mode 100644 index 00000000..3b5a4761 --- /dev/null +++ b/src/seed_data/itunes_category_map.json @@ -0,0 +1,113 @@ +{ + "_comment": "Maps iTunes podcast category strings (top-level and common subcategories) to MinusPod vocabulary tags. Match is case-insensitive. 
Unknown categories produce no tag.", + "Arts": "arts", + "Books": "books", + "Design": "arts", + "Fashion & Beauty": "arts", + "Food": "leisure", + "Performing Arts": "arts", + "Visual Arts": "arts", + "Business": "business", + "Careers": "business", + "Entrepreneurship": "business", + "Investing": "business", + "Management": "business", + "Marketing": "business", + "Non-Profit": "business", + "Comedy": "comedy", + "Comedy Interviews": "comedy", + "Improv": "comedy", + "Stand-Up": "comedy", + "Education": "education", + "Courses": "education", + "How To": "education", + "Language Learning": "language_learning", + "Self-Improvement": "self_improvement", + "Fiction": "fiction", + "Comedy Fiction": "fiction", + "Drama": "fiction", + "Science Fiction": "fiction", + "Government": "politics", + "History": "history", + "Health & Fitness": "health", + "Alternative Health": "health", + "Fitness": "health", + "Medicine": "health", + "Mental Health": "mental_health", + "Nutrition": "health", + "Sexuality": "health", + "Kids & Family": "kids_family", + "Education for Kids": "kids_family", + "Parenting": "kids_family", + "Pets & Animals": "kids_family", + "Stories for Kids": "kids_family", + "Leisure": "leisure", + "Animation & Manga": "leisure", + "Automotive": "automotive", + "Aviation": "leisure", + "Crafts": "leisure", + "Games": "gaming", + "Hobbies": "leisure", + "Home & Garden": "leisure", + "Video Games": "gaming", + "Music": "music", + "Music Commentary": "music", + "Music History": "music", + "Music Interviews": "music", + "News": "news", + "Business News": "news", + "Daily News": "news", + "Entertainment News": "news", + "News Commentary": "news", + "Politics": "politics", + "Sports News": "sports", + "Tech News": "news", + "Religion & Spirituality": "religion", + "Buddhism": "religion", + "Christianity": "religion", + "Hinduism": "religion", + "Islam": "religion", + "Judaism": "religion", + "Religion": "religion", + "Spirituality": "religion", + "Science": "science", + "Astronomy": "science", + "Chemistry": "science", + "Earth Sciences": "science", + "Life Sciences": "science", + "Mathematics": "science", + "Natural Sciences": "science", + "Nature": "science", + "Physics": "science", + "Social Sciences": "science", + "Society & Culture": "society_culture", + "Documentary": "society_culture", + "Personal Journals": "society_culture", + "Philosophy": "society_culture", + "Places & Travel": "travel", + "Relationships": "society_culture", + "Sports": "sports", + "Baseball": "sports", + "Basketball": "sports", + "Cricket": "sports", + "Fantasy Sports": "sports", + "Football": "sports", + "Golf": "sports", + "Hockey": "sports", + "Rugby": "sports", + "Running": "sports", + "Soccer": "sports", + "Swimming": "sports", + "Tennis": "sports", + "Volleyball": "sports", + "Wilderness": "sports", + "Wrestling": "sports", + "Technology": "technology", + "True Crime": "true_crime", + "TV & Film": "tv_film", + "After Shows": "tv_film", + "Film History": "tv_film", + "Film Interviews": "tv_film", + "Film Reviews": "tv_film", + "TV Reviews": "tv_film" +} diff --git a/src/seed_data/sponsors_final.csv b/src/seed_data/sponsors_final.csv new file mode 100644 index 00000000..f00a53d5 --- /dev/null +++ b/src/seed_data/sponsors_final.csv @@ -0,0 +1,256 @@ +name,aliases,tags +Athletic Greens,AG1|AG One,supplements|health|dtc|universal +BetterHelp,Better Help,mental_health|health|universal +Squarespace,Square Space,tech|saas|dtc|universal +Shopify,,tech|saas|universal +HelloFresh,Hello 
Fresh,meal_kit|food|dtc|universal +NordVPN,Nord VPN,vpn|tech|security|universal +ExpressVPN,Express VPN,vpn|tech|security|universal +ZipRecruiter,Zip Recruiter,jobs|universal +SimpliSafe,Simpli Safe,home_security|universal +Mint Mobile,MintMobile,telecom|universal +MasterClass,Master Class,education|universal +Rocket Money,RocketMoney|Truebill,finance|saas|universal +DoorDash,Door Dash,food|universal +HubSpot,Hub Spot,tech|saas +NetSuite,Net Suite,tech|saas +Amazon,,tech +Audible,,streaming|books|universal +Factor,,meal_kit|food|dtc|universal +Calm,,mental_health|health|universal +Headspace,Head Space,mental_health|health|universal +Indeed,,jobs|universal +LinkedIn,LinkedIn Jobs,jobs|tech|saas|universal +Stamps.com,Stamps,tech|saas|universal +Ring,,home_security|tech|universal +ADT,,home_security|universal +Casper,,mattress|dtc|universal +Helix Sleep,Helix,mattress|dtc|universal +Purple,,mattress|dtc|universal +Brooklinen,,home_goods|dtc|universal +Bombas,,apparel|dtc|universal +Manscaped,,personal_care|dtc|universal +Dollar Shave Club,DSC,personal_care|dtc|universal +Harry's,Harrys,personal_care|dtc|universal +Quip,,personal_care|health|dtc|universal +Hims,,health|personal_care|dtc|universal +Hers,,health|personal_care|dtc|universal +Roman,,health|personal_care|dtc|universal +Keeps,,health|personal_care|dtc|universal +Function of Beauty,,personal_care|health|dtc|universal +Native,,personal_care|dtc|universal +Liquid IV,Liquid I.V.,supplements|beverage|health|dtc|universal +Athletic Brewing,,beverage|dtc +Magic Spoon,,food|dtc|universal +Thrive Market,,food|dtc|universal +Butcher Box,ButcherBox,food|dtc|universal +Blue Apron,,meal_kit|food|dtc|universal +Uber Eats,UberEats,food|universal +Grubhub,Grub Hub,food|universal +Instacart,,food|universal +Credit Karma,,finance|universal +SoFi,,finance|universal +Acorns,,finance|universal +Betterment,,finance|universal +Wealthfront,,finance|universal +PolicyGenius,Policy Genius,finance|insurance|universal +Lemonade,,finance|insurance|universal +State Farm,,insurance|universal +Progressive,,insurance|universal +Geico,,insurance|universal +Liberty Mutual,,insurance|universal +T-Mobile,TMobile,telecom +Visible,,telecom|universal +FanDuel,Fan Duel,gambling|sports|universal +DraftKings,Draft Kings,gambling|sports|universal +BetMGM,Bet MGM,gambling|sports|universal +Toyota,,auto +Hyundai,,auto +CarMax,Car Max,auto|universal +Carvana,,auto|universal +eBay Motors,,auto|universal +ZocDoc,Zoc Doc,health|universal +GoodRx,Good Rx,health|universal +Care/of,Care of|Careof,supplements|health|dtc|universal +Ritual,,supplements|health|dtc|universal +Seed,,supplements|health|dtc|universal +Monday.com,Monday,tech|saas +Notion,,tech|saas|universal +Canva,,tech|saas|universal +Grammarly,,tech|saas|universal +Babbel,,education|language_learning|universal +Rosetta Stone,,education|language_learning|universal +Blinkist,,education|books|universal +Raycon,,tech +Bose,,tech +MacPaw,CleanMyMac,tech|saas +Green Chef,GreenChef,meal_kit|food|dtc|universal +Magic Mind,,beverage|supplements|dtc|universal +Honeylove,Honey Love,apparel|dtc|universal +Cozy Earth,,home_goods|dtc|universal +Quince,,apparel|dtc|universal +LMNT,Element,supplements|beverage|health|dtc|universal +Nutrafol,,supplements|health|dtc|universal +Aura,,tech|security|saas|universal +OneSkin,One Skin,personal_care|health|dtc|universal +Incogni,,tech|security|saas|universal +Gametime,Game Time,sports|universal +1Password,One Password,tech|security|saas|universal +Bitwarden,Bit Warden,tech|security|saas|universal 
+CacheFly,,tech|saas +Deel,,tech|saas|jobs +DeleteMe,Delete Me,tech|security|saas|universal +Framer,,tech|saas|universal +Miro,,tech|saas +Monarch Money,,finance|saas|universal +OutSystems,,tech|saas +Spaceship,,tech|saas +Thinkst Canary,,tech|security|saas +ThreatLocker,,tech|security|saas +Vanta,,tech|security|saas +Veeam,,tech|saas +Zapier,,tech|saas|universal +Zscaler,,tech|security|saas +Capital One,,finance|universal +Ford,,auto +WhatsApp,,tech +Lime,,auto +Lyft,,auto|universal +Turo,,auto|universal +Uber,,auto|universal +Waymo,,auto +Gusto,,tech|saas +Meter,,tech|saas +PagerDuty,,tech|saas +Rippling,,tech|saas|jobs +Splunk,,tech|saas +Webflow,,tech|saas +Allbirds,,apparel|dtc|universal +Alo Yoga,,apparel|dtc +Birchbox,,personal_care|dtc|universal +Everlane,,apparel|dtc|universal +FabFitFun,,apparel|personal_care|dtc|universal +GOAT,,apparel +Gopuff,,food|universal +Lululemon,,apparel +Outdoor Voices,,apparel|dtc|universal +Poshmark,,apparel +Rothy's,,apparel|dtc|universal +Saatva,,mattress|dtc|universal +Shein,,apparel +SKIMS,,apparel|dtc +Stitch Fix,,apparel|dtc|universal +StockX,,apparel +Temu,,apparel +Ten Thousand,,apparel|dtc +ThredUp,,apparel +Vuori,,apparel|dtc|universal +Warby Parker,,apparel|dtc|universal +Wayfair,,home_goods +Affirm,,finance|universal +Bill.com,,finance|saas +Brex,,finance|saas +Chime,,finance|universal +Coinbase,,finance +FreshBooks,,finance|saas +Intuit,,finance|saas +Klarna,,finance|universal +Mercury,,finance|saas +NerdWallet,,finance|universal +Plaid,,finance|tech|saas +Public.com,,finance|universal +QuickBooks,,finance|saas +Ramp,,finance|saas +Robinhood,,finance|universal +Stripe,,finance|tech|saas +UnitedHealth Group,,health|insurance +WebBank,,finance +Xero,,finance|saas +Alani Nu,,supplements|beverage|health|dtc|universal +Bloom Nutrition,,supplements|health|dtc|universal +EveryPlate,,meal_kit|food|dtc|universal +Huel,,food|supplements|dtc|universal +Imperfect Foods,,food|dtc|universal +McDonald's,,food +OLIPOP,,beverage|dtc +Poppi,,beverage|dtc +Starbucks,,beverage|food +Transparent Labs,,supplements|health|dtc|universal +Caesars Sportsbook,,gambling|sports|universal +ESPN Bet,,gambling|sports|universal +SeatGeek,,sports|universal +StubHub,,sports|universal +Pura,,home_goods|home_security|dtc +LegalZoom,,tech|saas|universal +Rocket Lawyer,,tech|saas|universal +Apple TV+,,streaming|tv_film|universal +Disney+,,streaming|tv_film|universal +HBO Max,,streaming|tv_film|universal +iHeartRadio,,streaming|music|universal +Netflix,,streaming|tv_film|universal +Paramount+,,streaming|tv_film|universal +SiriusXM,,streaming|music|universal +Spotify,,streaming|music|universal +YouTube,,streaming|tv_film|universal +YouTube TV,,streaming|tv_film|universal +Cerebral,,mental_health|health|universal +Eight Sleep,,health|dtc|universal +Function Health,,health|universal +Inside Tracker,,health|supplements|universal +Joovv,,health|dtc|universal +Levels,,health|universal +Momentous,,supplements|health|dtc|universal +Noom,,health|mental_health|universal +Ro,,health|personal_care|universal +Talkspace,,mental_health|health|universal +Thorne,,supplements|health|dtc|universal +WHOOP,,health|sports|dtc|universal +Airtable,,tech|saas +Anthropic,,tech|saas +Asana,,tech|saas +Brilliant,,tech|education|saas|universal +ClickUp,,tech|saas +Cloudflare,,tech|security|saas +CrowdStrike,,tech|security|saas +Cursor,,tech|saas +Databricks,,tech|saas +Datadog,,tech|saas +DocuSign,,tech|saas +Duolingo,,tech|education|language_learning|saas|universal +ElevenLabs,,tech|saas 
+Figma,,tech|saas +GitHub,,tech|saas +GitHub Copilot,,tech|saas +Klaviyo,,tech|saas +Linear,,tech|saas +Loom,,tech|saas +Mailchimp,,tech|saas +Midjourney,,tech|saas +Okta,,tech|security|saas +OpenAI,,tech|saas +Patreon,,tech|saas +Perplexity,,tech|saas +Retool,,tech|saas +Salesforce,,tech|saas +SendGrid,,tech|saas +ServiceNow,,tech|saas +Skillshare,,tech|education|saas|universal +Slack,,tech|saas +Snowflake,,tech|saas +Substack,,tech|saas +Twilio,,tech|saas +Vercel,,tech|saas +Workday,,tech|saas +Zendesk,,tech|saas +Zoom,,tech|saas +AT&T,,telecom +Comcast,,telecom +Verizon,,telecom +Airbnb,,travel|universal +Booking.com,,travel|universal +Expedia,,travel|universal +Hopper,,travel|universal +Kayak,,travel|universal +Skyscanner,,travel|universal +Vrbo,,travel|universal +Zyn,ZYN|Zinn,nicotine|universal diff --git a/src/seed_data/tag_vocabulary.csv b/src/seed_data/tag_vocabulary.csv new file mode 100644 index 00000000..2f7ff79d --- /dev/null +++ b/src/seed_data/tag_vocabulary.csv @@ -0,0 +1,49 @@ +category,tag,description +podcast_genre,arts,"Arts, design, fashion, performing/visual arts" +podcast_genre,books,"Books, audiobook-targeted shows" +podcast_genre,business,"Business, careers, entrepreneurship, investing, management, marketing" +podcast_genre,comedy,"Comedy, stand-up, improv" +podcast_genre,education,"General education, courses, how-to" +podcast_genre,language_learning,"Language learning specifically (Babbel/Rosetta target)" +podcast_genre,self_improvement,"Self-improvement, productivity, life advice" +podcast_genre,fiction,"Narrative fiction, audio drama" +podcast_genre,history,"History, historical analysis" +podcast_genre,health,"Health & fitness, alternative health, medicine, nutrition, sexuality" +podcast_genre,mental_health,"Mental health, meditation, mindfulness" +podcast_genre,kids_family,"Kids, family, parenting, pets" +podcast_genre,leisure,"Leisure, crafts, hobbies, home & garden" +podcast_genre,gaming,"Video games, board games, gaming culture" +podcast_genre,automotive,"Cars, car culture, auto enthusiast" +podcast_genre,music,"Music, music commentary, music interviews" +podcast_genre,news,"General news, daily news, news commentary, business/tech news" +podcast_genre,politics,"Politics, political commentary, government" +podcast_genre,religion,"Religion, spirituality, faith" +podcast_genre,science,"Science, astronomy, physics, chemistry, life sciences" +podcast_genre,society_culture,"Society & culture, documentary, philosophy, relationships" +podcast_genre,travel,"Places & travel" +podcast_genre,sports,"Sports, all sports subcategories" +podcast_genre,technology,"Technology shows specifically" +podcast_genre,true_crime,"True crime" +podcast_genre,tv_film,"TV & film, after shows, film reviews" +sponsor_industry,tech,"Software, apps, digital tools (broad)" +sponsor_industry,saas,"Subscription B2B/B2C software" +sponsor_industry,vpn,"VPN and privacy networking" +sponsor_industry,security,"Password managers, identity protection, antivirus" +sponsor_industry,finance,"Banking, investing, lending, credit" +sponsor_industry,insurance,"Insurance products (auto, home, life, etc.)" +sponsor_industry,food,"Restaurants, grocery, delivery" +sponsor_industry,meal_kit,"Meal kit delivery (HelloFresh, Factor, etc.)" +sponsor_industry,beverage,"Drinks, alcohol, energy, hydration" +sponsor_industry,supplements,"Supplements, vitamins, nutrition powders" +sponsor_industry,apparel,"Clothing, shoes, accessories" +sponsor_industry,home_goods,"Furniture, decor, home goods (non-mattress)" 
+sponsor_industry,mattress,"Mattresses, bedding (Casper, Helix, etc.)" +sponsor_industry,home_security,"Alarms, cameras, smart locks" +sponsor_industry,personal_care,"Grooming, skincare, haircare, hygiene" +sponsor_industry,auto,"Cars, car buying, auto services" +sponsor_industry,telecom,"Mobile carriers, internet providers" +sponsor_industry,jobs,"Recruitment, hiring platforms" +sponsor_industry,streaming,"Streaming services, audiobooks (Audible, Netflix, etc.)" +sponsor_industry,gambling,"Sportsbooks, casinos, fantasy sports" +sponsor_industry,nicotine,"Tobacco, vape, nicotine pouches" +sponsor_industry,dtc,"Direct-to-consumer brand (cross-cutting tag)" diff --git a/src/text_pattern_matcher.py b/src/text_pattern_matcher.py index fa12e03e..cb9ab2c9 100644 --- a/src/text_pattern_matcher.py +++ b/src/text_pattern_matcher.py @@ -15,6 +15,7 @@ from utils.text import extract_text_from_segments from sponsor_normalize import get_or_create_known_sponsor from utils.constants import INVALID_SPONSOR_VALUES +from utils.community_tags import UNIVERSAL_TAG logger = logging.getLogger('podcast.textmatch') @@ -131,6 +132,8 @@ class AdPattern: podcast_id: Optional[str] = None network_id: Optional[str] = None avg_duration: Optional[float] = None + sponsor_id: Optional[int] = None + source: str = 'local' # "local", "community", "imported" class TextPatternMatcher: @@ -158,6 +161,8 @@ def __init__(self, db=None, sponsor_service=None): self._patterns: List[AdPattern] = [] self._pattern_buckets = {} self._initialized = False + # sponsor_id -> set of tags; populated alongside _load_patterns. + self._sponsor_tags: Dict[int, set] = {} def _ensure_initialized(self): """Lazy initialization of TF-IDF vectorizer.""" @@ -217,9 +222,19 @@ def _load_patterns(self): scope=p.get('scope', 'podcast'), podcast_id=p.get('podcast_id'), network_id=p.get('network_id'), - avg_duration=p.get('avg_duration') + avg_duration=p.get('avg_duration'), + sponsor_id=p.get('sponsor_id'), + source=p.get('source') or 'local', )) + # Cache sponsor_id -> tags for matcher eligibility checks. + try: + tags_map = self.db.get_sponsor_tags_map() + self._sponsor_tags = {sid: set(tags) for sid, tags in tags_map.items()} + except Exception as e: + logger.warning(f"Could not load sponsor tags map: {e}") + self._sponsor_tags = {} + # Build TF-IDF vectors for pattern templates if self._patterns: templates = [p.text_template for p in self._patterns if p.text_template] @@ -266,7 +281,8 @@ def find_matches( self, segments: List[Dict], podcast_id: str = None, - network_id: str = None + network_id: str = None, + podcast_tags: Optional[set] = None, ) -> List[TextMatch]: """ Search transcript segments for known ad patterns. @@ -275,6 +291,10 @@ def find_matches( segments: List of transcript segments with 'start', 'end', 'text' podcast_id: Optional podcast ID for scope filtering network_id: Optional network ID for scope filtering + podcast_tags: Optional set of tag strings for this podcast. + Community patterns are filtered out when their sponsor tags + share no overlap with the podcast tags (unless the sponsor + or podcast has no tags, or the sponsor carries 'universal'). 
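+                Example: podcast_tags={'comedy'} keeps a community sponsor
+                tagged {'comedy', 'dtc'} and drops one tagged only {'finance'}.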
Returns: List of TextMatch objects for found ads @@ -298,9 +318,9 @@ def find_matches( if len(full_text.strip()) < MIN_TEXT_LENGTH: return [] - # Filter patterns by scope + # Filter patterns by scope (+ tag eligibility for community patterns) applicable_patterns = self._filter_patterns_by_scope( - podcast_id, network_id + podcast_id, network_id, podcast_tags ) if not applicable_patterns: @@ -333,28 +353,54 @@ def find_matches( def _filter_patterns_by_scope( self, podcast_id: str = None, - network_id: str = None + network_id: str = None, + podcast_tags: Optional[set] = None, ) -> List[AdPattern]: - """Filter patterns by scope hierarchy. - - Global patterns apply to all podcasts. - Network patterns apply to podcasts in the same network. - Podcast patterns apply only to the specific podcast. + """Filter patterns by scope hierarchy and (for community) tag eligibility. + + Scope rules: + - Global patterns apply to all podcasts. + - Network patterns apply to podcasts in the same network. + - Podcast patterns apply only to the specific podcast. + + Tag eligibility (community patterns only): + - Sponsor with 'universal' tag matches everything. + - Overlap between sponsor tags and podcast tags matches. + - Either side empty -> match (fallback). + Local and imported patterns bypass the tag check entirely. """ - applicable = [] + applicable: List[AdPattern] = [] + podcast_tag_set = set(podcast_tags) if podcast_tags else set() for pattern in self._patterns: + # Scope gate if pattern.scope == 'global': - # Global patterns always apply - applicable.append(pattern) + pass elif pattern.scope == 'network': - # Network patterns require matching network_id - if network_id and pattern.network_id == network_id: - applicable.append(pattern) + if not (network_id and pattern.network_id == network_id): + continue elif pattern.scope == 'podcast': - # Podcast patterns require matching podcast_id - if podcast_id and pattern.podcast_id == podcast_id: + if not (podcast_id and pattern.podcast_id == podcast_id): + continue + else: + continue + + # Tag eligibility (community patterns only) + if pattern.source == 'community': + sponsor_tags = self._sponsor_tags.get(pattern.sponsor_id, set()) + if UNIVERSAL_TAG in sponsor_tags: applicable.append(pattern) + continue + if not sponsor_tags or not podcast_tag_set: + applicable.append(pattern) + continue + if sponsor_tags & podcast_tag_set: + applicable.append(pattern) + continue + # No overlap -> drop this community pattern + continue + + applicable.append(pattern) return applicable @@ -882,8 +928,8 @@ def create_pattern_from_ad( pattern_id = self.db.create_ad_pattern( scope=scope, text_template=ad_text, - intro_variants=json.dumps([intro]) if intro else "[]", - outro_variants=json.dumps([outro]) if outro else "[]", + intro_variants=[intro] if intro else [], + outro_variants=[outro] if outro else [], sponsor_id=sponsor_id, podcast_id=podcast_id, network_id=network_id, @@ -1025,8 +1071,8 @@ def split_pattern(self, pattern_id: int) -> List[int]: new_id = self.db.create_ad_pattern( scope=pattern.get('scope', 'podcast'), text_template=segment, - intro_variants=json.dumps([intro]) if intro else "[]", - outro_variants=json.dumps([outro]) if outro else "[]", + intro_variants=[intro] if intro else [], + outro_variants=[outro] if outro else [], sponsor_id=split_sponsor_id, podcast_id=pattern.get('podcast_id'), network_id=pattern.get('network_id'), diff --git a/src/tools/__init__.py b/src/tools/__init__.py new file mode 100644 index 00000000..a2cb5871 --- /dev/null +++ 
b/src/tools/__init__.py
@@ -0,0 +1,13 @@
+"""Standalone CLI tools that import from the app's src/ tree.
+
+Importing this package also wires `src/` onto sys.path so the scripts work
+when invoked as `python -m src.tools.X` (workflow style). Direct invocation
+as `python src/tools/X.py` (manual) never imports this package, so each
+script also carries its own small defensive sys.path bootstrap.
+"""
+import sys
+from pathlib import Path
+
+_REPO_SRC = Path(__file__).resolve().parents[1]
+if str(_REPO_SRC) not in sys.path:
+    sys.path.insert(0, str(_REPO_SRC))
diff --git a/src/tools/community_pattern_validator.py b/src/tools/community_pattern_validator.py
new file mode 100644
index 00000000..f569694b
--- /dev/null
+++ b/src/tools/community_pattern_validator.py
@@ -0,0 +1,476 @@
+"""Community pattern submission validator.
+
+Runs schema, tag, sponsor, quality, and dedupe checks against one or more
+JSON pattern files. Same code is used by:
+
+  - CI (`python -m src.tools.community_pattern_validator --pr-files A.json B.json`)
+  - The `POST /patterns/import` endpoint for community-format imports
+
+Outputs a structured ValidationResult per file plus a single combined
+Markdown comment for posting back to the PR. CLI exits non-zero when any
+file fails a hard check (rejected) so the GitHub Action sets a red status.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from dataclasses import dataclass, field
+from difflib import SequenceMatcher
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+# Defensive sys.path bootstrap so direct `python path/to/script.py` invocation
+# works as well as `python -m src.tools.X` (the workflow-style invocation).
+_REPO_SRC = Path(__file__).resolve().parents[1]
+if str(_REPO_SRC) not in sys.path:
+    sys.path.insert(0, str(_REPO_SRC))
+
+from community_export import find_foreign_sponsors  # noqa: E402
+from utils.community_tags import (  # noqa: E402
+    BUNDLE_FORMAT,
+    CANONICAL_DAYS,
+    CANONICAL_MONTHS,
+    CANONICAL_RELATIVE_TIME,
+    CANONICAL_STOPWORDS,
+    DATE_REGEX,
+    YEAR_REGEX,
+    iter_bundle_patterns,
+    sponsor_seed,
+    valid_tags,
+)
+
+logger = logging.getLogger('community_pattern_validator')
+
+DUPLICATE_THRESHOLD = 0.95
+VARIANT_THRESHOLD = 0.75
+
+REQUIRED_FIELDS = ('community_id', 'text_template', 'sponsor', 'version', 'submitted_at')
+
+
+@dataclass
+class ValidationResult:
+    path: str
+    status: str = 'pass'  # 'pass' | 'reject' | 'warn'
+    sponsor: Optional[str] = None
+    sponsor_match: str = 'unknown'  # 'exact' | 'alias' | 'fuzzy' | 'unknown'
+    classification: str = 'distinct'  # 'duplicate' | 'variant' | 'distinct'
+    similar_to: Optional[str] = None  # community_id
+    similarity: Optional[float] = None
+    diff_snippet: Optional[str] = None
+    errors: List[str] = field(default_factory=list)
+    warnings: List[str] = field(default_factory=list)
+
+
+def canonicalize_for_dedupe(text: str) -> str:
+    """Return the canonical form of `text` used for dedupe comparison only.
+
+    Mirrors plan section 8.5: lowercase -> strip date/year tokens (BEFORE
+    punctuation removal, otherwise '12/31' becomes '12 31' and slips past)
+    -> punctuation->space -> collapse whitespace -> strip stopwords / day
+    / month / relative-time tokens -> trim. Original text is not modified.
+    """
+    if not text:
+        return ''
+    s = text.lower()
+    # Date-format strings and 4-digit years must be removed BEFORE punctuation
+    # → space, otherwise '12/31' becomes '12 31' and slips through the date regex.
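+    # e.g. in 'the show is brought to you by Squarespace! Visit by 12/31.'
+    # the date, 'the', and 'by' all drop while 'show' and 'squarespace' survive.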
+ s = DATE_REGEX.sub(' ', s) + s = YEAR_REGEX.sub(' ', s) + s = re.sub(r'[^a-z0-9\s]+', ' ', s) + s = re.sub(r'\s+', ' ', s).strip() + if not s: + return '' + tokens = s.split(' ') + drop = CANONICAL_STOPWORDS | CANONICAL_DAYS | CANONICAL_MONTHS | CANONICAL_RELATIVE_TIME + kept = [t for t in tokens if t and t not in drop] + return ' '.join(kept) + + +def similarity(a: str, b: str) -> float: + """Compute similarity ratio between two canonicalized strings.""" + if not a or not b: + return 0.0 + return SequenceMatcher(None, a, b).ratio() + + +def _classify_sponsor(sponsor_name: str, seed: List[Dict[str, Any]]) -> str: + if not sponsor_name: + return 'unknown' + lname = sponsor_name.lower() + for s in seed: + if s['name'].lower() == lname: + return 'exact' + for alias in s.get('aliases') or []: + if alias.lower() == lname: + return 'alias' + for s in seed: + nm = s['name'].lower() + if nm and (nm in lname or lname in nm): + return 'fuzzy' + return 'unknown' + + +def _schema_errors(doc: Dict[str, Any]) -> List[str]: + errs: List[str] = [] + if not isinstance(doc, dict): + return ['payload must be a JSON object'] + for k in REQUIRED_FIELDS: + if not doc.get(k): + errs.append(f'missing required field: {k}') + if doc.get('version') is not None: + try: + int(doc['version']) + except (TypeError, ValueError): + errs.append('version must be an integer') + if doc.get('text_template') and not isinstance(doc['text_template'], str): + errs.append('text_template must be a string') + if doc.get('intro_variants') is not None and not isinstance(doc['intro_variants'], list): + errs.append('intro_variants must be a list') + if doc.get('outro_variants') is not None and not isinstance(doc['outro_variants'], list): + errs.append('outro_variants must be a list') + if doc.get('sponsor_tags') is not None and not isinstance(doc['sponsor_tags'], list): + errs.append('sponsor_tags must be a list') + return errs + + +def _tag_errors(doc: Dict[str, Any]) -> List[str]: + vt = valid_tags() + bad = [t for t in (doc.get('sponsor_tags') or []) if t not in vt] + return [f'unknown tag: {t}' for t in bad] + + +def _quality_errors(doc: Dict[str, Any]) -> List[str]: + """Defense-in-depth: re-run the same quality gates from the export side.""" + errs: List[str] = [] + text = doc.get('text_template') or '' + if len(text) < 50: + errs.append(f'text_template too short ({len(text)} < 50)') + if len(text) > 3500: + errs.append(f'text_template too long ({len(text)} > 3500)') + dur = doc.get('avg_duration') + if isinstance(dur, (int, float)) and dur > 120: + errs.append(f'avg_duration too long ({dur:.0f}s > 120s)') + + sponsor = (doc.get('sponsor') or '').lower() + text_l = text.lower() + aliases = [a.lower() for a in (doc.get('sponsor_aliases') or [])] + candidates = [sponsor] + aliases + if not any(c and re.search(rf'\b{re.escape(c)}\b', text_l) for c in candidates): + errs.append('sponsor (or any alias) does not appear in text_template') + return errs + + +def _single_pattern_errors(doc: Dict[str, Any], seed: List[Dict[str, Any]]) -> List[str]: + """Reject submissions that stitch multiple ads together. + + Delegates to the shared `find_foreign_sponsors` helper so the import + side and export side agree on the rule. 
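+
+    Example: a Squarespace submission whose text also pitches BetterHelp is
+    rejected as a multi-sponsor block (see the validator unit tests).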
+ """ + text = doc.get('text_template') or '' + declared = (doc.get('sponsor') or '').lower() + declared_aliases = {a.lower() for a in (doc.get('sponsor_aliases') or [])} + declared_lower = {declared, *declared_aliases} - {''} + + foreign = find_foreign_sponsors(text, declared_lower, seed) + if not foreign: + return [] + sample = ', '.join(foreign[:3]) + more = '' if len(foreign) <= 3 else f' (+{len(foreign) - 3} more)' + return [ + 'multi-sponsor block: text mentions other seed sponsors ' + f'({sample}{more}). Each community pattern must describe ONE ad.' + ] + + +def _diff_snippet(incoming: str, existing: str, n: int = 200) -> str: + """Return a short side-by-side diff snippet capped at `n` chars per side.""" + inc = incoming[:n].replace('\n', ' ') + exi = existing[:n].replace('\n', ' ') + return f'incoming: "{inc}"\nexisting: "{exi}"' + + +def dedupe(doc: Dict[str, Any], existing: List[Dict[str, Any]]) -> Tuple[str, Optional[Dict[str, Any]], float]: + """Compare `doc` against `existing` patterns sharing the same sponsor. + + Returns (classification, matched_existing_doc_or_None, best_score). + classification is 'duplicate' (>=95%), 'variant' (75-95%), or 'distinct'. + """ + sponsor = (doc.get('sponsor') or '').lower() + if not sponsor: + return 'distinct', None, 0.0 + incoming_canon = canonicalize_for_dedupe(doc.get('text_template') or '') + if not incoming_canon: + return 'distinct', None, 0.0 + + best_score = 0.0 + best_match: Optional[Dict[str, Any]] = None + for ex in existing: + if (ex.get('sponsor') or '').lower() != sponsor: + continue + ex_canon = canonicalize_for_dedupe(ex.get('text_template') or '') + score = similarity(incoming_canon, ex_canon) + if score > best_score: + best_score = score + best_match = ex + + if best_score >= DUPLICATE_THRESHOLD: + return 'duplicate', best_match, best_score + if best_score >= VARIANT_THRESHOLD: + return 'variant', best_match, best_score + return 'distinct', best_match, best_score + + +def validate_doc( + path: str, + doc: Dict[str, Any], + seed: List[Dict[str, Any]], + existing: List[Dict[str, Any]], +) -> ValidationResult: + """Validate a single pattern doc against the seed list and existing patterns.""" + result = ValidationResult(path=path, sponsor=doc.get('sponsor')) + + schema_errs = _schema_errors(doc) + if schema_errs: + result.errors.extend(schema_errs) + result.status = 'reject' + return result + + tag_errs = _tag_errors(doc) + if tag_errs: + result.errors.extend(tag_errs) + result.status = 'reject' + + quality_errs = _quality_errors(doc) + if quality_errs: + result.errors.extend(quality_errs) + result.status = 'reject' + + single_errs = _single_pattern_errors(doc, seed) + if single_errs: + result.errors.extend(single_errs) + result.status = 'reject' + + result.sponsor_match = _classify_sponsor(doc.get('sponsor') or '', seed) + if result.sponsor_match == 'unknown': + result.warnings.append( + f'sponsor "{doc.get("sponsor")}" not in seed list (triage required)' + ) + + if result.status == 'reject': + return result + + classification, matched, score = dedupe(doc, existing) + result.classification = classification + result.similarity = round(score, 3) if score else None + if matched: + result.similar_to = matched.get('community_id') + if classification == 'duplicate': + result.errors.append( + f'duplicates {matched.get("community_id")} (score={score:.2f})' + ) + result.status = 'reject' + elif classification == 'variant': + result.warnings.append( + f'similar to {matched.get("community_id")} at {score:.0%}; ' + f'consider merging into 
existing intro/outro variants' + ) + result.diff_snippet = _diff_snippet( + doc.get('text_template') or '', + matched.get('text_template') or '', + ) + + if result.warnings and result.status != 'reject': + result.status = 'warn' + return result + + +def _extract_patterns(path: str, raw: Any) -> List[Tuple[str, Dict[str, Any]]]: + """Yield (synthetic_path, pattern_doc) for any payload shape. + + A flat per-pattern file returns ``[(path, raw)]``. A bundle file returns + one entry per pattern in the ``patterns`` array, with ``path#patterns[i]`` + so the PR comment can point at the failing index. + """ + if not isinstance(raw, dict): + return [] + if raw.get('format') == BUNDLE_FORMAT: + return [ + (f'{path}#patterns[{i}]', p) + for i, p in enumerate(iter_bundle_patterns(raw)) + ] + return [(path, raw)] + + +def _load_existing_patterns(community_dir: Path) -> List[Dict[str, Any]]: + """Load every JSON file in patterns/community/ (recursive) as existing patterns. + + Bundle files contribute every pattern in their ``patterns[]`` array. + """ + out: List[Dict[str, Any]] = [] + if not community_dir.exists(): + return out + for p in community_dir.rglob('*.json'): + if p.name == 'index.json': + continue + try: + with p.open('r', encoding='utf-8') as fh: + raw = json.load(fh) + except Exception as e: + logger.warning(f'Could not parse {p}: {e}') + continue + for _, doc in _extract_patterns(str(p), raw): + out.append(doc) + return out + + +def render_markdown_comment(results: List[ValidationResult]) -> str: + """Render a single Markdown comment for the PR summarizing all results. + + Each section links back to the relevant part of `patterns/CONTRIBUTING.md` + so submitters can self-serve on what failed and why. + """ + contributing = '../blob/main/patterns/CONTRIBUTING.md' + quality_link = f'[Quality checks]({contributing}#quality-checks-before-submission)' + dedupe_link = f'[Dedupe]({contributing}#dedupe)' + sponsor_link = ( + '[How to add a sponsor to the seed list]' + '(../blob/main/patterns/README.md#how-to-add-a-sponsor-to-the-seed-list)' + ) + + lines: List[str] = ['## Community pattern validation', ''] + rejected = [r for r in results if r.status == 'reject'] + warned = [r for r in results if r.status == 'warn'] + passed = [r for r in results if r.status == 'pass'] + + if rejected: + lines.append(f'### Rejected ({len(rejected)})') + lines.append(f'See {quality_link} and {dedupe_link} for what each gate enforces.') + lines.append('') + for r in rejected: + lines.append(f'- `{r.path}` (sponsor: {r.sponsor})') + for e in r.errors: + lines.append(f' - {e}') + lines.append('') + if warned: + lines.append(f'### Warnings ({len(warned)})') + lines.append( + f'Variant suggestions ({dedupe_link}) and unknown-sponsor flags ' + f'({sponsor_link}) are advisory — the maintainer decides during review.' + ) + lines.append('') + for r in warned: + lines.append(f'- `{r.path}` (sponsor: {r.sponsor})') + for w in r.warnings: + lines.append(f' - {w}') + if r.diff_snippet: + lines.append(' - Diff:') + lines.append(' ```') + for ln in r.diff_snippet.splitlines(): + lines.append(f' {ln}') + lines.append(' ```') + lines.append('') + if passed: + lines.append(f'### Passed ({len(passed)})') + for r in passed: + lines.append(f'- `{r.path}` (sponsor: {r.sponsor})') + lines.append('') + + if not (rejected or warned): + lines.append('Validation passed. 
Ready for review.') + + lines.append('') + lines.append( + f'_See [`patterns/CONTRIBUTING.md`]({contributing}) for the full ' + f'submission guide._' + ) + return '\n'.join(lines).rstrip() + '\n' + + +def run(pr_files: List[str], comment_output: Optional[str] = None, + status_output: Optional[str] = None) -> int: + """CLI driver. Returns 0 when no rejections, 1 otherwise.""" + seed = sponsor_seed() + repo_root = _REPO_SRC.parent + existing = _load_existing_patterns(repo_root / 'patterns' / 'community') + + # CI checks out the PR branch, so files added in the PR are already on + # disk in patterns/community/. Strip them from the "existing" baseline + # by community_id, otherwise dedupe sees every new file as a duplicate + # of itself with score=1.00. + pr_paths = {Path(p).resolve() for p in pr_files} + pr_community_ids: set = set() + for p in pr_paths: + if not p.exists(): + continue + try: + with p.open('r', encoding='utf-8') as fh: + raw = json.load(fh) + except Exception: + continue + for _, doc in _extract_patterns(str(p), raw): + cid = doc.get('community_id') + if cid: + pr_community_ids.add(cid) + if pr_community_ids: + existing = [e for e in existing if e.get('community_id') not in pr_community_ids] + + results: List[ValidationResult] = [] + for path in pr_files: + p = Path(path) + if not p.exists(): + results.append(ValidationResult( + path=path, status='reject', + errors=[f'file not found: {path}'], + )) + continue + try: + with p.open('r', encoding='utf-8') as fh: + raw = json.load(fh) + except Exception as e: + results.append(ValidationResult( + path=path, status='reject', + errors=[f'JSON parse error: {e}'], + )) + continue + if p.name == 'index.json': + continue + extracted = _extract_patterns(path, raw) + if not extracted: + results.append(ValidationResult( + path=path, status='reject', + errors=['empty submission: no patterns found in file'], + )) + continue + for sub_path, doc in extracted: + results.append(validate_doc(sub_path, doc, seed, existing)) + + markdown = render_markdown_comment(results) + if comment_output: + Path(comment_output).write_text(markdown, encoding='utf-8') + if status_output: + statuses = ','.join(f'{r.path}:{r.status}' for r in results) + Path(status_output).write_text(statuses + '\n', encoding='utf-8') + + rejected = any(r.status == 'reject' for r in results) + print(markdown) + return 1 if rejected else 0 + + +def main(argv: Optional[List[str]] = None) -> int: + parser = argparse.ArgumentParser(description='Validate community pattern submissions.') + parser.add_argument('--pr-files', nargs='+', required=True, + help='One or more JSON file paths from the PR diff.') + parser.add_argument('--comment-output', default=None, + help='Where to write the Markdown comment.') + parser.add_argument('--status-output', default=None, + help='Where to write a one-line status string.') + args = parser.parse_args(argv) + return run(args.pr_files, args.comment_output, args.status_output) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/tools/generate_manifest.py b/src/tools/generate_manifest.py new file mode 100644 index 00000000..06a5e48c --- /dev/null +++ b/src/tools/generate_manifest.py @@ -0,0 +1,124 @@ +"""Regenerate patterns/community/index.json from the per-pattern JSON files. 
+
+Invoked by the `regenerate-manifest` GitHub Action on every push to `main`
+that touches `patterns/community/**.json`, and also runnable by hand for
+local testing or recovery from a workflow failure:
+
+    python -m src.tools.generate_manifest
+
+Pattern files live at `patterns/community/<sponsor>-<community_id>.json`. The
+manifest is a single document that embeds every pattern inline so the
+client fetches everything in one request. `published_at` is bumped to
+the current UTC time; `manifest_version` and `vocabulary_version` are
+constants that bump only when the format changes.
+"""
+from __future__ import annotations
+
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List
+
+# Defensive sys.path bootstrap so `python path/to/script.py` works as well as
+# `python -m src.tools.X` (the workflow-style invocation). When run via -m,
+# tools/__init__.py already did this — the lines below are then a no-op.
+_REPO_SRC = Path(__file__).resolve().parents[1]
+if str(_REPO_SRC) not in sys.path:
+    sys.path.insert(0, str(_REPO_SRC))
+
+from utils.community_tags import (  # noqa: E402, F401
+    BUNDLE_FORMAT,
+    MANIFEST_VERSION,
+    VOCABULARY_VERSION,
+    iter_bundle_patterns,
+)
+
+
+def _community_dir() -> Path:
+    return _REPO_SRC.parent / 'patterns' / 'community'
+
+
+def _flatten_to_patterns(path: Path, data: Any) -> List[Dict[str, Any]]:
+    """Return per-pattern dicts from a JSON payload. Drops entries missing
+    ``community_id`` with a stderr warning so the manifest stays clean."""
+    if not isinstance(data, dict):
+        print(f'WARN: skipping {path.name}: not a JSON object', file=sys.stderr)
+        return []
+    is_bundle = data.get('format') == BUNDLE_FORMAT
+    out: List[Dict[str, Any]] = []
+    for i, p in enumerate(iter_bundle_patterns(data)):
+        if not p.get('community_id'):
+            label = f'{path.name}#patterns[{i}]' if is_bundle else path.name
+            print(f'WARN: {label}: missing community_id', file=sys.stderr)
+            continue
+        out.append(p)
+    return out
+
+
+def _load_pattern_files(directory: Path) -> List[Dict[str, Any]]:
+    """Read every <sponsor>-<community_id>.json in `directory`, excluding index.json.
+
+    Returns the parsed pattern dicts sorted by `submitted_at` so the
+    manifest order is deterministic across regenerations. Bundle files
+    are flattened so each contained pattern becomes its own manifest entry.
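+
+    Example: a bundle file with ``pattern_count: 2`` contributes two separate
+    manifest entries, one per element of its ``patterns[]`` array.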
+ """ + patterns: List[Dict[str, Any]] = [] + for path in sorted(directory.glob('*.json')): + if path.name == 'index.json': + continue + try: + with path.open('r', encoding='utf-8') as fh: + data = json.load(fh) + except (json.JSONDecodeError, OSError) as e: + print(f'WARN: skipping {path.name}: {e}', file=sys.stderr) + continue + patterns.extend(_flatten_to_patterns(path, data)) + patterns.sort(key=lambda d: (d.get('submitted_at') or '', d.get('community_id') or '')) + return patterns + + +def build_manifest(patterns: List[Dict[str, Any]]) -> Dict[str, Any]: + """Build the manifest document from the loaded pattern dicts.""" + entries = [] + for p in patterns: + entries.append({ + 'community_id': p['community_id'], + 'version': int(p.get('version') or 1), + 'data': p, + }) + return { + 'manifest_version': MANIFEST_VERSION, + 'published_at': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), + 'vocabulary_version': VOCABULARY_VERSION, + 'patterns': entries, + } + + +def write_manifest(manifest: Dict[str, Any], path: Path) -> None: + """Write the manifest atomically to `path`, preserving a trailing newline.""" + tmp = path.with_suffix(path.suffix + '.tmp') + with tmp.open('w', encoding='utf-8') as fh: + json.dump(manifest, fh, indent=2) + fh.write('\n') + tmp.replace(path) + + +def main() -> int: + directory = _community_dir() + if not directory.exists(): + print(f'ERROR: {directory} does not exist', file=sys.stderr) + return 1 + patterns = _load_pattern_files(directory) + manifest = build_manifest(patterns) + target = directory / 'index.json' + write_manifest(manifest, target) + print( + f'Wrote {target.relative_to(_REPO_SRC.parent)} ' + f'with {len(patterns)} pattern(s) at {manifest["published_at"]}' + ) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/utils/community_tags.py b/src/utils/community_tags.py new file mode 100644 index 00000000..f9bd14ba --- /dev/null +++ b/src/utils/community_tags.py @@ -0,0 +1,232 @@ +"""Tag vocabulary, sponsor seed, iTunes-category map, PII constants for community patterns.""" +from __future__ import annotations + +import csv +import json +import os +import re +from functools import lru_cache +from typing import Dict, FrozenSet, List, Tuple + +_SEED_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'seed_data') + +UNIVERSAL_TAG = 'universal' + +# Single source of truth for the upstream MinusPod repo identity. Used by +# both the export pipeline's prefilled-PR URL builder and the sync job's +# manifest fetch URL. +GITHUB_REPO = 'ttlequals0/MinusPod' +COMMUNITY_MANIFEST_URL = ( + f'https://raw.githubusercontent.com/{GITHUB_REPO}/main/patterns/community/index.json' +) + +# Schema versions. MANIFEST_VERSION bumps when the manifest envelope shape +# changes; VOCABULARY_VERSION bumps when the tag list is added to / removed +# from. Both ship with the app image; this module is the single owner — the +# manifest generator and the sync job both import these constants, not their +# own copies. +MANIFEST_VERSION = 1 +VOCABULARY_VERSION = 1 + +# Community submission bundle: format string + version, used by the export +# builder (community_export.build_bundle), the PR-side validator, and the +# manifest generator. Single source of truth so the three modules can't +# drift on the spelling. +BUNDLE_FORMAT = 'minuspod-community-submission' +BUNDLE_VERSION = 1 + + +def iter_bundle_patterns(raw): + """Yield each pattern dict inside a payload, regardless of shape. + + A flat per-pattern file yields ``raw`` itself. 
A bundle file (``raw['format'] + == BUNDLE_FORMAT``) yields each entry in ``raw['patterns']``. Non-dict + entries are skipped. Callers add their own indexing or filtering. + """ + if not isinstance(raw, dict): + return + if raw.get('format') == BUNDLE_FORMAT: + for p in raw.get('patterns') or []: + if isinstance(p, dict): + yield p + return + yield raw + +# ad_patterns.source values. Centralized so API/DB/UI agree on spelling. +PATTERN_SOURCE_LOCAL = 'local' +PATTERN_SOURCE_COMMUNITY = 'community' +PATTERN_SOURCE_IMPORTED = 'imported' +PATTERN_SOURCES: FrozenSet[str] = frozenset({ + PATTERN_SOURCE_LOCAL, PATTERN_SOURCE_COMMUNITY, PATTERN_SOURCE_IMPORTED, +}) + + +@lru_cache(maxsize=1) +def _vocabulary_tags() -> FrozenSet[str]: + """Tags from src/seed_data/tag_vocabulary.csv (no 'universal' — see VALID_TAGS).""" + path = os.path.join(_SEED_DIR, 'tag_vocabulary.csv') + tags = set() + with open(path, 'r', encoding='utf-8') as fh: + reader = csv.DictReader(fh) + for row in reader: + tag = (row.get('tag') or '').strip() + if tag: + tags.add(tag) + return frozenset(tags) + + +@lru_cache(maxsize=1) +def valid_tags() -> FrozenSet[str]: + """All accepted tags: vocabulary CSV plus the special 'universal' opt-in.""" + return frozenset(_vocabulary_tags() | {UNIVERSAL_TAG}) + + +@lru_cache(maxsize=1) +def vocabulary_payload() -> Dict[str, object]: + """Categorized vocabulary view used by patterns/vocabulary.json AND the + /api/v1/tags/vocabulary endpoint. Cached because the source CSV ships + with the app image and never changes at runtime. + """ + path = os.path.join(_SEED_DIR, 'tag_vocabulary.csv') + genres: List[Dict[str, str]] = [] + industries: List[Dict[str, str]] = [] + with open(path, 'r', encoding='utf-8') as fh: + reader = csv.DictReader(fh) + for row in reader: + entry = {'tag': row['tag'], 'description': row['description']} + if row['category'] == 'podcast_genre': + genres.append(entry) + elif row['category'] == 'sponsor_industry': + industries.append(entry) + return { + 'vocabulary_version': VOCABULARY_VERSION, + 'all_tags': sorted(valid_tags()), + 'podcast_genres': genres, + 'sponsor_industries': industries, + 'special_tags': [{ + 'tag': UNIVERSAL_TAG, + 'description': 'Sponsor advertises broadly across all podcast genres.', + }], + } + + +@lru_cache(maxsize=1) +def itunes_category_map() -> Dict[str, str]: + """iTunes category string -> vocabulary tag (case-insensitive lookup via .lower()). + + Keys in the file are the canonical iTunes labels (e.g. 'Health & Fitness'). + We expose a lowercased view for lookup. + """ + path = os.path.join(_SEED_DIR, 'itunes_category_map.json') + with open(path, 'r', encoding='utf-8') as fh: + raw = json.load(fh) + return {k.lower(): v for k, v in raw.items() if not k.startswith('_')} + + +def map_itunes_category(category: str) -> str | None: + """Look up a single iTunes category string and return the vocab tag, or None.""" + if not category: + return None + return itunes_category_map().get(category.strip().lower()) + + +@lru_cache(maxsize=1) +def sponsor_seed() -> List[Dict[str, object]]: + """List of {name, aliases: List[str], tags: List[str]} from sponsors_final.csv. + + Names and tags are preserved verbatim. Aliases and tags are pipe-delimited in the CSV. 
+ """ + path = os.path.join(_SEED_DIR, 'sponsors_final.csv') + rows: List[Dict[str, object]] = [] + with open(path, 'r', encoding='utf-8') as fh: + reader = csv.DictReader(fh) + for row in reader: + name = (row.get('name') or '').strip() + if not name: + continue + aliases_raw = (row.get('aliases') or '').strip() + tags_raw = (row.get('tags') or '').strip() + aliases = [a.strip() for a in aliases_raw.split('|') if a.strip()] if aliases_raw else [] + tags = [t.strip() for t in tags_raw.split('|') if t.strip()] if tags_raw else [] + rows.append({'name': name, 'aliases': aliases, 'tags': tags}) + return rows + + +# Consumer email domains — strip on export (best-effort, tunable). +CONSUMER_EMAIL_DOMAINS: FrozenSet[str] = frozenset({ + 'gmail.com', 'yahoo.com', 'aol.com', 'hotmail.com', 'outlook.com', + 'icloud.com', 'me.com', 'mac.com', 'protonmail.com', 'proton.me', + 'mail.com', 'gmx.com', 'gmx.net', 'yandex.com', 'yandex.ru', + 'qq.com', '163.com', 'live.com', 'msn.com', 'hey.com', 'fastmail.com', + 'tutanota.com', +}) + +# Toll-free prefixes — phone numbers using these are KEPT in export text. +# Everything else matched by the phone regex is stripped. +TOLLFREE_PREFIXES_NANP: Tuple[str, ...] = ('800', '833', '844', '855', '866', '877', '888') +TOLLFREE_PREFIXES_UK: Tuple[str, ...] = ('0800', '0808') +TOLLFREE_PREFIX_AU: str = '1800' +TOLLFREE_PREFIX_UIFN: str = '+800' + + +# Conservative phone regex: must have at least one dash/paren/dot/space separator +# or a country code prefix. Bare 7-10 digit runs (likely codes, dates, etc.) are +# left alone. Captures the leading dialable prefix to classify toll-free. +PHONE_REGEX = re.compile( + r'''(?x) + (? bool: + """Return True if the captured phone string starts with a toll-free prefix.""" + digits_only = re.sub(r'[^\d+]', '', phone_match) + # Strip optional leading +1 for NANP + if digits_only.startswith('+1'): + digits_only = digits_only[2:] + elif digits_only.startswith('1') and len(digits_only) > 10: + digits_only = digits_only[1:] + if digits_only.startswith(TOLLFREE_PREFIXES_NANP): + return True + if digits_only.startswith(TOLLFREE_PREFIXES_UK): + return True + if digits_only.startswith(TOLLFREE_PREFIX_AU): + return True + if phone_match.startswith(TOLLFREE_PREFIX_UIFN): + return True + return False diff --git a/src/utils/cron.py b/src/utils/cron.py new file mode 100644 index 00000000..47c1574b --- /dev/null +++ b/src/utils/cron.py @@ -0,0 +1,133 @@ +"""Minimal 5-field cron evaluator for MinusPod's community-sync schedule. + +Supports the common cron syntax users will write — numbers, '*', ranges, +lists, and step values — for the five standard fields: + + minute (0-59) hour (0-23) day-of-month (1-31) month (1-12) dow (0-6 sun=0) + +Designed to answer two questions: + + * `is_valid_expression(expr)` -> bool — for input validation in settings. + * `is_due(expr, last_run, now)` -> bool — has a fire time elapsed since + `last_run`? Used by the background sync job that ticks every 15 min. + +Day-of-month and day-of-week follow vixie-cron's OR semantics: if both +are restricted, a fire happens when EITHER matches; if only one is +restricted, only that one is consulted. Names for months and days are +intentionally NOT supported to keep parsing simple. 
+""" +from __future__ import annotations + +from datetime import datetime, timedelta +from typing import Iterable, Set, Tuple + +_FIELD_RANGES = ( + (0, 59), # minute + (0, 23), # hour + (1, 31), # day of month + (1, 12), # month + (0, 6), # day of week (Sunday = 0) +) + + +def _parse_field(spec: str, lo: int, hi: int) -> Set[int]: + spec = spec.strip() + if not spec: + raise ValueError('empty cron field') + out: Set[int] = set() + for part in spec.split(','): + step = 1 + if '/' in part: + base, step_str = part.split('/', 1) + step = int(step_str) + if step < 1: + raise ValueError(f'step must be >= 1, got {step_str}') + else: + base = part + if base == '*' or base == '': + start, end = lo, hi + elif '-' in base: + a, b = base.split('-', 1) + start = int(a) + end = int(b) + if start < lo or end > hi or start > end: + raise ValueError(f'out-of-range range: {base}') + else: + v = int(base) + if v < lo or v > hi: + raise ValueError(f'out-of-range value: {v}') + start = end = v + for n in range(start, end + 1, step): + out.add(n) + if not out: + raise ValueError(f'no values for field "{spec}"') + return out + + +def parse_expression(expr: str) -> Tuple[Set[int], Set[int], Set[int], Set[int], Set[int]]: + """Parse a 5-field cron expression into per-field sets. Raises ValueError.""" + parts = expr.strip().split() + if len(parts) != 5: + raise ValueError(f'expected 5 fields, got {len(parts)}') + sets = tuple( + _parse_field(parts[i], *_FIELD_RANGES[i]) for i in range(5) + ) + return sets # type: ignore[return-value] + + +def is_valid_expression(expr: str) -> bool: + """True if `expr` is a parseable 5-field cron expression.""" + try: + parse_expression(expr) + return True + except (ValueError, TypeError): + return False + + +def _matches(dt: datetime, sets: Tuple[Set[int], Set[int], Set[int], Set[int], Set[int]]) -> bool: + minute_s, hour_s, dom_s, month_s, dow_s = sets + if dt.minute not in minute_s: + return False + if dt.hour not in hour_s: + return False + if dt.month not in month_s: + return False + dow = dt.weekday() # 0=Mon..6=Sun + cron_dow = (dow + 1) % 7 # cron: 0=Sun..6=Sat + # Vixie semantics: if both DOM and DOW are constrained (not full sets), + # match on EITHER. If only one is constrained, use only that one. + dom_full = dom_s == set(range(1, 32)) + dow_full = dow_s == set(range(0, 7)) + if dom_full and dow_full: + return True + if dom_full: + return cron_dow in dow_s + if dow_full: + return dt.day in dom_s + return dt.day in dom_s or cron_dow in dow_s + + +def next_fire(expr: str, after: datetime, *, max_iters: int = 60 * 24 * 366) -> datetime: + """Return the first datetime > `after` that matches `expr`. + + Resolution is minute. `after` may be naive or tz-aware; the return value + matches whichever was passed in. Raises ValueError if no fire within + `max_iters` minutes. + """ + sets = parse_expression(expr) + # Drop to minute resolution and add one minute so `after` itself isn't a hit. 
+ candidate = after.replace(second=0, microsecond=0) + timedelta(minutes=1) + for _ in range(max_iters): + if _matches(candidate, sets): + return candidate + candidate += timedelta(minutes=1) + raise ValueError(f'No match within {max_iters} minutes for expression: {expr}') + + +def is_due(expr: str, last_run: datetime, now: datetime) -> bool: + """True if the next scheduled fire after `last_run` has elapsed by `now`.""" + try: + nxt = next_fire(expr, last_run) + except ValueError: + return False + return nxt <= now diff --git a/tests/integration/test_community_pattern_flow.py b/tests/integration/test_community_pattern_flow.py new file mode 100644 index 00000000..7c030d33 --- /dev/null +++ b/tests/integration/test_community_pattern_flow.py @@ -0,0 +1,112 @@ +"""End-to-end integration: community sync + protect + reseed + tag eligibility.""" +import os +import sys + +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from database import Database # noqa: E402 +from community_sync import apply_manifest # noqa: E402 +from text_pattern_matcher import TextPatternMatcher # noqa: E402 + + +@pytest.fixture +def db(tmp_path): + Database._instance = None # type: ignore[attr-defined] + inst = Database(data_dir=str(tmp_path)) + yield inst + Database._instance = None # type: ignore[attr-defined] + + +SEGMENTS = [ + {'start': 0.0, 'end': 5.0, 'text': 'Welcome to the show'}, + {'start': 5.0, 'end': 15.0, 'text': 'This episode is brought to you by Squarespace'}, + {'start': 15.0, 'end': 25.0, 'text': 'Visit Squarespace dot com slash show for a free trial'}, + {'start': 25.0, 'end': 35.0, 'text': 'Use code SHOW for ten percent off your first website'}, + {'start': 35.0, 'end': 40.0, 'text': 'Back to the episode'}, +] + + +def _manifest(community_id='sq-1', version=1): + return { + 'manifest_version': 1, + 'patterns': [{ + 'community_id': community_id, + 'version': version, + 'data': { + 'community_id': community_id, + 'version': version, + 'scope': 'global', + 'sponsor': 'Squarespace', + 'text_template': ( + 'This episode is brought to you by Squarespace. ' + 'Visit Squarespace dot com slash show for a free trial. ' + 'Use code SHOW for ten percent off your first website.' + ), + 'intro_variants': ['This episode is brought to you by Squarespace'], + 'outro_variants': ['Use code SHOW'], + }, + }], + } + + +def test_full_sync_to_match_cycle(db): + # 1. Apply the manifest -> 1 community pattern inserted, source='community'. + summary = apply_manifest(db, _manifest()) + assert summary['inserted'] == 1 + + rows = db.get_patterns_by_source('community', active_only=True) + assert len(rows) == 1 + cp = rows[0] + assert cp['source'] == 'community' + assert cp['version'] == 1 + assert cp['protected_from_sync'] == 0 + + # 2. Sponsor Squarespace was seeded by migration with 'universal' tag, + # so eligibility is unconditional. + matcher = TextPatternMatcher(db=db) + matcher._load_patterns() + out = matcher._filter_patterns_by_scope(podcast_tags={'kids_family'}) + assert any(p.id == cp['id'] for p in out), 'universal sponsor should always be eligible' + + +def test_resync_skips_protected_pattern(db): + apply_manifest(db, _manifest(version=1)) + pid = db.get_patterns_by_source('community', active_only=False)[0]['id'] + + # User edits / protects the row (the API endpoint sets this; we set it directly). + db.set_pattern_protected(pid, True) + + # Higher-version manifest -> should NOT update the protected row. 
+ summary = apply_manifest(db, _manifest(version=2)) + row = db.get_ad_pattern_by_id(pid) + assert row['version'] == 1 + assert summary['updated'] == 0 + assert summary['skipped'] >= 1 + + +def test_sponsor_reseed_preserves_pattern_fk(db): + # The seed migration ran during Database init; patterns referencing + # sponsor_id from the migration should remain valid. + squarespace = db.get_known_sponsor_by_name('Squarespace') + assert squarespace is not None + pid = db.create_ad_pattern( + scope='podcast', + podcast_id='test-podcast', + text_template='Squarespace pattern body long enough for tests to be considered valid for storage today', + sponsor_id=squarespace['id'], + source='local', + ) + # Re-trigger the schema migrations explicitly to ensure idempotence. + conn = db.get_connection() + # Force-rerun by clearing the revision stamp + conn.execute("DELETE FROM settings WHERE key = 'sponsor_seed_revision'") + conn.commit() + db._reseed_known_sponsors(conn) + + # The pattern's sponsor_id still resolves to a valid sponsor row. + pattern = db.get_ad_pattern_by_id(pid) + sponsor = db.get_known_sponsor_by_id(pattern['sponsor_id']) + assert sponsor is not None + assert sponsor['name'] == 'Squarespace' diff --git a/tests/unit/test_community_export.py b/tests/unit/test_community_export.py new file mode 100644 index 00000000..a9534bc0 --- /dev/null +++ b/tests/unit/test_community_export.py @@ -0,0 +1,221 @@ +"""Tests for the community export pipeline (src/community_export.py).""" +import json +import os +import sys +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from community_export import ( # noqa: E402 + BUNDLE_FORMAT, + ExportError, + URL_LENGTH_LIMIT_BYTES, + build_bundle, + build_export_payload, + build_pr_url, + strip_pii, +) + + +def _sponsor(name='Squarespace', aliases=None, tags=None, sponsor_id=1): + return { + 'id': sponsor_id, + 'name': name, + 'aliases': json.dumps(aliases or []), + 'tags': json.dumps(tags or ['tech', 'saas', 'universal']), + 'is_active': 1, + } + + +def _pattern(text=None, sponsor_id=1, **overrides): + base = dict( + id=42, + text_template=( + text + or 'Go to Squarespace dot com slash show to start your free trial. ' + 'Save ten percent on your first website with code SHOW. ' + 'Squarespace gives you the tools to launch any idea.' 
+ ), + intro_variants=json.dumps(['Go to Squarespace dot com slash show to start your free trial.']), + outro_variants=json.dumps([]), + scope='global', + sponsor_id=sponsor_id, + confirmation_count=3, + false_positive_count=0, + avg_duration=30.0, + source='local', + ) + base.update(overrides) + return base + + +def test_export_happy_path(): + p = _pattern() + s = _sponsor() + payload = build_export_payload(p, [s]) + assert payload['sponsor'] == 'Squarespace' + assert payload['version'] == 1 + assert payload['sponsor_match'] == 'exact' + assert 'community_id' in payload + assert payload['sponsor_tags'] == ['tech', 'saas', 'universal'] + + +def test_export_rejects_short_text(): + p = _pattern(text='Squarespace dot com slash code') + with pytest.raises(ExportError) as excinfo: + build_export_payload(p, [_sponsor()]) + assert any('too short' in r for r in excinfo.value.reasons) + + +def test_export_rejects_low_confirmation(): + p = _pattern(confirmation_count=0) + with pytest.raises(ExportError) as excinfo: + build_export_payload(p, [_sponsor()]) + assert any('confirmation_count' in r for r in excinfo.value.reasons) + + +def test_export_rejects_when_sponsor_not_in_text(): + p = _pattern(text='Some long ad text that does not name the brand at all ' + 'but is long enough to pass the length check and exceeds the minimum ' + 'fifty characters that the gate requires. Visit example dot com for more.') + with pytest.raises(ExportError) as excinfo: + build_export_payload(p, [_sponsor()]) + assert any('sponsor name' in r for r in excinfo.value.reasons) + + +def test_export_rejects_foreign_sponsor_in_text(): + base_text = ( + 'Today\'s episode is brought to you by Squarespace. Go to Squarespace dot com ' + 'slash show. But also try BetterHelp for your mental health needs.' + ) + p = _pattern(text=base_text) + sponsors = [ + _sponsor(), + { + 'id': 2, + 'name': 'BetterHelp', + 'aliases': '[]', + 'tags': json.dumps(['mental_health', 'universal']), + 'is_active': 1, + }, + ] + with pytest.raises(ExportError) as excinfo: + build_export_payload(p, sponsors) + assert any('foreign sponsor' in r for r in excinfo.value.reasons) + + +def test_export_strips_consumer_emails(): + text = ( + 'Squarespace ad text long enough. Reach out at user@gmail.com ' + 'or contact us at sales@business.example for partnerships. Visit Squarespace today.' + ) + out = strip_pii(text) + assert '[email]' in out + assert 'user@gmail.com' not in out + assert 'sales@business.example' in out # business kept + + +def test_export_strips_non_tollfree_phones_keeps_tollfree(): + text = 'Squarespace. Call 1-800-555-1234 or (212) 555-1234 for help.' 
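+    # 800 is in TOLLFREE_PREFIXES_NANP, so that number is kept verbatim;
+    # the (212) number is not toll-free and is replaced with a placeholder.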
+ out = strip_pii(text) + assert '1-800-555-1234' in out + assert '(212) 555-1234' not in out + assert '[phone]' in out + + +def test_pr_url_fits_for_typical_pattern(): + p = _pattern() + s = _sponsor() + payload = build_export_payload(p, [s]) + url, filename, too_large = build_pr_url(payload) + assert filename.startswith('squarespace-') + assert filename.endswith('.json') + assert url.startswith('https://github.com/ttlequals0/MinusPod/new/main/patterns/community') + assert too_large is False + + +class _FakeDB: + """Tiny stand-in for the DB facade build_bundle calls.""" + + def __init__(self, patterns_by_id, sponsors): + self._patterns = patterns_by_id + self._sponsors = sponsors + + def get_ad_pattern_by_id(self, pid): + return self._patterns.get(pid) + + def get_ad_patterns_by_ids(self, ids): + return {pid: self._patterns[pid] for pid in ids if pid in self._patterns} + + def get_known_sponsors(self, active_only=False): + if active_only: + return [s for s in self._sponsors if s.get('is_active')] + return list(self._sponsors) + + +def test_build_bundle_groups_ready_and_rejected(): + good = _pattern() + bad_short = _pattern(text='too short', id=43) + db = _FakeDB( + patterns_by_id={42: good, 43: bad_short}, + sponsors=[_sponsor()], + ) + bundle, rejected = build_bundle([42, 43, 999], db) + assert bundle['format'] == BUNDLE_FORMAT + assert bundle['pattern_count'] == 1 + assert len(bundle['patterns']) == 1 + assert bundle['patterns'][0]['sponsor'] == 'Squarespace' + assert bundle['patterns'][0]['community_id'] + rejected_ids = {r['id'] for r in rejected} + assert 43 in rejected_ids + assert 999 in rejected_ids + not_found = next(r for r in rejected if r['id'] == 999) + assert 'not found' in not_found['reasons'][0] + + +def test_build_export_payload_repairs_double_encoded_variants(): + """Patterns created before 2.4.6 had intro/outro_variants stored as + a double-JSON-encoded string. The export pipeline must decode through + that and emit a clean list[str], not a list of single characters.""" + raw_text = 'Go to Squarespace dot com slash show to start your free trial. Save ten percent on your first website with code SHOW. Squarespace gives you the tools to launch any idea.' + intros = ['Intro variant one for Squarespace ad.'] + outros = ['Outro variant for Squarespace ad here.'] + pattern = _pattern( + text=raw_text, + # Simulate the production bug: a JSON-encoded string of a + # JSON-encoded list. `json.loads` once returns a string; the + # defensive helper has to decode again. 
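+        # json.dumps(json.dumps(['a'])) == '"[\\"a\\"]"' — a JSON string,
+        # not a list, until it is decoded twice.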
+ intro_variants=json.dumps(json.dumps(intros)), + outro_variants=json.dumps(json.dumps(outros)), + ) + payload = build_export_payload(pattern, [_sponsor()]) + assert payload['intro_variants'] == intros + assert payload['outro_variants'] == outros + + +def test_build_export_payload_passes_single_encoded_through(): + """Correctly-encoded patterns must still produce list[str], idempotent.""" + intros = ['Intro variant one for Squarespace ad.'] + pattern = _pattern(intro_variants=json.dumps(intros), outro_variants='[]') + payload = build_export_payload(pattern, [_sponsor()]) + assert payload['intro_variants'] == intros + assert payload['outro_variants'] == [] + + +def test_build_bundle_rejects_community_source(): + pattern = _pattern(source='community') + db = _FakeDB(patterns_by_id={42: pattern}, sponsors=[_sponsor()]) + bundle, rejected = build_bundle([42], db) + assert bundle['pattern_count'] == 0 + assert rejected[0]['id'] == 42 + assert 'only \'local\' can be submitted' in rejected[0]['reasons'][0] + + +def test_pr_url_falls_back_when_too_large(): + long_text = 'Squarespace ' + ('x ' * 1700) + p = _pattern(text=long_text) + s = _sponsor() + payload = build_export_payload(p, [s]) + url, _, too_large = build_pr_url(payload) + # Force the size check + assert (len(url.encode('utf-8')) > URL_LENGTH_LIMIT_BYTES) == too_large diff --git a/tests/unit/test_community_pattern_validator.py b/tests/unit/test_community_pattern_validator.py new file mode 100644 index 00000000..f46a4df2 --- /dev/null +++ b/tests/unit/test_community_pattern_validator.py @@ -0,0 +1,275 @@ +"""Tests for the community_pattern_validator (canonicalization + dedupe).""" +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from tools.community_pattern_validator import ( # noqa: E402 + canonicalize_for_dedupe, + dedupe, + validate_doc, +) +from utils.community_tags import BUNDLE_FORMAT, sponsor_seed # noqa: E402 + + +def test_canonicalize_strips_stopwords_and_dates(): + text = ( + 'On Monday March 5 2024, the show is brought to you by Squarespace! ' + 'Visit by 12/31. We are sponsored by example dot com.' + ) + out = canonicalize_for_dedupe(text) + assert 'monday' not in out + assert 'march' not in out + assert '2024' not in out + assert '12 31' not in out + assert '12/31' not in out + assert 'show' in out + assert 'squarespace' in out + assert 'the' not in out.split() + assert 'by' not in out.split() + + +def test_canonicalize_relative_time_stripped(): + out = canonicalize_for_dedupe('Sign up today or tomorrow for our weekend offer') + assert 'today' not in out + assert 'tomorrow' not in out + assert 'weekend' not in out + assert 'sign' in out + assert 'offer' in out + + +def test_dedupe_identifies_duplicate_when_near_identical(): + base = 'Visit Squarespace to launch your website with confidence today.' + near = 'Visit Squarespace and launch your website with confidence today!' + existing = [{'sponsor': 'Squarespace', 'text_template': base, 'community_id': 'A'}] + doc = {'sponsor': 'Squarespace', 'text_template': near} + classification, matched, score = dedupe(doc, existing) + assert classification == 'duplicate' + assert matched['community_id'] == 'A' + assert score >= 0.95 + + +def test_dedupe_identifies_variant(): + # Same sponsor + same opener and closing sentence, middle CTA swapped — + # calibrated to land around 0.78-0.85 on SequenceMatcher. + base = ( + 'Visit Squarespace dot com slash show for a free trial. 
Use code SHOW ' + 'to save ten percent on your first website. Squarespace gives you the ' + 'tools to launch any idea online.' + ) + variant = ( + 'Visit Squarespace dot com slash show for a free trial. Build with the ' + 'Squarespace tools and launch any idea online today.' + ) + existing = [{'sponsor': 'Squarespace', 'text_template': base, 'community_id': 'A'}] + doc = {'sponsor': 'Squarespace', 'text_template': variant} + classification, _, score = dedupe(doc, existing) + assert classification == 'variant', f'got {classification} score={score:.3f}' + assert 0.75 <= score < 0.95 + + +def test_dedupe_identifies_distinct_different_sponsors(): + existing = [ + {'sponsor': 'NordVPN', 'text_template': 'Visit NordVPN today for forty percent off', 'community_id': 'A'} + ] + doc = {'sponsor': 'Squarespace', 'text_template': 'Visit Squarespace today for ten percent off your site'} + classification, matched, score = dedupe(doc, existing) + assert classification == 'distinct' + assert score == 0.0 + + +def test_validate_doc_rejects_unknown_tags(): + seed = sponsor_seed() + doc = { + 'community_id': 'abc', + 'version': 1, + 'sponsor': 'Squarespace', + 'submitted_at': '2026-01-01T00:00:00Z', + 'text_template': 'Squarespace dot com slash show for ten percent off your website today launch confidently!', + 'sponsor_tags': ['tech', 'not_a_real_tag'], + } + result = validate_doc('a.json', doc, seed, []) + assert result.status == 'reject' + assert any('unknown tag: not_a_real_tag' in e for e in result.errors) + + +def test_validate_doc_warns_unknown_sponsor(): + seed = sponsor_seed() + doc = { + 'community_id': 'abc', + 'version': 1, + 'sponsor': 'AcmeBrandThatDoesNotExist', + 'submitted_at': '2026-01-01T00:00:00Z', + 'text_template': 'AcmeBrandThatDoesNotExist dot com slash show ten percent off launch your idea today now', + 'sponsor_tags': [], + } + result = validate_doc('a.json', doc, seed, []) + assert result.status == 'warn' + assert result.sponsor_match == 'unknown' + + +def test_validate_doc_rejects_missing_required(): + seed = sponsor_seed() + doc = {'community_id': 'abc'} + result = validate_doc('a.json', doc, seed, []) + assert result.status == 'reject' + assert any('required field' in e for e in result.errors) + + +def test_validate_doc_rejects_multi_sponsor_block(): + """A pattern whose text mentions another seed sponsor by name is a + multi-ad stitch and must be rejected.""" + seed = sponsor_seed() + doc = { + 'community_id': 'abc', + 'version': 1, + 'sponsor': 'Squarespace', + 'submitted_at': '2026-01-01T00:00:00Z', + 'text_template': ( + 'Build your site on Squarespace. Visit Squarespace dot com slash ' + 'show for ten percent off. This episode is also brought to you by ' + 'BetterHelp, online therapy on your schedule.' + ), + 'sponsor_tags': ['tech'], + } + result = validate_doc('a.json', doc, seed, []) + assert result.status == 'reject' + assert any('multi-sponsor block' in e for e in result.errors), result.errors + + +def test_validate_doc_accepts_own_alias_in_text(): + """A pattern's own sponsor_aliases must not be flagged as foreign.""" + seed = sponsor_seed() + doc = { + 'community_id': 'abc', + 'version': 1, + 'sponsor': 'Athletic Greens', + 'sponsor_aliases': ['AG1'], + 'submitted_at': '2026-01-01T00:00:00Z', + 'text_template': ( + 'Athletic Greens makes AG1, a daily foundational nutrition supplement ' + 'with 75 high quality vitamins, minerals, and whole food sourced ingredients.' 
+ ), + 'sponsor_tags': ['supplements'], + } + result = validate_doc('a.json', doc, seed, []) + assert result.status in ('pass', 'warn'), (result.status, result.errors) + assert not any('multi-sponsor block' in e for e in result.errors) + + +def test_validate_doc_accepts_seed_alias_of_declared_sponsor(): + """If the seed row carries aliases the doc didn't redeclare, mentioning + those aliases in the text must still not flag as foreign. The helper + skips the declared sponsor's seed row entirely once any of its names + matches the declared identity.""" + seed = sponsor_seed() + doc = { + 'community_id': 'abc', + 'version': 1, + 'sponsor': 'Athletic Greens', + # NOTE: sponsor_aliases intentionally omitted; rely on seed aliases. + 'submitted_at': '2026-01-01T00:00:00Z', + 'text_template': ( + 'Athletic Greens makes AG1, a daily nutrition supplement, and you can ' + 'try AG1 risk free with a 90-day money back guarantee.' + ), + 'sponsor_tags': ['supplements'], + } + result = validate_doc('a.json', doc, seed, []) + assert result.status in ('pass', 'warn'), (result.status, result.errors) + assert not any('multi-sponsor block' in e for e in result.errors) + + +def test_run_accepts_bundle_file(tmp_path, monkeypatch): + """A bundle file with `format == 'minuspod-community-submission'` + is expanded into N validations, one per pattern in `patterns[]`. + Each result path is `#patterns[i]` so the PR comment can + point at the failing index.""" + import tools.community_pattern_validator as v + repo_root = tmp_path / 'repo' + community = repo_root / 'patterns' / 'community' + community.mkdir(parents=True) + monkeypatch.setattr(v, '_REPO_SRC', repo_root / 'src') + (repo_root / 'src').mkdir() + + bundle = { + 'format': BUNDLE_FORMAT, + 'bundle_version': 1, + 'submitted_at': '2026-05-15T20:00:00Z', + 'submitted_app_version': '2.4.5', + 'pattern_count': 2, + 'patterns': [ + { + 'community_id': 'aaaaaaaa-1111-2222-3333-444444444441', + 'version': 1, + 'sponsor': 'Squarespace', + 'submitted_at': '2026-05-15T20:00:00Z', + 'text_template': 'Squarespace dot com slash show for ten percent off your website today launch confidently!', + 'sponsor_tags': ['tech'], + }, + { + 'community_id': 'aaaaaaaa-1111-2222-3333-444444444442', + 'version': 1, + 'sponsor': 'BetterHelp', + 'submitted_at': '2026-05-15T20:00:00Z', + 'text_template': 'BetterHelp offers convenient affordable online therapy you can do from anywhere right now.', + 'sponsor_tags': ['mental_health'], + }, + ], + } + pr_file = community / 'submission-aaaaaaaa.json' + pr_file.write_text(__import__('json').dumps(bundle)) + + rc = v.run([str(pr_file)], comment_output=str(tmp_path / 'c.md')) + comment = (tmp_path / 'c.md').read_text() + assert '#patterns[0]' in comment + assert '#patterns[1]' in comment + assert rc == 0, comment + + +def test_run_excludes_pr_files_from_existing_baseline(tmp_path, monkeypatch): + """CI checks out the PR branch, so files added in the PR are already + on disk in patterns/community/. The CLI's `run()` must strip them + from the "existing" baseline before deduping, otherwise every new + pattern flags as a duplicate of itself with score 1.0.""" + import tools.community_pattern_validator as v + repo_root = tmp_path / 'repo' + community = repo_root / 'patterns' / 'community' + community.mkdir(parents=True) + # Point the validator at our fake repo root. 
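+    # _REPO_SRC is assumed to be the path anchor the validator walks from to
+    # find patterns/community/ (layout assumption: <repo>/src sits beside
+    # <repo>/patterns). A minimal sketch of the resolution this test relies on:
+    #     community_dir = _REPO_SRC.parent / 'patterns' / 'community'
+    #     existing = [json.loads(f.read_text()) for f in community_dir.glob('*.json')]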
+ monkeypatch.setattr(v, '_REPO_SRC', repo_root / 'src') + (repo_root / 'src').mkdir() + + new_doc = { + 'community_id': 'aaaaaaaa-1111-2222-3333-444444444444', + 'version': 1, + 'sponsor': 'Squarespace', + 'submitted_at': '2026-01-01T00:00:00Z', + 'text_template': 'Squarespace dot com slash show for ten percent off your website today launch confidently!', + 'sponsor_tags': ['tech'], + } + pr_file = community / 'squarespace-aaaaaaaa.json' + pr_file.write_text(__import__('json').dumps(new_doc)) + + rc = v.run([str(pr_file)], comment_output=str(tmp_path / 'c.md')) + assert rc == 0, (tmp_path / 'c.md').read_text() + + +def test_validate_doc_accepts_alias_as_declared_sponsor(): + """Edge case: doc declares an alias as the canonical sponsor. The seed + row's actual canonical name appearing in the text must not flag.""" + seed = sponsor_seed() + doc = { + 'community_id': 'abc', + 'version': 1, + 'sponsor': 'AG1', + 'submitted_at': '2026-01-01T00:00:00Z', + 'text_template': ( + 'AG1 from Athletic Greens is a daily foundational nutrition supplement ' + 'with vitamins minerals and whole food sourced ingredients you can mix.' + ), + 'sponsor_tags': ['supplements'], + } + result = validate_doc('a.json', doc, seed, []) + assert result.status in ('pass', 'warn'), (result.status, result.errors) + assert not any('multi-sponsor block' in e for e in result.errors) diff --git a/tests/unit/test_community_sync.py b/tests/unit/test_community_sync.py new file mode 100644 index 00000000..e91db4da --- /dev/null +++ b/tests/unit/test_community_sync.py @@ -0,0 +1,110 @@ +"""Tests for community_sync.apply_manifest semantics.""" +import json +import os +import sys + +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from database import Database # noqa: E402 +from community_sync import apply_manifest, sync_now # noqa: E402 + + +@pytest.fixture +def db(tmp_path): + Database._instance = None # type: ignore[attr-defined] + instance = Database(data_dir=str(tmp_path)) + yield instance + Database._instance = None # type: ignore[attr-defined] + + +def _pattern_entry(cid, version=1, sponsor='Squarespace', text='community version '): + return { + 'community_id': cid, + 'version': version, + 'data': { + 'community_id': cid, + 'version': version, + 'scope': 'global', + 'sponsor': sponsor, + 'text_template': f'{text} for {sponsor} dot com slash show promo SHOW save ten percent today extra body', + 'intro_variants': [], + 'outro_variants': [], + }, + } + + +def test_apply_manifest_insert_update_delete(db): + # Initial sync: insert two patterns. + summary = apply_manifest(db, { + 'manifest_version': 1, + 'patterns': [ + _pattern_entry('c-1'), + _pattern_entry('c-2', sponsor='NordVPN', text='community nord text'), + ], + }) + assert summary['inserted'] == 2 + assert summary['updated'] == 0 + assert summary['deleted'] == 0 + + rows = db.get_patterns_by_source('community', active_only=False) + assert len(rows) == 2 + + # Second sync: bump version of c-1, drop c-2, add c-3. 
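+    # The contract exercised below, sketched (keyed on community_id; the set
+    # logic is an assumption about the implementation, not a quote of it):
+    #     incoming = {e['community_id']: e for e in manifest['patterns']}
+    #     current  = {r['community_id']: r for r in rows}
+    #     deleted  = current.keys() - incoming.keys()            # c-2
+    #     inserted = incoming.keys() - current.keys()            # c-3
+    #     updated  = {c for c in incoming.keys() & current.keys()
+    #                 if incoming[c]['version'] > current[c]['version']}  # c-1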
+ summary = apply_manifest(db, { + 'manifest_version': 2, + 'patterns': [ + _pattern_entry('c-1', version=2, text='community version two'), + _pattern_entry('c-3', sponsor='ExpressVPN', text='community express'), + ], + }) + assert summary['inserted'] == 1 + assert summary['updated'] == 1 + assert summary['deleted'] == 1 + + rows = db.get_patterns_by_source('community', active_only=False) + cids = {r['community_id'] for r in rows} + assert cids == {'c-1', 'c-3'} + c1 = next(r for r in rows if r['community_id'] == 'c-1') + assert c1['version'] == 2 + assert 'version two' in c1['text_template'] + + +def test_apply_manifest_respects_protected_flag(db): + apply_manifest(db, {'manifest_version': 1, 'patterns': [_pattern_entry('p-1')]}) + rows = db.get_patterns_by_source('community', active_only=False) + assert len(rows) == 1 + pid = rows[0]['id'] + db.set_pattern_protected(pid, True) + + # Re-sync: try to bump version and drop. Protected pattern should survive both. + summary = apply_manifest(db, { + 'manifest_version': 2, + 'patterns': [_pattern_entry('p-1', version=5, text='attempted overwrite text')], + }) + assert summary['updated'] == 0 + assert summary['skipped'] >= 1 + rows = db.get_patterns_by_source('community', active_only=False) + assert rows[0]['version'] == 1 + + summary = apply_manifest(db, {'manifest_version': 3, 'patterns': []}) + assert summary['deleted'] == 0 + assert summary['skipped'] >= 1 + rows = db.get_patterns_by_source('community', active_only=False) + assert len(rows) == 1 # still present + + +def test_sync_now_records_settings_state(db, monkeypatch): + # Stub the network call. + def fake_fetch(url): + return {'manifest_version': 7, 'patterns': [_pattern_entry('c-net')]} + monkeypatch.setattr('community_sync._fetch_manifest', fake_fetch) + + summary = sync_now(db) + assert summary['inserted'] == 1 + assert summary['manifest_version'] == 7 + assert db.get_setting('community_sync_manifest_version') == '7' + assert db.get_setting('community_sync_last_error') == '' + stored = json.loads(db.get_setting('community_sync_last_summary')) + assert stored['inserted'] == 1 diff --git a/tests/unit/test_community_sync_404.py b/tests/unit/test_community_sync_404.py new file mode 100644 index 00000000..2ae0dc6e --- /dev/null +++ b/tests/unit/test_community_sync_404.py @@ -0,0 +1,70 @@ +"""Test that the /community-patterns/sync endpoint returns a soft 200 +when upstream has not yet published the manifest (404), rather than 502. + +We avoid spinning up the full main_app (which probes /app/data, queues, +sentry, etc.) by invoking the route handler directly inside a Flask +test_request_context. +""" +import json +import os +import sys +from unittest.mock import patch + +import pytest +import requests +from flask import Flask + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + + +@pytest.fixture +def app(monkeypatch, tmp_path): + monkeypatch.setenv('MINUSPOD_DATA_DIR', str(tmp_path)) + monkeypatch.setenv('DATABASE_PATH', ':memory:') + monkeypatch.setenv('AUTH_DISABLED', 'true') + app = Flask(__name__) + app.config['TESTING'] = True + return app + + +def _fake_http_error(status: int) -> requests.HTTPError: + resp = requests.Response() + resp.status_code = status + return requests.HTTPError(f'{status} error', response=resp) + + +def _call_sync(app, sync_side_effect): + """Patch sync_now AND get_database (route calls both before our branch + fires). 
Returns the Flask Response.""" + from api.settings import trigger_community_pattern_sync + with app.test_request_context('/api/v1/community-patterns/sync', method='POST'), \ + patch('community_sync.sync_now', side_effect=sync_side_effect), \ + patch('api.settings.get_database', return_value=object()): + return trigger_community_pattern_sync() + + +def _status_and_body(resp): + """Route returns either a Response or a (Response, status) tuple.""" + if isinstance(resp, tuple): + body_obj, status = resp[0], resp[1] + return status, body_obj.get_json() + return resp.status_code, resp.get_json() + + +def test_sync_404_returns_no_manifest_yet_not_502(app): + resp = _call_sync(app, _fake_http_error(404)) + status, body = _status_and_body(resp) + assert status == 200, body + assert body['status'] == 'no_manifest_yet' + + +def test_sync_other_http_error_still_502(app): + resp = _call_sync(app, _fake_http_error(500)) + status, _ = _status_and_body(resp) + assert status == 502 + + +def test_sync_generic_error_still_502(app): + resp = _call_sync(app, RuntimeError('boom')) + status, _ = _status_and_body(resp) + assert status == 502 diff --git a/tests/unit/test_community_tags_constants.py b/tests/unit/test_community_tags_constants.py new file mode 100644 index 00000000..b7115993 --- /dev/null +++ b/tests/unit/test_community_tags_constants.py @@ -0,0 +1,79 @@ +"""Tests for utils.community_tags loaders and PII helpers.""" +import os +import sys +import re + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from utils.community_tags import ( # noqa: E402 + UNIVERSAL_TAG, + CONSUMER_EMAIL_DOMAINS, + is_tollfree, + map_itunes_category, + sponsor_seed, + valid_tags, +) + + +def test_valid_tags_includes_universal_and_vocab(): + vt = valid_tags() + assert UNIVERSAL_TAG in vt + # spot-check vocabulary entries + assert 'tech' in vt + assert 'mental_health' in vt + assert 'true_crime' in vt + assert 'gambling' in vt + # 48 vocab tags + universal + assert len(vt) == 49 + + +def test_sponsor_seed_loads_255_entries(): + seed = sponsor_seed() + assert len(seed) == 255 + # Each row has the right shape + sample = seed[0] + assert 'name' in sample and 'aliases' in sample and 'tags' in sample + assert isinstance(sample['aliases'], list) + assert isinstance(sample['tags'], list) + + +def test_sponsor_seed_universal_marker_present(): + seed = sponsor_seed() + universal_count = sum(1 for s in seed if UNIVERSAL_TAG in s['tags']) + # Plan promises many sponsors carry universal; sanity check at least 10. + assert universal_count >= 10 + + +def test_itunes_category_map_basic(): + assert map_itunes_category('Technology') == 'technology' + assert map_itunes_category('technology') == 'technology' + assert map_itunes_category('True Crime') == 'true_crime' + assert map_itunes_category('Health & Fitness') == 'health' + assert map_itunes_category('Mental Health') == 'mental_health' + assert map_itunes_category('Comedy Interviews') == 'comedy' + # Unknown returns None. 
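+    # (Assumed lookup: case-insensitive exact match first, then a per-word
+    # fallback so subgenres like 'Comedy Interviews' resolve through 'comedy'.
+    # CATEGORY_MAP is a hypothetical name for the module's internal table:
+    #     key = (category or '').strip().lower()
+    #     tag = CATEGORY_MAP.get(key) or next(
+    #         (CATEGORY_MAP[w] for w in key.split() if w in CATEGORY_MAP), None)
+    # )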
+ assert map_itunes_category('Knitting Club Reviews') is None + assert map_itunes_category('') is None + assert map_itunes_category(None) is None # type: ignore[arg-type] + + +def test_consumer_email_domains_complete(): + expected = { + 'gmail.com', 'yahoo.com', 'aol.com', 'hotmail.com', 'outlook.com', + 'icloud.com', 'me.com', 'mac.com', 'protonmail.com', 'proton.me', + } + assert expected.issubset(CONSUMER_EMAIL_DOMAINS) + + +def test_is_tollfree_classification(): + # NANP toll-free numbers + assert is_tollfree('1-800-555-1234') is True + assert is_tollfree('(866) 555-1234') is True + assert is_tollfree('877-555-1234') is True + # Not toll-free + assert is_tollfree('212-555-1234') is False + assert is_tollfree('+1-415-555-1234') is False + # UK toll-free + assert is_tollfree('0800-555-1234') is True + # AU toll-free + assert is_tollfree('1800-555-1234') is True diff --git a/tests/unit/test_create_correction.py b/tests/unit/test_create_correction.py index 0bb7ffe1..6e31e0e5 100644 --- a/tests/unit/test_create_correction.py +++ b/tests/unit/test_create_correction.py @@ -216,14 +216,19 @@ def test_global_scope_creates_global_pattern(temp_db): def test_case_variant_sponsor_resolves_to_existing(temp_db): - """Submitting a create with 'squarespace' after 'Squarespace' exists - must reuse the same sponsor_id (case-insensitive lookup).""" + """Submitting a create with 'testbrand' after 'TestBrand' exists + must reuse the same sponsor_id (case-insensitive lookup). + + Uses a name that is NOT in the community seed list, since the + schema migration now preloads 255 sponsors and re-inserting an + existing name violates the UNIQUE constraint. + """ _make_episode(temp_db) - sid = temp_db.create_known_sponsor(name='Squarespace') + sid = temp_db.create_known_sponsor(name='TestBrandPRT') text = 'A second create against a case-variant sponsor name.' 
resp = _call(temp_db, { - 'start': 0.0, 'end': 30.0, 'sponsor': 'squarespace', + 'start': 0.0, 'end': 30.0, 'sponsor': 'testbrandprt', 'text_template': text * 2, 'scope': 'podcast', }) assert resp.status_code == 200 diff --git a/tests/unit/test_cron_evaluator.py b/tests/unit/test_cron_evaluator.py new file mode 100644 index 00000000..cde62241 --- /dev/null +++ b/tests/unit/test_cron_evaluator.py @@ -0,0 +1,57 @@ +"""Tests for the minimal cron evaluator (src/utils/cron.py).""" +import os +import sys +from datetime import datetime, timezone + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from utils.cron import is_due, is_valid_expression, next_fire, parse_expression # noqa: E402 + + +def test_validation_accepts_common_forms(): + assert is_valid_expression('0 3 * * 0') # Sunday 3am + assert is_valid_expression('*/5 * * * *') # every 5 minutes + assert is_valid_expression('30 */2 * * *') # every 2 hours at :30 + assert is_valid_expression('0 0 1,15 * *') # 1st and 15th midnight + assert is_valid_expression('0 9-17 * * 1-5') # weekday business hours + assert is_valid_expression('* * * * *') # every minute + + +def test_validation_rejects_garbage(): + assert not is_valid_expression('bad') + assert not is_valid_expression('* * * *') # too few fields + assert not is_valid_expression('60 * * * *') # out of range minute + assert not is_valid_expression('* * * * 7') # out of range dow + assert not is_valid_expression('*/0 * * * *') # zero step + assert not is_valid_expression('5-3 * * * *') # backwards range + + +def test_parse_field_handles_lists_and_steps(): + minute, hour, dom, month, dow = parse_expression('0,15,30 */2 * * *') + assert minute == {0, 15, 30} + assert hour == {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22} + assert dom == set(range(1, 32)) + assert month == set(range(1, 13)) + assert dow == set(range(0, 7)) + + +def test_next_fire_sunday_3am(): + # 2026-05-14 is a Thursday. Next Sunday 3am should be 2026-05-17 03:00. + after = datetime(2026, 5, 14, 12, 0, tzinfo=timezone.utc) + nxt = next_fire('0 3 * * 0', after) + assert nxt == datetime(2026, 5, 17, 3, 0, tzinfo=timezone.utc) + + +def test_next_fire_every_5_minutes(): + after = datetime(2026, 1, 1, 12, 7, tzinfo=timezone.utc) + nxt = next_fire('*/5 * * * *', after) + assert nxt == datetime(2026, 1, 1, 12, 10, tzinfo=timezone.utc) + + +def test_is_due_basic(): + cron = '0 3 * * 0' + last_run = datetime(2026, 5, 10, 3, 0, tzinfo=timezone.utc) # Sunday 3am + not_yet = datetime(2026, 5, 14, 12, 0, tzinfo=timezone.utc) # Thursday + sunday = datetime(2026, 5, 17, 3, 0, tzinfo=timezone.utc) + assert is_due(cron, last_run, not_yet) is False + assert is_due(cron, last_run, sunday) is True diff --git a/tests/unit/test_database.py b/tests/unit/test_database.py index 35f4057e..dad6f2ee 100644 --- a/tests/unit/test_database.py +++ b/tests/unit/test_database.py @@ -177,7 +177,10 @@ class TestAdPatternOperations: def test_create_ad_pattern(self, temp_db): """Create and retrieve ad pattern.""" - sponsor_id = temp_db.create_known_sponsor(name='BetterHelp') + # BetterHelp is preloaded by the v2.4.0 migration; reuse it instead of + # re-inserting (which would trip the UNIQUE constraint). 
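+        # Any test staging a seeded name needs this get-or-create guard; a
+        # reusable helper would look roughly like this (hypothetical, not
+        # part of this diff):
+        #     def sponsor_id_for(db, name):
+        #         row = db.get_known_sponsor_by_name(name)
+        #         return row['id'] if row else db.create_known_sponsor(name=name)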
+ sponsor = temp_db.get_known_sponsor_by_name('BetterHelp') + sponsor_id = sponsor['id'] if sponsor else temp_db.create_known_sponsor(name='BetterHelp') pattern_id = temp_db.create_ad_pattern( scope='global', text_template='brought to you by {sponsor}', diff --git a/tests/unit/test_generate_manifest.py b/tests/unit/test_generate_manifest.py new file mode 100644 index 00000000..07835626 --- /dev/null +++ b/tests/unit/test_generate_manifest.py @@ -0,0 +1,71 @@ +"""Tests for the manifest builder (src/tools/generate_manifest.py).""" +import json +import os +import sys +from pathlib import Path + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from tools.generate_manifest import _load_pattern_files, build_manifest # noqa: E402 + + +def _write(path: Path, doc): + path.write_text(json.dumps(doc)) + + +def test_bundle_file_flattens_into_manifest(tmp_path): + community = tmp_path / 'community' + community.mkdir() + _write(community / 'flat.json', { + 'community_id': 'flat-1', + 'version': 1, + 'sponsor': 'A', + 'submitted_at': '2026-01-01T00:00:00Z', + 'text_template': 'flat one', + }) + _write(community / 'bundle.json', { + 'format': 'minuspod-community-submission', + 'bundle_version': 1, + 'submitted_at': '2026-02-01T00:00:00Z', + 'submitted_app_version': '2.4.5', + 'pattern_count': 2, + 'patterns': [ + { + 'community_id': 'b-1', + 'version': 1, + 'sponsor': 'B', + 'submitted_at': '2026-02-02T00:00:00Z', + 'text_template': 'bundle one', + }, + { + 'community_id': 'b-2', + 'version': 1, + 'sponsor': 'C', + 'submitted_at': '2026-02-03T00:00:00Z', + 'text_template': 'bundle two', + }, + ], + }) + patterns = _load_pattern_files(community) + ids = {p['community_id'] for p in patterns} + assert ids == {'flat-1', 'b-1', 'b-2'} + manifest = build_manifest(patterns) + manifest_ids = {entry['community_id'] for entry in manifest['patterns']} + assert manifest_ids == {'flat-1', 'b-1', 'b-2'} + + +def test_bundle_with_missing_community_id_is_skipped(tmp_path, capsys): + community = tmp_path / 'community' + community.mkdir() + _write(community / 'bundle.json', { + 'format': 'minuspod-community-submission', + 'patterns': [ + {'community_id': 'ok-1', 'sponsor': 'A', 'text_template': 't', + 'version': 1, 'submitted_at': '2026-01-01T00:00:00Z'}, + {'sponsor': 'missing-id', 'text_template': 't'}, + ], + }) + patterns = _load_pattern_files(community) + assert [p['community_id'] for p in patterns] == ['ok-1'] + err = capsys.readouterr().err + assert 'missing community_id' in err diff --git a/tests/unit/test_migration_sponsor_fk.py b/tests/unit/test_migration_sponsor_fk.py index a29d6a9f..8f273b39 100644 --- a/tests/unit/test_migration_sponsor_fk.py +++ b/tests/unit/test_migration_sponsor_fk.py @@ -8,8 +8,13 @@ def _rebuild_pre_migration_shape(conn): """Rebuild `ad_patterns` and `pattern_corrections` in the v2.1.x shape so we can exercise the migration end-to-end. Assumes the post-migration tables have just been created by the normal Database init. + + The v2.4.0 seed migration preloads 255 sponsors; we clear them here so + tests can stage their own sponsor case-variants without colliding on the + UNIQUE name constraint. 
""" conn.execute("PRAGMA foreign_keys = OFF") + conn.execute("DELETE FROM known_sponsors") conn.execute("DROP TABLE IF EXISTS ad_patterns") conn.execute("DROP TABLE IF EXISTS pattern_corrections") conn.execute("DROP TABLE IF EXISTS _migration_backup_ad_patterns_sponsor") diff --git a/tests/unit/test_pattern_service_rewrite.py b/tests/unit/test_pattern_service_rewrite.py new file mode 100644 index 00000000..bf4782e6 --- /dev/null +++ b/tests/unit/test_pattern_service_rewrite.py @@ -0,0 +1,177 @@ +"""Tests for PatternService.rewrite_pattern_from_bounds + import_community_pattern.""" +import json +import os +import sys +import tempfile + +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from database import Database # noqa: E402 +from pattern_service import PatternService # noqa: E402 + + +@pytest.fixture +def db(tmp_path): + # Singleton Database; clear it so each test gets a fresh sqlite file. + Database._instance = None # type: ignore[attr-defined] + if hasattr(Database, '_initialized'): + Database._initialized = False # type: ignore[attr-defined] + instance = Database(data_dir=str(tmp_path)) + yield instance + Database._instance = None # type: ignore[attr-defined] + + +# VTT-style transcript. Sentence-per-segment with gaps between, so the trim +# windows (head 0..20, tail 90..120) line up with the head / tail segments +# without partially overlapping the middle ad copy. include_partial=True is +# the default in extract_text_in_range, so any overlap pulls the segment in. +TRANSCRIPT = ( + '[00:00:00.000 --> 00:00:05.000] Welcome to the show.\n' + '[00:00:30.000 --> 00:00:35.000] This episode is brought to you by Squarespace.\n' + '[00:00:40.000 --> 00:00:50.000] Visit Squarespace dot com slash show for a free trial.\n' + '[00:00:55.000 --> 00:01:05.000] Use code SHOW for ten percent off your first website.\n' + '[00:01:55.000 --> 00:02:00.000] Now back to our regular programming.\n' +) + + +def _seed_pattern( + db, + source='local', + text_template=None, + intro_variants=None, + outro_variants=None, +): + # The schema migration already seeded Squarespace from sponsors_final.csv. + sponsor = db.get_known_sponsor_by_name('Squarespace') + assert sponsor is not None, 'Squarespace should be in the migrated seed list' + sid = sponsor['id'] + pid = db.create_ad_pattern( + scope='global', + text_template=text_template or 'old text template that is long enough to satisfy any length checks for tests today', + intro_variants=intro_variants if intro_variants is not None else ['old intro'], + outro_variants=outro_variants if outro_variants is not None else ['old outro'], + sponsor_id=sid, + source=source, + community_id='abc-123' if source == 'community' else None, + ) + return pid, sid + + +def test_rewrite_trims_head_and_tail_from_existing_template(db): + """The trim splices the head/tail transcript slice out of the existing + template — it does NOT re-extract a new template from the new bounds.""" + template = ( + 'Welcome to the show. This episode is brought to you by Squarespace. ' + 'Visit Squarespace dot com slash show for a free trial. Use code SHOW ' + 'for ten percent off your first website. Now back to our regular programming.' + ) + intro = ['Welcome to the show.'] + outro = ['Now back to our regular programming.'] + pid, _ = _seed_pattern( + db, text_template=template, intro_variants=intro, outro_variants=outro, + ) + svc = PatternService(db) + + # Reviewer narrowed [0, 120] -> [25, 90]. 
Head trim (0..25) picks up the + # "Welcome to the show." segment; tail trim (90..120) picks up "Now back + # to our regular programming." Middle ad copy stays intact. + changed = svc.rewrite_pattern_from_bounds( + pid, TRANSCRIPT, + original_start=0.0, original_end=120.0, + new_start=25.0, new_end=90.0, + ) + assert changed is True + p = db.get_ad_pattern_by_id(pid) + assert 'Welcome to the show.' not in p['text_template'] + assert 'Now back to our regular programming.' not in p['text_template'] + # Middle survives, unchanged from the original template content. + assert 'Squarespace' in p['text_template'] + assert 'Use code SHOW' in p['text_template'] + + +def test_rewrite_returns_false_when_trim_doesnt_match_template(db): + """If the head/tail slice from the transcript isn't actually at the + start/end of the existing template, the rewrite is a no-op.""" + pid, _ = _seed_pattern( + db, + text_template='completely unrelated template that does not begin with welcome or end with regular programming', + ) + svc = PatternService(db) + changed = svc.rewrite_pattern_from_bounds( + pid, TRANSCRIPT, + original_start=0.0, original_end=120.0, + new_start=25.0, new_end=90.0, + ) + assert changed is False + p = db.get_ad_pattern_by_id(pid) + assert p['text_template'].startswith('completely unrelated') + + +def test_rewrite_pattern_skips_community(db): + pid, _ = _seed_pattern(db, source='community') + svc = PatternService(db) + changed = svc.rewrite_pattern_from_bounds( + pid, TRANSCRIPT, + original_start=0.0, original_end=120.0, + new_start=25.0, new_end=90.0, + ) + assert changed is False + p = db.get_ad_pattern_by_id(pid) + assert p['text_template'].startswith('old text template') + + +def test_import_community_pattern_insert_then_update(db): + svc = PatternService(db) + cid = 'c1-22-33-44-55' + pid1 = svc.import_community_pattern({ + 'community_id': cid, + 'version': 1, + 'scope': 'global', + 'sponsor': 'Squarespace', + 'text_template': 'community pattern text template version one of squarespace dot com slash show', + 'intro_variants': ['Visit Squarespace'], + }) + row = db.get_ad_pattern_by_id(pid1) + assert row['source'] == 'community' + assert row['community_id'] == cid + assert row['version'] == 1 + + # Same community_id, higher version -> update. + pid2 = svc.import_community_pattern({ + 'community_id': cid, + 'version': 2, + 'scope': 'global', + 'sponsor': 'Squarespace', + 'text_template': 'updated community text template version two for squarespace promo SHOW', + }) + assert pid2 == pid1 + row = db.get_ad_pattern_by_id(pid1) + assert row['version'] == 2 + assert 'updated community text' in row['text_template'] + + +def test_import_community_pattern_respects_protected(db): + svc = PatternService(db) + cid = 'protected-c-001' + pid = svc.import_community_pattern({ + 'community_id': cid, + 'version': 1, + 'scope': 'global', + 'sponsor': 'Squarespace', + 'text_template': 'community version one for squarespace dot com slash show promo SHOW save ten', + }) + db.set_pattern_protected(pid, True) + # Higher version, but protected -> no update. 
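+    # The guard being pinned, sketched (the lookup helper's name is an
+    # assumption about PatternService internals):
+    #     existing = db.get_pattern_by_community_id(doc['community_id'])
+    #     if existing and existing['is_protected']:
+    #         return existing['id']  # user-pinned rows are never overwritten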
+ pid2 = svc.import_community_pattern({ + 'community_id': cid, + 'version': 5, + 'scope': 'global', + 'sponsor': 'Squarespace', + 'text_template': 'updated version five text body for squarespace promo SHOW save fifty percent today', + }) + assert pid2 == pid + row = db.get_ad_pattern_by_id(pid) + assert row['version'] == 1 + assert 'version one' in row['text_template'] diff --git a/tests/unit/test_sponsor_seed_idempotent.py b/tests/unit/test_sponsor_seed_idempotent.py index 50ba5013..2dfd08a0 100644 --- a/tests/unit/test_sponsor_seed_idempotent.py +++ b/tests/unit/test_sponsor_seed_idempotent.py @@ -1,8 +1,10 @@ """Tests for SponsorService.seed_initial_data() idempotency. -Covers the 2.0.13 rewrite that changed seed behavior from "first-run only" -to "name-diff every startup" so that updates to SEED_SPONSORS / SEED_NORMALIZATIONS -auto-propagate to existing deployments without overwriting user-edited rows. +As of 2.4.0 the schema migration is the authoritative seed for sponsors +(loaded from src/seed_data/sponsors_final.csv). SponsorService.seed_initial_data() +still runs at startup and is responsible for normalizations; for sponsors it is +effectively a no-op against the post-migration baseline because every SEED_SPONSORS +name already exists. These tests pin that idempotency contract. """ import json import sys @@ -10,57 +12,50 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) -from sponsor_service import SponsorService, SEED_SPONSORS, SEED_NORMALIZATIONS +from sponsor_service import SponsorService, SEED_NORMALIZATIONS class TestSeedIdempotent: - def test_empty_db_seeds_all(self, temp_db): - svc = SponsorService(temp_db) - assert temp_db.get_known_sponsors(active_only=False) == [] - - svc.seed_initial_data() - + def test_baseline_db_has_migrated_seed(self, temp_db): + """The migration runs during Database init and seeds 255 sponsors.""" rows = temp_db.get_known_sponsors(active_only=False) - assert len(rows) == len(SEED_SPONSORS) - norms = temp_db.get_sponsor_normalizations(active_only=False) - assert len(norms) == len(SEED_NORMALIZATIONS) - - def test_partial_db_inserts_only_missing(self, temp_db): - temp_db.create_known_sponsor(name='BetterHelp', aliases=[], category='health') - temp_db.create_known_sponsor(name='HelloFresh', aliases=[], category='food') - pre = len(temp_db.get_known_sponsors(active_only=False)) - assert pre == 2 + # 255 from sponsors_final.csv (migration v2.4.0). The number is a hard + # contract: changing it requires bumping the seed revision in schema.py. + assert len(rows) == 255 + def test_seed_initial_data_is_noop_against_migration_baseline(self, temp_db): svc = SponsorService(temp_db) + before = len(temp_db.get_known_sponsors(active_only=False)) svc.seed_initial_data() - - rows = temp_db.get_known_sponsors(active_only=False) - assert len(rows) == len(SEED_SPONSORS) - names = {r['name'] for r in rows} - assert {'BetterHelp', 'HelloFresh'} <= names + after = len(temp_db.get_known_sponsors(active_only=False)) + # SponsorService only inserts names not already present; + # every SEED_SPONSORS name overlaps with the migration seed. 
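+        # i.e. the assumed insert-only diff inside seed_initial_data():
+        #     have = {r['name'].lower() for r in db.get_known_sponsors(active_only=False)}
+        #     todo = [s for s in SEED_SPONSORS if s['name'].lower() not in have]
+        # and `todo` comes back empty against the migrated baseline.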
+ assert before == after == 255 def test_user_edited_aliases_preserved(self, temp_db): - temp_db.create_known_sponsor( - name='BetterHelp', + """Editing a sponsor's aliases after migration is preserved by seed_initial_data.""" + existing = temp_db.get_known_sponsor_by_name('BetterHelp') + assert existing is not None + temp_db.update_known_sponsor( + existing['id'], aliases=['BH', 'Therapy App', 'My Custom Alias'], - category='health', ) svc = SponsorService(temp_db) svc.seed_initial_data() - row = next(r for r in temp_db.get_known_sponsors(active_only=False) if r['name'] == 'BetterHelp') + row = temp_db.get_known_sponsor_by_name('BetterHelp') aliases = json.loads(row['aliases']) if isinstance(row['aliases'], str) else row['aliases'] assert aliases == ['BH', 'Therapy App', 'My Custom Alias'] def test_deactivated_row_not_reactivated(self, temp_db): - sponsor_id = temp_db.create_known_sponsor(name='BetterHelp', aliases=[], category='health') - temp_db.update_known_sponsor(sponsor_id, is_active=0) + existing = temp_db.get_known_sponsor_by_name('BetterHelp') + temp_db.update_known_sponsor(existing['id'], is_active=0) svc = SponsorService(temp_db) svc.seed_initial_data() - row = next(r for r in temp_db.get_known_sponsors(active_only=False) if r['name'] == 'BetterHelp') + row = temp_db.get_known_sponsor_by_name('BetterHelp') assert row['is_active'] == 0 def test_running_twice_is_a_noop(self, temp_db): @@ -71,4 +66,12 @@ def test_running_twice_is_a_noop(self, temp_db): svc.seed_initial_data() second_count = len(temp_db.get_known_sponsors(active_only=False)) - assert first_count == second_count == len(SEED_SPONSORS) + assert first_count == second_count + + def test_normalizations_still_seeded(self, temp_db): + # Normalizations are unaffected by the migration — SponsorService remains + # the only place that seeds them. 
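+        # (Assumed entry shape, for orientation only — the test pins the count,
+        # not the shape:
+        #     {'pattern': 'square space', 'canonical': 'Squarespace'}
+        # with matching applied case-insensitively downstream.)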
+ svc = SponsorService(temp_db) + svc.seed_initial_data() + norms = temp_db.get_sponsor_normalizations(active_only=False) + assert len(norms) == len(SEED_NORMALIZATIONS) diff --git a/tests/unit/test_tag_matcher_eligibility.py b/tests/unit/test_tag_matcher_eligibility.py new file mode 100644 index 00000000..b521ea0e --- /dev/null +++ b/tests/unit/test_tag_matcher_eligibility.py @@ -0,0 +1,79 @@ +"""Tests for the tag-eligibility filter in TextPatternMatcher._filter_patterns_by_scope.""" +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) + +from text_pattern_matcher import AdPattern, TextPatternMatcher # noqa: E402 + + +def _matcher_with_patterns(patterns, sponsor_tags): + """Build a matcher with the supplied patterns and sponsor->tags map (bypasses DB).""" + m = TextPatternMatcher(db=None) + m._patterns = patterns + m._sponsor_tags = {sid: set(tags) for sid, tags in sponsor_tags.items()} + return m + + +def _community(pid, sponsor_id, scope='global'): + return AdPattern( + id=pid, text_template='x', intro_variants=[], outro_variants=[], + sponsor='S', scope=scope, sponsor_id=sponsor_id, source='community', + ) + + +def _local(pid, sponsor_id=None, scope='global'): + return AdPattern( + id=pid, text_template='x', intro_variants=[], outro_variants=[], + sponsor='S', scope=scope, sponsor_id=sponsor_id, source='local', + ) + + +def test_local_patterns_never_filtered_by_tags(): + p = _local(1, sponsor_id=10) + m = _matcher_with_patterns([p], sponsor_tags={10: ['tech']}) + out = m._filter_patterns_by_scope(podcast_tags={'comedy'}) + assert [x.id for x in out] == [1] + + +def test_universal_sponsor_matches_everything(): + p = _community(1, sponsor_id=10) + m = _matcher_with_patterns([p], sponsor_tags={10: ['universal', 'tech']}) + out = m._filter_patterns_by_scope(podcast_tags={'comedy'}) + assert [x.id for x in out] == [1] + + +def test_tag_overlap_matches(): + p = _community(1, sponsor_id=10) + m = _matcher_with_patterns([p], sponsor_tags={10: ['tech', 'saas']}) + out = m._filter_patterns_by_scope(podcast_tags={'business', 'tech'}) + assert [x.id for x in out] == [1] + + +def test_no_overlap_drops_community_pattern(): + p = _community(1, sponsor_id=10) + m = _matcher_with_patterns([p], sponsor_tags={10: ['gambling']}) + out = m._filter_patterns_by_scope(podcast_tags={'kids_family'}) + assert out == [] + + +def test_empty_sponsor_tags_fallback(): + p = _community(1, sponsor_id=10) + m = _matcher_with_patterns([p], sponsor_tags={10: []}) + out = m._filter_patterns_by_scope(podcast_tags={'comedy'}) + assert [x.id for x in out] == [1] + + +def test_empty_podcast_tags_fallback(): + p = _community(1, sponsor_id=10) + m = _matcher_with_patterns([p], sponsor_tags={10: ['tech']}) + out = m._filter_patterns_by_scope(podcast_tags=None) + assert [x.id for x in out] == [1] + + +def test_scope_still_enforced_for_community(): + p = _community(1, sponsor_id=10, scope='podcast') + p.podcast_id = 'desert-island-discs' + m = _matcher_with_patterns([p], sponsor_tags={10: ['universal']}) + out = m._filter_patterns_by_scope(podcast_id='other-podcast', podcast_tags=None) + assert out == [] diff --git a/version.py b/version.py index 4618fe65..c9e914fc 100644 --- a/version.py +++ b/version.py @@ -1 +1 @@ -__version__ = "2.3.4" +__version__ = "2.4.6"
[frontend PatternsPage diff — file header and JSX element tags were stripped in extraction; the surviving hunk content follows]
                      Status
+                     Actions
                      {getScopeBadge(pattern)}
                      {pattern.created_by === 'user' && (
                        Manual
                      )}
+                     {pattern.source === PATTERN_SOURCE_COMMUNITY && pattern.community_id && (
+                       {/* community badge markup stripped */}
+                     )}
@@ -439,11 +566,22 @@ function PatternsPage() {
                      {getStatusBadge(pattern.is_active)}
+                     {pattern.source === PATTERN_SOURCE_COMMUNITY && (
+                       {/* markup stripped */}
+                     )}
+                     No patterns found