-
Notifications
You must be signed in to change notification settings - Fork 0
feat(db): unified asset search (semantic + text + tag filters) #25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| import { describe, expect, it } from "vitest"; | ||
|
|
||
| import { EMBEDDING_DIMENSIONS, normalizeSearchParams } from "./search"; | ||
|
|
||
/** Build an all-zero vector with exactly `EMBEDDING_DIMENSIONS` entries. */
function embedding(): number[] {
  return Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0);
}
|
|
||
// Specification for normalizeSearchParams: paging defaults and clamping,
// text/tag clean-up, de-duplication, and embedding-size validation.
describe("normalizeSearchParams", () => {
  it("applies default limit and offset", () => {
    const { limit, offset } = normalizeSearchParams({});
    expect(limit).toBe(20);
    expect(offset).toBe(0);
  });

  it("clamps limit into the 1..100 range", () => {
    const limitFor = (limit: number) => normalizeSearchParams({ limit }).limit;
    expect(limitFor(0)).toBe(1);
    expect(limitFor(-10)).toBe(1);
    expect(limitFor(9999)).toBe(100);
  });

  it("never returns a negative offset", () => {
    const { offset } = normalizeSearchParams({ offset: -5 });
    expect(offset).toBe(0);
  });

  it("trims text and drops blank strings", () => {
    const textFor = (text: string) => normalizeSearchParams({ text }).text;
    expect(textFor(" hello ")).toBe("hello");
    expect(textFor(" ")).toBeUndefined();
  });

  it("lower-cases, trims, and de-duplicates tags", () => {
    const { tags } = normalizeSearchParams({
      tags: ["Lo-Fi", " lo-fi ", "Dreamy"],
    });
    expect(tags).toEqual(["lo-fi", "dreamy"]);
  });

  it("drops the tags field when nothing survives normalization", () => {
    const { tags } = normalizeSearchParams({ tags: [" ", ""] });
    expect(tags).toBeUndefined();
  });

  it("de-duplicates asset types and statuses", () => {
    const { assetTypes, statuses } = normalizeSearchParams({
      assetTypes: ["song", "song", "image"],
      statuses: ["captured", "captured"],
    });
    expect(assetTypes).toEqual(["song", "image"]);
    expect(statuses).toEqual(["captured"]);
  });

  it("clamps minSimilarity into the 0..1 range", () => {
    const thresholdFor = (minSimilarity: number) =>
      normalizeSearchParams({ embedding: embedding(), minSimilarity })
        .minSimilarity;
    expect(thresholdFor(2)).toBe(1);
    expect(thresholdFor(-1)).toBe(0);
  });

  it("accepts an embedding of the expected dimensionality", () => {
    const run = () => normalizeSearchParams({ embedding: embedding() });
    expect(run).not.toThrow();
  });

  it("rejects an embedding with the wrong dimensionality", () => {
    const run = () => normalizeSearchParams({ embedding: [1, 2, 3] });
    expect(run).toThrow(RangeError);
  });
});
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,194 @@ | ||
| import { | ||
| and, | ||
| cosineDistance, | ||
| desc, | ||
| eq, | ||
| gt, | ||
| ilike, | ||
| inArray, | ||
| isNotNull, | ||
| or, | ||
| sql, | ||
| } from "drizzle-orm"; | ||
|
|
||
| import type { Result } from "@/lib/result"; | ||
| import { tryCatchAsync } from "@/lib/result"; | ||
|
|
||
| import { db } from "./index"; | ||
| import { quirkAnnotations, quirkAssets, type QuirkAsset } from "./schema"; | ||
|
|
||
| /** Dimensionality of the `quirk_assets.embedding` pgvector column. */ | ||
| export const EMBEDDING_DIMENSIONS = 1536; | ||
|
|
||
| const DEFAULT_LIMIT = 20; | ||
| const MAX_LIMIT = 100; | ||
|
|
||
| type AssetType = QuirkAsset["assetType"]; | ||
| type AssetStatus = QuirkAsset["status"]; | ||
|
|
||
/** Raw, caller-supplied search input; every field is optional. */
export interface SearchAssetsParams {
  /** Substring to look for, ignoring case, in an asset's title or raw text. */
  text?: string;
  /** Only return assets whose type is one of these. */
  assetTypes?: AssetType[];
  /** Only return assets whose status is one of these. */
  statuses?: AssetStatus[];
  /** Only return assets that have a `tag` annotation labelled with one of these. */
  tags?: string[];
  /** Query vector; when supplied, hits are ordered by cosine similarity to it. */
  embedding?: number[];
  /** Similarity floor (clamped to 0..1); ignored unless `embedding` is supplied. */
  minSimilarity?: number;
  /** Maximum rows per page (clamped to 1..100, defaults to 20). */
  limit?: number;
  /** Number of leading rows to skip (defaults to 0). */
  offset?: number;
}
|
|
||
/**
 * Search input after `normalizeSearchParams` has cleaned it up. Optional
 * filters are `undefined` when absent or empty; paging fields are always set.
 */
export interface NormalizedSearchParams {
  /** Trimmed search text; `undefined` when blank. */
  text?: string;
  /** De-duplicated asset types; `undefined` when none were given. */
  assetTypes?: AssetType[];
  /** De-duplicated statuses; `undefined` when none were given. */
  statuses?: AssetStatus[];
  /** Trimmed, lower-cased, de-duplicated tags; `undefined` when none survive. */
  tags?: string[];
  /** Query vector, passed through once its length has been validated. */
  embedding?: number[];
  /** Similarity floor clamped into 0..1. */
  minSimilarity?: number;
  /** Page size, always within 1..100. */
  limit: number;
  /** Rows to skip, never negative. */
  offset: number;
}
|
|
||
/** One row of search output: the matched asset plus its ranking score. */
export interface AssetSearchHit {
  /** The matching asset record. */
  asset: QuirkAsset;
  /**
   * Similarity score (1 − cosine distance to the query embedding) when the
   * search was ranked semantically; `null` for non-semantic searches.
   */
  similarity: number | null;
}
|
|
||
/**
 * Truncate `value` toward zero and clamp it into the inclusive range
 * `[min, max]`.
 *
 * `NaN` cannot be ordered and is propagated by `Math.min`/`Math.max`, so a
 * `NaN` input falls back to `min` rather than leaking `NaN` to the caller.
 * `Infinity` and `-Infinity` clamp to `max` and `min` respectively.
 *
 * @param value - Number to clamp; any fractional part is discarded.
 * @param min - Lower bound (inclusive).
 * @param max - Upper bound (inclusive).
 * @returns An integer-truncated value within `[min, max]`.
 */
function clampInt(value: number, min: number, max: number): number {
  if (Number.isNaN(value)) return min;
  return Math.min(Math.max(Math.trunc(value), min), max);
}
|
|
||
/**
 * Remove repeated entries from `values`, keeping first-seen order.
 * Yields `undefined` instead of an empty array so callers can treat
 * "no filter" and "empty filter" identically.
 */
function dedupe<T>(values?: T[]): T[] | undefined {
  if (values == null || values.length === 0) {
    return undefined;
  }
  const distinct = [...new Set(values)];
  return distinct.length === 0 ? undefined : distinct;
}
|
|
||
/**
 * Validate and normalize raw search input into a predictable shape. Pure and
 * database-free so it can be unit-tested in isolation.
 *
 * - `embedding` must have exactly `EMBEDDING_DIMENSIONS` finite numbers.
 * - `text` is trimmed; blank text becomes `undefined`.
 * - `tags` are trimmed, lower-cased, de-duplicated; empty results become
 *   `undefined`.
 * - `assetTypes` / `statuses` are de-duplicated.
 * - `minSimilarity` is clamped into 0..1; `NaN` is treated as "not provided".
 * - `limit` is truncated and clamped into 1..`MAX_LIMIT`; missing or `NaN`
 *   falls back to `DEFAULT_LIMIT`.
 * - `offset` is truncated and clamped into 0..`Number.MAX_SAFE_INTEGER`;
 *   missing or `NaN` falls back to 0.
 *
 * @param params - Raw caller input.
 * @returns The cleaned-up parameters.
 * @throws RangeError if `embedding` has the wrong length or contains a
 *   non-finite element (`NaN`, `Infinity`, `-Infinity`, or a hole).
 */
export function normalizeSearchParams(
  params: SearchAssetsParams,
): NormalizedSearchParams {
  const { embedding } = params;
  if (embedding !== undefined) {
    if (embedding.length !== EMBEDDING_DIMENSIONS) {
      throw new RangeError(
        `embedding must have ${EMBEDDING_DIMENSIONS} dimensions, received ${embedding.length}`,
      );
    }
    // Reject non-finite elements up front so malformed input fails here,
    // deterministically, instead of surfacing later as a database error.
    const badIndex = embedding.findIndex((value) => !Number.isFinite(value));
    if (badIndex !== -1) {
      throw new RangeError(
        `embedding must contain only finite numbers, received ${embedding[badIndex]} at index ${badIndex}`,
      );
    }
  }

  // `||` is deliberate: an empty string after trimming means "no text filter".
  const text = params.text?.trim() || undefined;

  const tags = params.tags
    ? dedupe(
        params.tags.map((tag) => tag.trim().toLowerCase()).filter(Boolean),
      )
    : undefined;

  // Math.min/Math.max propagate NaN, so a NaN threshold must be dropped
  // explicitly; +/-Infinity clamp naturally to 1 / 0.
  const minSimilarity =
    params.minSimilarity === undefined || Number.isNaN(params.minSimilarity)
      ? undefined
      : Math.min(Math.max(params.minSimilarity, 0), 1);

  // NaN survives truncation and clamping, so fall back to the defaults before
  // the values can reach LIMIT / OFFSET.
  const rawLimit = params.limit ?? DEFAULT_LIMIT;
  const limit = Number.isNaN(rawLimit)
    ? DEFAULT_LIMIT
    : clampInt(rawLimit, 1, MAX_LIMIT);

  const rawOffset = params.offset ?? 0;
  const offset = Number.isNaN(rawOffset)
    ? 0
    : Math.min(Math.max(0, Math.trunc(rawOffset)), Number.MAX_SAFE_INTEGER);

  return {
    text,
    assetTypes: dedupe(params.assetTypes),
    statuses: dedupe(params.statuses),
    tags,
    embedding,
    minSimilarity,
    limit,
    offset,
  };
}
|
|
||
/**
 * Backslash-escape the characters that are special inside a LIKE/ILIKE
 * pattern (`\`, `%`, `_`) so that user-supplied text only matches itself.
 */
function escapeLike(input: string): string {
  const specialChars = /[\\%_]/g;
  return input.replace(specialChars, (match) => "\\" + match);
}
|
|
||
/**
 * Search the asset registry by any combination of semantic similarity, free
 * text, type/status, and tags.
 *
 * When `embedding` is supplied, only assets that have an embedding are
 * returned and rows are ordered by descending similarity (1 − cosine
 * distance); `minSimilarity`, if set, is an inclusive lower bound. Without an
 * embedding, the newest matching assets come first and `similarity` is null.
 *
 * @param params - Raw search input; normalized via `normalizeSearchParams`.
 * @returns A `Result` wrapping the page of hits. Errors thrown while
 *   normalizing or querying are raised inside the `tryCatchAsync` callback.
 */
export async function searchAssets(
  params: SearchAssetsParams,
): Promise<Result<AssetSearchHit[]>> {
  return tryCatchAsync(async () => {
    const p = normalizeSearchParams(params);

    const similarity = p.embedding
      ? sql<number>`1 - (${cosineDistance(quirkAssets.embedding, p.embedding)})`
      : null;

    const conditions = [];

    if (p.text) {
      // Wildcards in the user's text are escaped so they match literally.
      const pattern = `%${escapeLike(p.text)}%`;
      conditions.push(
        or(
          ilike(quirkAssets.title, pattern),
          ilike(quirkAssets.rawText, pattern),
        ),
      );
    }

    if (p.assetTypes) {
      conditions.push(inArray(quirkAssets.assetType, p.assetTypes));
    }

    if (p.statuses) {
      conditions.push(inArray(quirkAssets.status, p.statuses));
    }

    if (p.tags) {
      // Labels are lower-cased in SQL to match the lower-cased, normalized tags.
      const taggedAssetIds = db
        .select({ id: quirkAnnotations.assetId })
        .from(quirkAnnotations)
        .where(
          and(
            eq(quirkAnnotations.annotationType, "tag"),
            inArray(sql`lower(${quirkAnnotations.label})`, p.tags),
          ),
        );
      conditions.push(inArray(quirkAssets.id, taggedAssetIds));
    }

    if (similarity) {
      conditions.push(isNotNull(quirkAssets.embedding));
      if (p.minSimilarity !== undefined) {
        // `minSimilarity` is documented as a minimum, so the bound must be
        // inclusive: a row scoring exactly the threshold is a valid hit.
        // Spelled as (> OR =), i.e. `>=`, using helpers already imported here.
        conditions.push(
          or(
            gt(similarity, p.minSimilarity),
            eq(similarity, p.minSimilarity),
          ),
        );
      }
    }

    const where = conditions.length > 0 ? and(...conditions) : undefined;

    const rows = await db
      .select({ asset: quirkAssets, similarity: similarity ?? sql<null>`null` })
      .from(quirkAssets)
      .where(where)
      .orderBy(
        similarity ? desc(similarity) : desc(quirkAssets.createdAt),
        // Tie-breaker so rows with equal scores/timestamps keep a stable
        // order across LIMIT/OFFSET pages (presumes `id` is unique — confirm
        // against the schema).
        desc(quirkAssets.id),
      )
      .limit(p.limit)
      .offset(p.offset);

    return rows.map((row) => ({
      asset: row.asset,
      // Coerce explicitly in case the driver returns the computed column as
      // a non-number value.
      similarity: row.similarity === null ? null : Number(row.similarity),
    }));
  });
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The embedding validation only checks array length, so vectors containing
`NaN`/`Infinity` pass normalization and are sent into the pgvector distance expression. pgvector requires finite elements, so this turns malformed caller input into a database error path (`Result` failure) instead of deterministic upfront validation, even though this function is intended to normalize and validate search parameters. Useful? React with 👍 / 👎.