Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions src/lib/db/search.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import { describe, expect, it } from "vitest";

import { EMBEDDING_DIMENSIONS, normalizeSearchParams } from "./search";

/** Build a zero vector with exactly the dimensionality the search layer expects. */
function embedding(): number[] {
  return Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0);
}

// Unit tests for the pure normalization step; no database is involved.
describe("normalizeSearchParams", () => {
  it("applies default limit and offset", () => {
    const { limit, offset } = normalizeSearchParams({});
    expect(limit).toBe(20);
    expect(offset).toBe(0);
  });

  it("clamps limit into the 1..100 range", () => {
    const limitFor = (limit: number) => normalizeSearchParams({ limit }).limit;
    expect(limitFor(0)).toBe(1);
    expect(limitFor(-10)).toBe(1);
    expect(limitFor(9999)).toBe(100);
  });

  it("never returns a negative offset", () => {
    const { offset } = normalizeSearchParams({ offset: -5 });
    expect(offset).toBe(0);
  });

  it("trims text and drops blank strings", () => {
    const textFor = (text: string) => normalizeSearchParams({ text }).text;
    expect(textFor(" hello ")).toBe("hello");
    expect(textFor(" ")).toBeUndefined();
  });

  it("lower-cases, trims, and de-duplicates tags", () => {
    const { tags } = normalizeSearchParams({
      tags: ["Lo-Fi", " lo-fi ", "Dreamy"],
    });
    expect(tags).toEqual(["lo-fi", "dreamy"]);
  });

  it("drops the tags field when nothing survives normalization", () => {
    const { tags } = normalizeSearchParams({ tags: [" ", ""] });
    expect(tags).toBeUndefined();
  });

  it("de-duplicates asset types and statuses", () => {
    const { assetTypes, statuses } = normalizeSearchParams({
      assetTypes: ["song", "song", "image"],
      statuses: ["captured", "captured"],
    });
    expect(assetTypes).toEqual(["song", "image"]);
    expect(statuses).toEqual(["captured"]);
  });

  it("clamps minSimilarity into the 0..1 range", () => {
    const thresholdFor = (minSimilarity: number) =>
      normalizeSearchParams({ embedding: embedding(), minSimilarity })
        .minSimilarity;
    expect(thresholdFor(2)).toBe(1);
    expect(thresholdFor(-1)).toBe(0);
  });

  it("accepts an embedding of the expected dimensionality", () => {
    const run = () => normalizeSearchParams({ embedding: embedding() });
    expect(run).not.toThrow();
  });

  it("rejects an embedding with the wrong dimensionality", () => {
    const run = () => normalizeSearchParams({ embedding: [1, 2, 3] });
    expect(run).toThrow(RangeError);
  });
});
194 changes: 194 additions & 0 deletions src/lib/db/search.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
import {
  and,
  cosineDistance,
  desc,
  eq,
  gt,
  gte,
  ilike,
  inArray,
  isNotNull,
  or,
  sql,
} from "drizzle-orm";

import type { Result } from "@/lib/result";
import { tryCatchAsync } from "@/lib/result";

import { db } from "./index";
import { quirkAnnotations, quirkAssets, type QuirkAsset } from "./schema";

/** Dimensionality of the `quirk_assets.embedding` pgvector column. */
export const EMBEDDING_DIMENSIONS = 1536;

/** Page size used when the caller does not supply `limit`. */
const DEFAULT_LIMIT = 20;
/** Upper bound applied to the page size during normalization. */
const MAX_LIMIT = 100;

/** Value type of the `assetType` property of `QuirkAsset`. */
type AssetType = QuirkAsset["assetType"];
/** Value type of the `status` property of `QuirkAsset`. */
type AssetStatus = QuirkAsset["status"];

/**
 * Raw, caller-supplied search options. Every field is optional; the values are
 * cleaned up by `normalizeSearchParams` before a query is built.
 */
export interface SearchAssetsParams {
/** Free-text fragment matched (case-insensitively) against title and raw text. */
text?: string;
/** Restrict to these asset types. */
assetTypes?: AssetType[];
/** Restrict to these statuses. */
statuses?: AssetStatus[];
/** Restrict to assets carrying a `tag` annotation with one of these labels. */
tags?: string[];
/** Query embedding; when present, results are ranked by cosine similarity. */
embedding?: number[];
/** Minimum cosine similarity (0..1); only applied when `embedding` is set. */
minSimilarity?: number;
/** Page size (clamped to 1..100, default 20). */
limit?: number;
/** Rows to skip (default 0). */
offset?: number;
}

/**
 * Output shape of `normalizeSearchParams`: the same filters as
 * `SearchAssetsParams`, but with `limit` and `offset` always present.
 */
export interface NormalizedSearchParams {
/** Trimmed free-text fragment; `undefined` when the input was missing or blank. */
text?: string;
/** De-duplicated asset types; `undefined` when none were given. */
assetTypes?: AssetType[];
/** De-duplicated statuses; `undefined` when none were given. */
statuses?: AssetStatus[];
/** Trimmed, lower-cased, de-duplicated tag labels; `undefined` when none survive. */
tags?: string[];
/** Query embedding, passed through as supplied once its length has been checked. */
embedding?: number[];
/** Similarity threshold clamped into 0..1; `undefined` when not supplied. */
minSimilarity?: number;
/** Page size, truncated to an integer and clamped to 1..MAX_LIMIT. */
limit: number;
/** Rows to skip, truncated to an integer and floored at 0. */
offset: number;
}

/** One row returned by `searchAssets`: the asset plus its optional ranking score. */
export interface AssetSearchHit {
/** The matching asset row. */
asset: QuirkAsset;
/** Cosine similarity in 0..1 when ranked semantically, otherwise null. */
similarity: number | null;
}

/**
 * Drop the fractional part of `value` (truncating toward zero) and pin the
 * result into the inclusive range `min..max`.
 */
function clampInt(value: number, min: number, max: number): number {
  const whole = Math.trunc(value);
  const raisedToMin = Math.max(whole, min);
  return Math.min(raisedToMin, max);
}

/**
 * Return the distinct elements of `values` in first-seen order, or `undefined`
 * when the input is missing or empty.
 */
function dedupe<T>(values?: T[]): T[] | undefined {
  if (!values?.length) {
    return undefined;
  }

  const seen = new Set<T>();
  const distinct: T[] = [];
  for (const value of values) {
    if (!seen.has(value)) {
      seen.add(value);
      distinct.push(value);
    }
  }

  return distinct.length === 0 ? undefined : distinct;
}

/**
 * Validate and normalize raw search input into a predictable shape: limits are
 * clamped, text is trimmed, tags are lower-cased and de-duplicated, and a
 * malformed embedding is rejected up front. Pure and database-free so it can
 * be unit-tested in isolation.
 *
 * Numeric inputs that are not usable numbers (typically `NaN` produced by
 * `Number("abc")` when parsing query strings) never reach the query builder:
 * a `NaN` limit falls back to the default page size, a non-finite offset falls
 * back to 0, and a `NaN` similarity threshold is dropped.
 *
 * @param params - Raw, caller-supplied search options.
 * @returns The normalized parameters; optional filters that end up empty are
 *   `undefined`, while `limit` and `offset` are always set.
 * @throws RangeError if `embedding` does not have exactly
 *   `EMBEDDING_DIMENSIONS` elements or contains a non-finite element.
 */
export function normalizeSearchParams(
  params: SearchAssetsParams,
): NormalizedSearchParams {
  const { embedding } = params;
  if (embedding !== undefined) {
    if (embedding.length !== EMBEDDING_DIMENSIONS) {
      throw new RangeError(
        `embedding must have ${EMBEDDING_DIMENSIONS} dimensions, received ${embedding.length}`,
      );
    }

    // pgvector only accepts finite elements, so reject NaN/Infinity here with
    // a deterministic error instead of letting it surface as a database error.
    const badIndex = embedding.findIndex((value) => !Number.isFinite(value));
    if (badIndex !== -1) {
      throw new RangeError(
        `embedding must contain only finite numbers, received ${String(embedding[badIndex])} at index ${badIndex}`,
      );
    }
  }

  // A blank string collapses to "" after trimming and is reported as absent.
  const text = params.text?.trim() || undefined;

  const tags = params.tags
    ? dedupe(
        params.tags.map((tag) => tag.trim().toLowerCase()).filter(Boolean),
      )
    : undefined;

  // Math.min/Math.max propagate NaN, so a NaN threshold would survive the
  // clamp and make the similarity predicate reject every row. Treat it as
  // "no threshold" instead. +/-Infinity clamp correctly to 1 / 0.
  const rawMinSimilarity = params.minSimilarity;
  const minSimilarity =
    rawMinSimilarity === undefined || Number.isNaN(rawMinSimilarity)
      ? undefined
      : Math.min(Math.max(rawMinSimilarity, 0), 1);

  // NaN would pass straight through clampInt; fall back to the default page
  // size. +/-Infinity are handled by the clamp itself.
  const rawLimit = params.limit ?? DEFAULT_LIMIT;
  const limit = clampInt(
    Number.isNaN(rawLimit) ? DEFAULT_LIMIT : rawLimit,
    1,
    MAX_LIMIT,
  );

  // Neither NaN nor Infinity is a usable OFFSET; start from the first row.
  const rawOffset = params.offset ?? 0;
  const offset = Number.isFinite(rawOffset)
    ? Math.max(0, Math.trunc(rawOffset))
    : 0;

  return {
    text,
    assetTypes: dedupe(params.assetTypes),
    statuses: dedupe(params.statuses),
    tags,
    embedding,
    minSimilarity,
    limit,
    offset,
  };
}

/**
 * Prefix every backslash, `%`, and `_` in `input` with a backslash so that a
 * LIKE/ILIKE pattern built from it matches the user's text literally.
 */
function escapeLike(input: string): string {
  // `$&` in a replacement string stands for the matched character.
  return input.replace(/[\\%_]/g, "\\$&");
}

/**
 * Search the asset registry by any combination of semantic similarity, free
 * text, type/status, and tags. When `embedding` is supplied, only assets that
 * have an embedding are returned and rows are ordered by descending cosine
 * similarity; otherwise the newest matching assets come first. Ties are broken
 * by asset id so that limit/offset paging is deterministic.
 *
 * @param params - Raw search options; normalized with `normalizeSearchParams`
 *   before the query is built.
 * @returns The `Result` produced by `tryCatchAsync` around the query
 *   (presumably a failure result when normalization or the query throws —
 *   confirm against `@/lib/result`). On success it holds one hit per matching
 *   row, with `similarity` set to `null` for non-semantic searches.
 */
export async function searchAssets(
  params: SearchAssetsParams,
): Promise<Result<AssetSearchHit[]>> {
  return tryCatchAsync(async () => {
    const p = normalizeSearchParams(params);

    // Similarity is expressed as 1 - cosine distance so that larger is better.
    const similarity = p.embedding
      ? sql<number>`1 - (${cosineDistance(quirkAssets.embedding, p.embedding)})`
      : null;

    const conditions = [];

    if (p.text) {
      // Wildcards in user text are escaped so they only match literally.
      const pattern = `%${escapeLike(p.text)}%`;
      conditions.push(
        or(
          ilike(quirkAssets.title, pattern),
          ilike(quirkAssets.rawText, pattern),
        ),
      );
    }

    if (p.assetTypes) {
      conditions.push(inArray(quirkAssets.assetType, p.assetTypes));
    }

    if (p.statuses) {
      conditions.push(inArray(quirkAssets.status, p.statuses));
    }

    if (p.tags) {
      // Tags were lower-cased during normalization, so compare against the
      // lower-cased label to keep the match case-insensitive.
      const taggedAssetIds = db
        .select({ id: quirkAnnotations.assetId })
        .from(quirkAnnotations)
        .where(
          and(
            eq(quirkAnnotations.annotationType, "tag"),
            inArray(sql`lower(${quirkAnnotations.label})`, p.tags),
          ),
        );
      conditions.push(inArray(quirkAssets.id, taggedAssetIds));
    }

    if (similarity) {
      conditions.push(isNotNull(quirkAssets.embedding));
      if (p.minSimilarity !== undefined) {
        // `minSimilarity` is a minimum, so the bound is inclusive: a row
        // exactly at the threshold (e.g. a perfect match with
        // minSimilarity = 1) must be kept.
        conditions.push(gte(similarity, p.minSimilarity));
      }
    }

    const where = conditions.length > 0 ? and(...conditions) : undefined;

    const rows = await db
      .select({ asset: quirkAssets, similarity: similarity ?? sql<null>`null` })
      .from(quirkAssets)
      .where(where)
      .orderBy(
        similarity ? desc(similarity) : desc(quirkAssets.createdAt),
        // Secondary key: without it, rows with equal similarity/createdAt can
        // be repeated or skipped between pages.
        desc(quirkAssets.id),
      )
      .limit(p.limit)
      .offset(p.offset);

    return rows.map((row) => ({
      asset: row.asset,
      // Coerce with Number() in case the driver returns the computed
      // similarity expression as a string rather than a number.
      similarity: row.similarity === null ? null : Number(row.similarity),
    }));
  });
}
Loading