From 76736c4372474883021f9f6702be439da0416ef7 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 10:53:48 +0800 Subject: [PATCH 01/12] good wip --- bun.lock | 4 +- packages/core/package.json | 2 +- .../core/src/db/schema/entities/issue.sql.ts | 3 + packages/core/src/db/utils/json.ts | 36 +++++- packages/core/src/embedding.ts | 30 ++++- packages/core/src/openai/index.ts | 2 + packages/core/src/openai/schema.ts | 8 ++ packages/core/src/semsearch/db.ts | 4 +- packages/core/src/summary.ts | 121 ++++++++++++++++++ .../sync/embedding/embedding.workflow.ts | 101 ++++++++++++++- .../workflows/sync/issue/issue.workflow.ts | 2 - .../workflows/sync/repo-init/init.workflow.ts | 2 +- 12 files changed, 298 insertions(+), 17 deletions(-) create mode 100644 packages/core/src/summary.ts diff --git a/bun.lock b/bun.lock index 330a5a5a..1d375b0c 100644 --- a/bun.lock +++ b/bun.lock @@ -32,7 +32,7 @@ "gql.tada": "^1.8.10", "graphql": "^16.9.0", "octokit": "^4.0.2", - "openai": "^4.69.0", + "openai": "^4.83.0", "postgres": "^3.4.4", "resend": "^4.0.1", "tldts": "^6.1.68", @@ -1352,7 +1352,7 @@ "oidc-token-hash": ["oidc-token-hash@5.0.3", "", {}, "sha512-IF4PcGgzAr6XXSff26Sk/+P4KZFJVuHAJZj3wgO3vX2bMdNVp/QXTP3P7CEm9V1IdG8lDLY3HhiqpsE/nOwpPw=="], - "openai": ["openai@4.69.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" }, "peerDependencies": { "zod": "^3.23.8" }, "optionalPeers": ["zod"], "bin": { "openai": "bin/cli" } }, "sha512-S3hOHSkk609KqwgH+7dwFrSvO3Gm3Nk0YWGyPHNscoMH/Y2tH1qunMi7gtZnLbUv4/N1elqCp6bDior2401kCQ=="], + "openai": ["openai@4.83.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" }, "peerDependencies": { "ws": "^8.18.0", "zod": "^3.23.8" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-fmTsqud0uTtRKsPC7L8Lu55dkaTwYucqncDHzVvO64DKOpNTuiYwjbR/nVgpapXuYy8xSnhQQPUm+3jQaxICgw=="], "openid-client": ["openid-client@5.6.4", "", { "dependencies": { "jose": "^4.15.4", "lru-cache": "^6.0.0", "object-hash": "^2.2.0", "oidc-token-hash": "^5.0.3" } }, "sha512-T1h3B10BRPKfcObdBklX639tVz+xh34O7GjofqrqiAQdm7eHsQ00ih18x6wuJ/E6FxdtS2u3FmUGPDeEcMwzNA=="], diff --git a/packages/core/package.json b/packages/core/package.json index 915c90af..d5af2f27 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -28,7 +28,7 @@ "gql.tada": "^1.8.10", "graphql": "^16.9.0", "octokit": "^4.0.2", - "openai": "^4.69.0", + "openai": "^4.83.0", "postgres": "^3.4.4", "resend": "^4.0.1", "tldts": "^6.1.68", diff --git a/packages/core/src/db/schema/entities/issue.sql.ts b/packages/core/src/db/schema/entities/issue.sql.ts index 33a4fb80..3b474f62 100644 --- a/packages/core/src/db/schema/entities/issue.sql.ts +++ b/packages/core/src/db/schema/entities/issue.sql.ts @@ -38,6 +38,9 @@ export const issueTable = pgTable( htmlUrl: text("html_url").notNull(), title: text("title").notNull(), body: text("body").notNull(), + overallSummary: text("overall_summary"), + bodySummary: text("body_summary"), + commentsSummary: text("comments_summary"), aggregateReactions: jsonb( "aggregate_reactions", ).$type(), diff --git a/packages/core/src/db/utils/json.ts b/packages/core/src/db/utils/json.ts index 9be03b39..bd2fa708 100644 --- a/packages/core/src/db/utils/json.ts +++ b/packages/core/src/db/utils/json.ts @@ -63,8 +63,7 @@ export function jsonArrayContains< )`; } -// improvised somewhat, probably not the best way to do this -export function jsonAggBuildObjectFromJoin< +export function jsonAggBuildObjectManyToMany< T extends SelectedFields, Column extends AnyColumn, >( @@ -100,3 +99,36 @@ export function jsonAggBuildObjectFromJoin< '[]'::json )`; } + +// Simpler version for one-to-many relationships where we just need to aggregate related rows +export function jsonAggBuildObjectOneToMany< + T extends SelectedFields, + Column extends AnyColumn, +>( + shape: T, + { + from, + foreignKeyEquals, + orderBy, + }: { + from: Table; + foreignKeyEquals: SQL; + orderBy?: { colName: Column; direction: "ASC" | "DESC" }; + }, +) { + return sql[]>` + COALESCE( + ( + SELECT json_agg(${jsonBuildObject(shape)} + ${ + orderBy + ? sql`ORDER BY ${orderBy.colName} ${sql.raw(orderBy.direction)}` + : undefined + } + ) + FROM ${sql`${from}`} + WHERE ${foreignKeyEquals} + ), + '[]'::json + )`; +} diff --git a/packages/core/src/embedding.ts b/packages/core/src/embedding.ts index b6c6b2a4..e8f1af27 100644 --- a/packages/core/src/embedding.ts +++ b/packages/core/src/embedding.ts @@ -5,6 +5,7 @@ import { truncateCodeBlocks, truncateToByteSize } from "@/util/truncate"; import type { DbClient } from "./db"; import { and, asc, eq, gt, inArray, isNull, lt, ne, or, sql } from "./db"; +import { comments as commentTable } from "./db/schema/entities/comment.sql"; import { issueEmbeddings } from "./db/schema/entities/issue-embedding.sql"; import { issuesToLabels } from "./db/schema/entities/issue-to-label.sql"; import type { SelectIssueForEmbedding } from "./db/schema/entities/issue.schema"; @@ -14,7 +15,10 @@ import { labels as labelTable } from "./db/schema/entities/label.sql"; import { repos } from "./db/schema/entities/repo.sql"; import { conflictUpdateOnly } from "./db/utils/conflict"; import { convertToSqlRaw } from "./db/utils/general"; -import { jsonAggBuildObjectFromJoin } from "./db/utils/json"; +import { + jsonAggBuildObjectManyToMany, + jsonAggBuildObjectOneToMany, +} from "./db/utils/json"; import { EMBEDDING_MODEL, type OpenAIClient } from "./openai"; import { isReducePromptError } from "./openai/errors"; import { embeddingsCreateSchema } from "./openai/schema"; @@ -98,7 +102,7 @@ export async function selectIssuesForEmbeddingInit( issueStateReason: issueTable.issueStateReason, issueCreatedAt: issueTable.issueCreatedAt, issueClosedAt: issueTable.issueClosedAt, - labels: jsonAggBuildObjectFromJoin( + labels: jsonAggBuildObjectManyToMany( { name: labelTable.name, description: labelTable.description, @@ -110,6 +114,16 @@ export async function selectIssuesForEmbeddingInit( whereCondition: eq(issuesToLabels.issueId, issueTable.id), }, ), + comments: jsonAggBuildObjectOneToMany( + { + body: commentTable.body, + author: commentTable.author, + }, + { + from: commentTable, + foreignKeyEquals: eq(commentTable.issueId, issueTable.id), + }, + ), }) .from(issueTable) .leftJoin(issueEmbeddings, eq(issueEmbeddings.issueId, issueTable.id)) @@ -178,7 +192,7 @@ export async function selectIssuesForEmbeddingCron({ issueStateReason: lockedIssues.issueStateReason, issueCreatedAt: lockedIssues.issueCreatedAt, issueClosedAt: lockedIssues.issueClosedAt, - labels: jsonAggBuildObjectFromJoin( + labels: jsonAggBuildObjectManyToMany( { name: labelTable.name, description: labelTable.description, @@ -190,6 +204,16 @@ export async function selectIssuesForEmbeddingCron({ whereCondition: eq(issuesToLabels.issueId, lockedIssues.id), }, ), + comments: jsonAggBuildObjectOneToMany( + { + body: commentTable.body, + author: commentTable.author, + }, + { + from: commentTable, + foreignKeyEquals: eq(commentTable.issueId, lockedIssues.id), + }, + ), }) .from(lockedIssues) .leftJoin(issueEmbeddings, eq(issueEmbeddings.issueId, lockedIssues.id)) diff --git a/packages/core/src/openai/index.ts b/packages/core/src/openai/index.ts index 86e8af54..59fbeeb9 100644 --- a/packages/core/src/openai/index.ts +++ b/packages/core/src/openai/index.ts @@ -9,3 +9,5 @@ export function createOpenAIClient(apiKey: string) { export type OpenAIClient = ReturnType; export const EMBEDDING_MODEL = "text-embedding-3-small"; + +export const SUMMARY_MODEL = "o3-mini-2025-01-31"; diff --git a/packages/core/src/openai/schema.ts b/packages/core/src/openai/schema.ts index 6564ac2c..6c920ab6 100644 --- a/packages/core/src/openai/schema.ts +++ b/packages/core/src/openai/schema.ts @@ -12,3 +12,11 @@ export const embeddingsCreateSchema = z model: z.string(), }) .strip(); + +export const chatCompletionSchema = z.object({ + choices: z.array( + z.object({ + message: z.object({ content: z.string() }), + }), + ), +}); diff --git a/packages/core/src/semsearch/db.ts b/packages/core/src/semsearch/db.ts index 41f9a388..40ef394c 100644 --- a/packages/core/src/semsearch/db.ts +++ b/packages/core/src/semsearch/db.ts @@ -14,7 +14,7 @@ import { publicCollections } from "@/db/schema/entities/public-collection.sql"; import { repos } from "@/db/schema/entities/repo.sql"; import { usersToRepos } from "@/db/schema/entities/user-to-repo.sql"; import { lower } from "@/db/utils/general"; -import { jsonAggBuildObjectFromJoin, jsonContains } from "@/db/utils/json"; +import { jsonAggBuildObjectManyToMany, jsonContains } from "@/db/utils/json"; import type { SearchParams } from "./schema.output"; import { parseSearchQuery } from "./util"; @@ -24,7 +24,7 @@ export function getBaseSelect() { id: issueTable.id, number: issueTable.number, title: issueTable.title, - labels: jsonAggBuildObjectFromJoin( + labels: jsonAggBuildObjectManyToMany( { name: labels.name, color: labels.color, diff --git a/packages/core/src/summary.ts b/packages/core/src/summary.ts new file mode 100644 index 00000000..3a66a348 --- /dev/null +++ b/packages/core/src/summary.ts @@ -0,0 +1,121 @@ +import type { OpenAIClient } from "./openai"; +import { SUMMARY_MODEL } from "./openai"; +import { chatCompletionSchema } from "./openai/schema"; + +async function summarize( + { + textToSummarize, + systemPrompt, + userInstructions, + temperature = 0.2, + reasoningEffort = "medium", + }: { + textToSummarize: string; + systemPrompt: string; + userInstructions: string; + temperature?: number; + reasoningEffort?: "low" | "medium" | "high"; + }, + openai: OpenAIClient, +): Promise { + const messages = [ + { + role: "system" as const, + content: systemPrompt, + }, + { + role: "user" as const, + content: `${userInstructions}\n\n${textToSummarize}`, + }, + ]; + + const response = await openai.chat.completions.create({ + model: SUMMARY_MODEL, + messages, + temperature, + reasoning_effort: reasoningEffort, + }); + const result = chatCompletionSchema.parse(response); + return result.choices[0]!.message.content; +} + +// Predefined prompts and instructions +const PROMPTS = { + issueBody: { + system: + "You are a helpful assistant that generates concise summaries of GitHub issue descriptions. Focus on the problem, proposed solutions, and key technical details.", + user: "Please summarize this GitHub issue description:", + }, + comments: { + system: + "You are a helpful assistant that generates concise summaries of GitHub issue comments. Focus on key decisions, solutions proposed, and final outcomes.", + user: "Please summarize the discussion in these GitHub issue comments:", + }, + overall: { + system: + "You are a helpful assistant that generates concise overall summaries of GitHub issues. Synthesize the issue description and discussion into a clear summary.", + user: "Please provide a concise overall summary of this GitHub issue based on these summaries:", + }, +} as const; + +export async function generateBodySummary( + body: string, + openai: OpenAIClient, +): Promise { + return await summarize( + { + textToSummarize: body, + systemPrompt: PROMPTS.issueBody.system, + userInstructions: PROMPTS.issueBody.user, + temperature: 0.2, + reasoningEffort: "high", + }, + openai, + ); +} + +export async function generateCommentsSummary( + comments: Array<{ body: string }>, + openai: OpenAIClient, +): Promise { + if (!comments.length) { + return ""; + } + + return await summarize( + { + textToSummarize: comments.map((c) => c.body).join("\n\n"), + systemPrompt: PROMPTS.comments.system, + userInstructions: PROMPTS.comments.user, + temperature: 0.3, + }, + openai, + ); +} + +export async function generateOverallSummary( + params: { + bodySummary: string; + commentsSummary?: string; + }, + openai: OpenAIClient, +): Promise { + const text = `Issue description summary: +${params.bodySummary} + +Discussion summary: +${params.commentsSummary || "No discussion"}`; + + return await summarize( + { + textToSummarize: text, + systemPrompt: PROMPTS.overall.system, + userInstructions: PROMPTS.overall.user, + temperature: 0.3, + }, + openai, + ); +} + +// Export for testing or custom usage +export { summarize, PROMPTS }; diff --git a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts index c4b16b26..fc97c0d4 100644 --- a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts +++ b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts @@ -5,6 +5,7 @@ import pMap from "p-map"; import type { WranglerEnv } from "@/core/constants/wrangler.constant"; import { eq, inArray } from "@/core/db"; import { issueEmbeddings } from "@/core/db/schema/entities/issue-embedding.sql"; +import { issueTable } from "@/core/db/schema/entities/issue.sql"; import { repos } from "@/core/db/schema/entities/repo.sql"; import { sendEmail } from "@/core/email"; import { @@ -13,6 +14,11 @@ import { selectIssuesForEmbeddingInit, upsertIssueEmbeddings, } from "@/core/embedding"; +import { + generateBodySummary, + generateCommentsSummary, + generateOverallSummary, +} from "@/core/summary"; import { chunkArray } from "@/core/util/truncate"; import { getDeps } from "@/deps"; import { getEnvPrefix } from "@/util"; @@ -30,8 +36,9 @@ interface Env extends WranglerEnv { } /* two modes -1. as part of repo init. takes an array of issueIds (100 at a time), calls DB, creates embeddings, update DB -2. as part of cron sync. no parameter. just query all out-of-sync issueIds 100 at a time, create embeddings, update DB, calls itself recursively until no more such issues are found +1. as part of repo init. takes an array of issueIds (100 at a time), calls DB, creates embeddings and generate summaries, update DB +2. as part of cron sync. no parameter. just query all out-of-sync issueIds 100 at a time, create embeddings and generate summaries, update DB +calls itself recursively until no more such issues are found */ export type EmbeddingParams = | { @@ -86,16 +93,102 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< idx: number, totalBatches: number, ): Promise => { + // Generate body summaries in parallel + const bodySummaries = await step.do( + `generate body summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("medium"), + async () => { + return await Promise.all( + issues.map(async (issue) => ({ + issueId: issue.id, + bodySummary: await generateBodySummary(issue.body, openai), + })), + ); + }, + ); + + // Generate comment summaries in parallel + const commentSummaries = await step.do( + `generate comment summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("medium"), + async () => { + return await Promise.all( + issues.map(async (issue) => ({ + issueId: issue.id, + commentsSummary: await generateCommentsSummary( + issue.comments, + openai, + ), + })), + ); + }, + ); + + // Generate overall summaries using both + const overallSummaries = await step.do( + `generate overall summaries (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("medium"), + async () => { + return await Promise.all( + issues.map(async (issue) => { + const bodySummary = + bodySummaries.find((s) => s.issueId === issue.id) + ?.bodySummary || ""; + const commentsSummary = commentSummaries.find( + (s) => s.issueId === issue.id, + )?.commentsSummary; + return { + issueId: issue.id, + overallSummary: await generateOverallSummary( + { bodySummary, commentsSummary }, + openai, + ), + }; + }), + ); + }, + ); + + // Update issues with summaries + await step.do( + `update issues with summaries in db (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("medium"), + async () => { + for (const issue of issues) { + await dbSession + .update(issueTable) + .set({ + bodySummary: bodySummaries.find((s) => s.issueId === issue.id) + ?.bodySummary, + commentsSummary: commentSummaries.find( + (s) => s.issueId === issue.id, + )?.commentsSummary, + overallSummary: overallSummaries.find( + (s) => s.issueId === issue.id, + )?.overallSummary, + }) + .where(eq(issueTable.id, issue.id)); + } + }, + ); + + // Create embeddings using overall summaries const embeddings = await step.do( - `create embeddings for selected issues from API (batch ${idx + 1} of ${totalBatches})`, + `create embeddings using overall summaries (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), async () => { return await createEmbeddings({ - issues, + issues: issues.map((issue) => ({ + ...issue, + body: + overallSummaries.find((s) => s.issueId === issue.id) + ?.overallSummary || issue.body, + })), openai, }); }, ); + await step.do( `upsert issue embeddings in db (batch ${idx + 1})`, getStepDuration("medium"), diff --git a/packages/wrangler/src/workflows/sync/issue/issue.workflow.ts b/packages/wrangler/src/workflows/sync/issue/issue.workflow.ts index f84d6849..684528c0 100644 --- a/packages/wrangler/src/workflows/sync/issue/issue.workflow.ts +++ b/packages/wrangler/src/workflows/sync/issue/issue.workflow.ts @@ -70,8 +70,6 @@ export class IssueWorkflow extends WorkflowEntrypoint { const name = `${repoOwner}/${repoName}`; caughtName = name; caughtRepoId = repoId; - // don't have to worry about getting same issues twice because - // we are using hasNextPage to determine if we should continue let syncCursor = repoSyncCursor; while (true) { diff --git a/packages/wrangler/src/workflows/sync/repo-init/init.workflow.ts b/packages/wrangler/src/workflows/sync/repo-init/init.workflow.ts index d71ef3c6..8d53f32c 100644 --- a/packages/wrangler/src/workflows/sync/repo-init/init.workflow.ts +++ b/packages/wrangler/src/workflows/sync/repo-init/init.workflow.ts @@ -135,7 +135,7 @@ export class RepoInitWorkflow extends WorkflowEntrypoint { attempt++ ) { const numIssues = getNumIssues(attempt); - // only use queryCursor's after if its since is the same as the previous + // only use syncCursor's after if its since is the same as the previous // else, just use null and use the new since const result = await getLatestGithubRepoIssues({ repoId, From 043577c7e95a8717b8cd246090f96bf5e6ce882d Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 11:26:04 +0800 Subject: [PATCH 02/12] more progress --- .../src/db/schema/entities/issue.schema.ts | 35 ++++--- packages/core/src/db/utils/json.ts | 1 - packages/core/src/summary.ts | 99 +++++++++++++++++-- .../sync/embedding/embedding.workflow.ts | 98 +++++++++--------- 4 files changed, 161 insertions(+), 72 deletions(-) diff --git a/packages/core/src/db/schema/entities/issue.schema.ts b/packages/core/src/db/schema/entities/issue.schema.ts index 756a4fcd..94a8dae1 100644 --- a/packages/core/src/db/schema/entities/issue.schema.ts +++ b/packages/core/src/db/schema/entities/issue.schema.ts @@ -1,7 +1,7 @@ // putting these in a separate file so that migrations can be generated as is // currently a bug in drizzle-zod vs drizzle-kit interaction import { createInsertSchema, createSelectSchema } from "drizzle-zod"; -import type { z } from "zod"; +import { z } from "zod"; import { aggregateReactionsSchema, @@ -26,17 +26,28 @@ const selectIssueSchema = createSelectSchema(issueTable).extend({ export type SelectIssue = z.infer; -const _selectIssueForEmbeddingSchema = selectIssueSchema.pick({ - id: true, - number: true, - author: true, - title: true, - body: true, - issueState: true, - issueStateReason: true, - issueCreatedAt: true, - issueClosedAt: true, -}); +const _selectIssueForEmbeddingSchema = selectIssueSchema + .pick({ + id: true, + number: true, + author: true, + title: true, + body: true, + issueState: true, + issueStateReason: true, + issueCreatedAt: true, + issueClosedAt: true, + }) + .extend({ + labels: z + .array( + z.object({ + name: z.string(), + description: z.string().nullable(), + }), + ) + .optional(), + }); export type SelectIssueForEmbedding = z.infer< typeof _selectIssueForEmbeddingSchema diff --git a/packages/core/src/db/utils/json.ts b/packages/core/src/db/utils/json.ts index bd2fa708..aafb0c02 100644 --- a/packages/core/src/db/utils/json.ts +++ b/packages/core/src/db/utils/json.ts @@ -100,7 +100,6 @@ export function jsonAggBuildObjectManyToMany< )`; } -// Simpler version for one-to-many relationships where we just need to aggregate related rows export function jsonAggBuildObjectOneToMany< T extends SelectedFields, Column extends AnyColumn, diff --git a/packages/core/src/summary.ts b/packages/core/src/summary.ts index 3a66a348..17cf076f 100644 --- a/packages/core/src/summary.ts +++ b/packages/core/src/summary.ts @@ -1,3 +1,10 @@ +import dedent from "dedent"; + +import { inArray, sql } from "./db"; +import type { DbClient } from "./db"; +import type { SelectIssueForEmbedding } from "./db/schema/entities/issue.schema"; +import { issueTable } from "./db/schema/entities/issue.sql"; +import type { Author } from "./db/schema/shared"; import type { OpenAIClient } from "./openai"; import { SUMMARY_MODEL } from "./openai"; import { chatCompletionSchema } from "./openai/schema"; @@ -8,7 +15,7 @@ async function summarize( systemPrompt, userInstructions, temperature = 0.2, - reasoningEffort = "medium", + reasoningEffort = "high", }: { textToSummarize: string; systemPrompt: string; @@ -67,15 +74,13 @@ export async function generateBodySummary( textToSummarize: body, systemPrompt: PROMPTS.issueBody.system, userInstructions: PROMPTS.issueBody.user, - temperature: 0.2, - reasoningEffort: "high", }, openai, ); } export async function generateCommentsSummary( - comments: Array<{ body: string }>, + comments: Array<{ body: string; author: Author }>, openai: OpenAIClient, ): Promise { if (!comments.length) { @@ -84,10 +89,16 @@ export async function generateCommentsSummary( return await summarize( { - textToSummarize: comments.map((c) => c.body).join("\n\n"), + textToSummarize: comments + .map((c) => { + const authorName = c.author?.name || "Deleted User"; + return dedent` + ${authorName} wrote: + ${c.body}`; + }) + .join("\n\n---\n\n"), systemPrompt: PROMPTS.comments.system, userInstructions: PROMPTS.comments.user, - temperature: 0.3, }, openai, ); @@ -97,14 +108,35 @@ export async function generateOverallSummary( params: { bodySummary: string; commentsSummary?: string; + issue: SelectIssueForEmbedding; }, openai: OpenAIClient, ): Promise { - const text = `Issue description summary: -${params.bodySummary} + const { + number, + title, + author, + issueState: state, + issueStateReason: stateReason, + issueCreatedAt: createdAt, + issueClosedAt: closedAt, + labels, + } = params.issue; + + const text = dedent` + Issue #${number}: ${title} + + Description Summary: + ${params.bodySummary} -Discussion summary: -${params.commentsSummary || "No discussion"}`; + Discussion Summary: + ${params.commentsSummary || "No discussion"} + + Additional Context: + - State: ${state}${stateReason ? `, Reason: ${stateReason}` : ""} + - Author: ${author?.name || "Anonymous"} + - Created: ${createdAt.toISOString()}${closedAt ? `\n- Closed: ${closedAt.toISOString()}` : ""} + ${labels?.length ? `- Labels: ${labels.map((label) => `${label.name}${label.description ? ` (${label.description})` : ""}`).join(", ")}` : ""}`; return await summarize( { @@ -117,5 +149,52 @@ ${params.commentsSummary || "No discussion"}`; ); } +interface IssueSummary { + issueId: string; + bodySummary?: string | null; + commentsSummary?: string | null; + overallSummary?: string | null; +} + +export async function bulkUpdateIssueSummaries( + summaries: IssueSummary[], + db: DbClient, +): Promise { + if (summaries.length === 0) return; + + const sqlChunks = { + bodySummary: [sql`(case`], + commentsSummary: [sql`(case`], + overallSummary: [sql`(case`], + }; + + const issueIds = summaries.map((s) => s.issueId); + + for (const summary of summaries) { + sqlChunks.bodySummary.push( + sql`when id = ${summary.issueId} then ${summary.bodySummary}`, + ); + sqlChunks.commentsSummary.push( + sql`when id = ${summary.issueId} then ${summary.commentsSummary}`, + ); + sqlChunks.overallSummary.push( + sql`when id = ${summary.issueId} then ${summary.overallSummary}`, + ); + } + + for (const key of Object.keys(sqlChunks) as Array) { + sqlChunks[key].push(sql`end)`); + } + + await db + .update(issueTable) + .set({ + bodySummary: sql.join(sqlChunks.bodySummary, sql.raw(" ")), + commentsSummary: sql.join(sqlChunks.commentsSummary, sql.raw(" ")), + overallSummary: sql.join(sqlChunks.overallSummary, sql.raw(" ")), + }) + .where(inArray(issueTable.id, issueIds)); +} + // Export for testing or custom usage export { summarize, PROMPTS }; diff --git a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts index fc97c0d4..8263606d 100644 --- a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts +++ b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts @@ -5,7 +5,6 @@ import pMap from "p-map"; import type { WranglerEnv } from "@/core/constants/wrangler.constant"; import { eq, inArray } from "@/core/db"; import { issueEmbeddings } from "@/core/db/schema/entities/issue-embedding.sql"; -import { issueTable } from "@/core/db/schema/entities/issue.sql"; import { repos } from "@/core/db/schema/entities/repo.sql"; import { sendEmail } from "@/core/email"; import { @@ -15,6 +14,7 @@ import { upsertIssueEmbeddings, } from "@/core/embedding"; import { + bulkUpdateIssueSummaries, generateBodySummary, generateCommentsSummary, generateOverallSummary, @@ -93,38 +93,37 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< idx: number, totalBatches: number, ): Promise => { - // Generate body summaries in parallel - const bodySummaries = await step.do( - `generate body summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, - getStepDuration("medium"), - async () => { - return await Promise.all( - issues.map(async (issue) => ({ - issueId: issue.id, - bodySummary: await generateBodySummary(issue.body, openai), - })), - ); - }, - ); - - // Generate comment summaries in parallel - const commentSummaries = await step.do( - `generate comment summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, - getStepDuration("medium"), - async () => { - return await Promise.all( - issues.map(async (issue) => ({ - issueId: issue.id, - commentsSummary: await generateCommentsSummary( - issue.comments, - openai, - ), - })), - ); - }, - ); + // Generate both summaries in parallel + const [bodySummaries, commentSummaries] = await Promise.all([ + step.do( + `generate body summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("medium"), + async () => { + return await Promise.all( + issues.map(async (issue) => ({ + issueId: issue.id, + bodySummary: await generateBodySummary(issue.body, openai), + })), + ); + }, + ), + step.do( + `generate comment summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("medium"), + async () => { + return await Promise.all( + issues.map(async (issue) => ({ + issueId: issue.id, + commentsSummary: await generateCommentsSummary( + issue.comments, + openai, + ), + })), + ); + }, + ), + ]); - // Generate overall summaries using both const overallSummaries = await step.do( `generate overall summaries (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), @@ -140,7 +139,11 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< return { issueId: issue.id, overallSummary: await generateOverallSummary( - { bodySummary, commentsSummary }, + { + bodySummary, + commentsSummary, + issue, + }, openai, ), }; @@ -151,24 +154,21 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< // Update issues with summaries await step.do( - `update issues with summaries in db (batch ${idx + 1} of ${totalBatches})`, + `bulk update issues with summaries in db (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), async () => { - for (const issue of issues) { - await dbSession - .update(issueTable) - .set({ - bodySummary: bodySummaries.find((s) => s.issueId === issue.id) - ?.bodySummary, - commentsSummary: commentSummaries.find( - (s) => s.issueId === issue.id, - )?.commentsSummary, - overallSummary: overallSummaries.find( - (s) => s.issueId === issue.id, - )?.overallSummary, - }) - .where(eq(issueTable.id, issue.id)); - } + const summaries = issues.map((issue) => ({ + issueId: issue.id, + bodySummary: bodySummaries.find((s) => s.issueId === issue.id) + ?.bodySummary, + commentsSummary: commentSummaries.find( + (s) => s.issueId === issue.id, + )?.commentsSummary, + overallSummary: overallSummaries.find( + (s) => s.issueId === issue.id, + )?.overallSummary, + })); + await bulkUpdateIssueSummaries(summaries, dbSession); }, ); From 19ea63c5f4135895454758aa2219024174a7d527 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 11:28:13 +0800 Subject: [PATCH 03/12] don't change how embeddings are generated first --- .../src/workflows/sync/embedding/embedding.workflow.ts | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts index 8263606d..62a46d80 100644 --- a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts +++ b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts @@ -172,18 +172,12 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< }, ); - // Create embeddings using overall summaries const embeddings = await step.do( - `create embeddings using overall summaries (batch ${idx + 1} of ${totalBatches})`, + `create embeddings for selected issues from API (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), async () => { return await createEmbeddings({ - issues: issues.map((issue) => ({ - ...issue, - body: - overallSummaries.find((s) => s.issueId === issue.id) - ?.overallSummary || issue.body, - })), + issues, openai, }); }, From e7b640d31e9be63abf47c714009469abf1f50244 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 11:28:47 +0800 Subject: [PATCH 04/12] migration --- .../migrations/0044_pale_black_panther.sql | 3 + .../core/migrations/meta/0044_snapshot.json | 1747 +++++++++++++++++ packages/core/migrations/meta/_journal.json | 7 + 3 files changed, 1757 insertions(+) create mode 100644 packages/core/migrations/0044_pale_black_panther.sql create mode 100644 packages/core/migrations/meta/0044_snapshot.json diff --git a/packages/core/migrations/0044_pale_black_panther.sql b/packages/core/migrations/0044_pale_black_panther.sql new file mode 100644 index 00000000..ccccce7f --- /dev/null +++ b/packages/core/migrations/0044_pale_black_panther.sql @@ -0,0 +1,3 @@ +ALTER TABLE "issues" ADD COLUMN "overall_summary" text;--> statement-breakpoint +ALTER TABLE "issues" ADD COLUMN "body_summary" text;--> statement-breakpoint +ALTER TABLE "issues" ADD COLUMN "comments_summary" text; \ No newline at end of file diff --git a/packages/core/migrations/meta/0044_snapshot.json b/packages/core/migrations/meta/0044_snapshot.json new file mode 100644 index 00000000..ae3ba47b --- /dev/null +++ b/packages/core/migrations/meta/0044_snapshot.json @@ -0,0 +1,1747 @@ +{ + "id": "55636e9d-d940-4ddc-8bce-cd6f70d72f7c", + "prevId": "58d49aa7-c13d-417c-a3a4-edd243f9cdc5", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.comments": { + "name": "comments", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "issue_id": { + "name": "issue_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "author": { + "name": "author", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "comment_created_at": { + "name": "comment_created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "comment_updated_at": { + "name": "comment_updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "issue_id_idx": { + "name": "issue_id_idx", + "columns": [ + { + "expression": "issue_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "comments_issue_id_issues_id_fk": { + "name": "comments_issue_id_issues_id_fk", + "tableFrom": "comments", + "tableTo": "issues", + "columnsFrom": [ + "issue_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "comments_node_id_unique": { + "name": "comments_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.installations_to_repos": { + "name": "installations_to_repos", + "schema": "", + "columns": { + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "installation_id": { + "name": "installation_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "github_repo_id": { + "name": "github_repo_id", + "type": "bigint", + "primaryKey": false, + "notNull": true + }, + "repo_node_id": { + "name": "repo_node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "repo_id": { + "name": "repo_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "metadata": { + "name": "metadata", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "added_at": { + "name": "added_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "removed_at": { + "name": "removed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "installations_to_repos_installation_idx": { + "name": "installations_to_repos_installation_idx", + "columns": [ + { + "expression": "installation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_to_repos_repo_idx": { + "name": "installations_to_repos_repo_idx", + "columns": [ + { + "expression": "repo_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_to_repos_repo_node_idx": { + "name": "installations_to_repos_repo_node_idx", + "columns": [ + { + "expression": "repo_node_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_to_repos_active_idx": { + "name": "installations_to_repos_active_idx", + "columns": [ + { + "expression": "installation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "\"installations_to_repos\".\"removed_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "installations_to_repos_installation_id_installations_id_fk": { + "name": "installations_to_repos_installation_id_installations_id_fk", + "tableFrom": "installations_to_repos", + "tableTo": "installations", + "columnsFrom": [ + "installation_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + }, + "installations_to_repos_repo_id_repos_id_fk": { + "name": "installations_to_repos_repo_id_repos_id_fk", + "tableFrom": "installations_to_repos", + "tableTo": "repos", + "columnsFrom": [ + "repo_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "installations_to_repos_installation_id_github_repo_id_pk": { + "name": "installations_to_repos_installation_id_github_repo_id_pk", + "columns": [ + "installation_id", + "github_repo_id" + ] + } + }, + "uniqueConstraints": {} + }, + "public.installations": { + "name": "installations", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "github_installation_id": { + "name": "github_installation_id", + "type": "bigint", + "primaryKey": false, + "notNull": true + }, + "target_type": { + "name": "target_type", + "type": "target_type", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "target_id": { + "name": "target_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "target_github_id": { + "name": "target_github_id", + "type": "bigint", + "primaryKey": false, + "notNull": true + }, + "target_node_id": { + "name": "target_node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "repository_selection": { + "name": "repository_selection", + "type": "repository_selection", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "installed_by_user_id": { + "name": "installed_by_user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "installed_at": { + "name": "installed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "uninstalled_at": { + "name": "uninstalled_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "suspended_at": { + "name": "suspended_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "suspended_by": { + "name": "suspended_by", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "permissions": { + "name": "permissions", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "permissions_updated_at": { + "name": "permissions_updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "installations_target_idx": { + "name": "installations_target_idx", + "columns": [ + { + "expression": "target_type", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "target_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_installed_by_user_idx": { + "name": "installations_installed_by_user_idx", + "columns": [ + { + "expression": "installed_by_user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_active_idx": { + "name": "installations_active_idx", + "columns": [ + { + "expression": "uninstalled_at", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "suspended_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "\"installations\".\"uninstalled_at\" IS NULL AND \"installations\".\"suspended_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "installations_installed_by_user_id_users_id_fk": { + "name": "installations_installed_by_user_id_users_id_fk", + "tableFrom": "installations", + "tableTo": "users", + "columnsFrom": [ + "installed_by_user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "installations_github_installation_id_unique": { + "name": "installations_github_installation_id_unique", + "nullsNotDistinct": false, + "columns": [ + "github_installation_id" + ] + } + } + }, + "public.issue_embeddings": { + "name": "issue_embeddings", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "issue_id": { + "name": "issue_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "embedding_model": { + "name": "embedding_model", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "embedding": { + "name": "embedding", + "type": "vector(256)", + "primaryKey": false, + "notNull": false + }, + "embedding_generated_at": { + "name": "embedding_generated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "issue_embedding_sync_status": { + "name": "issue_embedding_sync_status", + "type": "issue_embedding_sync_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'ready'" + } + }, + "indexes": { + "issue_embeddings_issue_id_idx": { + "name": "issue_embeddings_issue_id_idx", + "columns": [ + { + "expression": "issue_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_embeddings_sync_status_idx": { + "name": "issue_embeddings_sync_status_idx", + "columns": [ + { + "expression": "issue_embedding_sync_status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_embeddings_embedding_idx": { + "name": "issue_embeddings_embedding_idx", + "columns": [ + { + "expression": "embedding", + "isExpression": false, + "asc": true, + "nulls": "last", + "opclass": "vector_cosine_ops" + } + ], + "isUnique": false, + "concurrently": false, + "method": "hnsw", + "with": {} + }, + "issue_embeddings_status_generated_at_idx": { + "name": "issue_embeddings_status_generated_at_idx", + "columns": [ + { + "expression": "issue_embedding_sync_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "embedding_generated_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_embeddings_null_idx": { + "name": "issue_embeddings_null_idx", + "columns": [ + { + "expression": "issue_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "embedding IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "issue_embeddings_issue_id_issues_id_fk": { + "name": "issue_embeddings_issue_id_issues_id_fk", + "tableFrom": "issue_embeddings", + "tableTo": "issues", + "columnsFrom": [ + "issue_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.issues_to_labels": { + "name": "issues_to_labels", + "schema": "", + "columns": { + "issue_id": { + "name": "issue_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "label_id": { + "name": "label_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": { + "issues_to_labels_issue_id_issues_id_fk": { + "name": "issues_to_labels_issue_id_issues_id_fk", + "tableFrom": "issues_to_labels", + "tableTo": "issues", + "columnsFrom": [ + "issue_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "issues_to_labels_label_id_labels_id_fk": { + "name": "issues_to_labels_label_id_labels_id_fk", + "tableFrom": "issues_to_labels", + "tableTo": "labels", + "columnsFrom": [ + "label_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "issues_to_labels_issue_id_label_id_pk": { + "name": "issues_to_labels_issue_id_label_id_pk", + "columns": [ + "issue_id", + "label_id" + ] + } + }, + "uniqueConstraints": { + "issues_to_labels_label_id_issue_id_unique": { + "name": "issues_to_labels_label_id_issue_id_unique", + "nullsNotDistinct": false, + "columns": [ + "label_id", + "issue_id" + ] + } + } + }, + "public.issues": { + "name": "issues", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "repo_id": { + "name": "repo_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "number": { + "name": "number", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "author": { + "name": "author", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "issue_state": { + "name": "issue_state", + "type": "issue_state", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "issue_state_reason": { + "name": "issue_state_reason", + "type": "issue_state_reason", + "typeSchema": "public", + "primaryKey": false, + "notNull": false + }, + "html_url": { + "name": "html_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "overall_summary": { + "name": "overall_summary", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "body_summary": { + "name": "body_summary", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "comments_summary": { + "name": "comments_summary", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "aggregate_reactions": { + "name": "aggregate_reactions", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "top_commenters": { + "name": "top_commenters", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "issue_created_at": { + "name": "issue_created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "issue_updated_at": { + "name": "issue_updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "issue_closed_at": { + "name": "issue_closed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "repo_id_idx": { + "name": "repo_id_idx", + "columns": [ + { + "expression": "repo_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "title_substring_idx": { + "name": "title_substring_idx", + "columns": [ + { + "expression": "\"title\" gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "body_substring_idx": { + "name": "body_substring_idx", + "columns": [ + { + "expression": "\"body\" gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "author_name_idx": { + "name": "author_name_idx", + "columns": [ + { + "expression": "lower((\"author\"->>'name'::text))", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_state_open_idx": { + "name": "issue_state_open_idx", + "columns": [ + { + "expression": "issue_state", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "issue_state = 'OPEN'", + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_updated_at_idx": { + "name": "issue_updated_at_idx", + "columns": [ + { + "expression": "issue_updated_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "issues_repo_id_repos_id_fk": { + "name": "issues_repo_id_repos_id_fk", + "tableFrom": "issues", + "tableTo": "repos", + "columnsFrom": [ + "repo_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "issues_node_id_unique": { + "name": "issues_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.labels": { + "name": "labels", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "color": { + "name": "color", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "labels_node_id_unique": { + "name": "labels_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.organizations": { + "name": "organizations", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "login": { + "name": "login", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "avatar_url": { + "name": "avatar_url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "html_url": { + "name": "html_url", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "organizations_node_id_unique": { + "name": "organizations_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.public_collections_to_repos": { + "name": "public_collections_to_repos", + "schema": "", + "columns": { + "collection_id": { + "name": "collection_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "repo_id": { + "name": "repo_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": { + "public_collections_to_repos_collection_id_public_collections_id_fk": { + "name": "public_collections_to_repos_collection_id_public_collections_id_fk", + "tableFrom": "public_collections_to_repos", + "tableTo": "public_collections", + "columnsFrom": [ + "collection_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "public_collections_to_repos_repo_id_repos_id_fk": { + "name": "public_collections_to_repos_repo_id_repos_id_fk", + "tableFrom": "public_collections_to_repos", + "tableTo": "repos", + "columnsFrom": [ + "repo_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "public_collections_to_repos_collection_id_repo_id_pk": { + "name": "public_collections_to_repos_collection_id_repo_id_pk", + "columns": [ + "collection_id", + "repo_id" + ] + } + }, + "uniqueConstraints": { + "public_collections_to_repos_repo_id_collection_id_unique": { + "name": "public_collections_to_repos_repo_id_collection_id_unique", + "nullsNotDistinct": false, + "columns": [ + "repo_id", + "collection_id" + ] + } + } + }, + "public.public_collections": { + "name": "public_collections", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "public_collections_name_unique": { + "name": "public_collections_name_unique", + "nullsNotDistinct": false, + "columns": [ + "name" + ] + } + } + }, + "public.repos": { + "name": "repos", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "owner_login": { + "name": "owner_login", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "owner_avatar_url": { + "name": "owner_avatar_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "html_url": { + "name": "html_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "is_private": { + "name": "is_private", + "type": "boolean", + "primaryKey": false, + "notNull": true + }, + "sync_status": { + "name": "sync_status", + "type": "sync_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'ready'" + }, + "last_synced_at": { + "name": "last_synced_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "init_status": { + "name": "init_status", + "type": "init_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'ready'" + }, + "initialized_at": { + "name": "initialized_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "sync_cursor": { + "name": "sync_cursor", + "type": "jsonb", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "owner_name_idx": { + "name": "owner_name_idx", + "columns": [ + { + "expression": "owner_login", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "owner_idx": { + "name": "owner_idx", + "columns": [ + { + "expression": "owner_login", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "created_at_idx": { + "name": "created_at_idx", + "columns": [ + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "repo_sync_idx": { + "name": "repo_sync_idx", + "columns": [ + { + "expression": "init_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "sync_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "last_synced_at", + "isExpression": false, + "asc": true, + "nulls": "first" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "repo_init_idx": { + "name": "repo_init_idx", + "columns": [ + { + "expression": "init_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "repos_node_id_unique": { + "name": "repos_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.users_to_repos": { + "name": "users_to_repos", + "schema": "", + "columns": { + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "repo_id": { + "name": "repo_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "subscription_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'active'" + }, + "subscribed_at": { + "name": "subscribed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "unsubscribed_at": { + "name": "unsubscribed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "user_status_idx": { + "name": "user_status_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "subscribed_at", + "isExpression": false, + "asc": false, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "users_to_repos_user_id_users_id_fk": { + "name": "users_to_repos_user_id_users_id_fk", + "tableFrom": "users_to_repos", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "users_to_repos_repo_id_repos_id_fk": { + "name": "users_to_repos_repo_id_repos_id_fk", + "tableFrom": "users_to_repos", + "tableTo": "repos", + "columnsFrom": [ + "repo_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "users_to_repos_user_id_repo_id_pk": { + "name": "users_to_repos_user_id_repo_id_pk", + "columns": [ + "user_id", + "repo_id" + ] + } + }, + "uniqueConstraints": { + "users_to_repos_repo_id_user_id_unique": { + "name": "users_to_repos_repo_id_user_id_unique", + "nullsNotDistinct": false, + "columns": [ + "repo_id", + "user_id" + ] + } + } + }, + "public.users": { + "name": "users", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "login": { + "name": "login", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "avatar_url": { + "name": "avatar_url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "html_url": { + "name": "html_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "github_scopes": { + "name": "github_scopes", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "auth_revoked_at": { + "name": "auth_revoked_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "access_token": { + "name": "access_token", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "metadata": { + "name": "metadata", + "type": "jsonb", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "email_idx": { + "name": "email_idx", + "columns": [ + { + "expression": "email", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "users_node_id_unique": { + "name": "users_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + } + }, + "enums": { + "public.repository_selection": { + "name": "repository_selection", + "schema": "public", + "values": [ + "all", + "selected" + ] + }, + "public.target_type": { + "name": "target_type", + "schema": "public", + "values": [ + "user", + "organization" + ] + }, + "public.issue_embedding_sync_status": { + "name": "issue_embedding_sync_status", + "schema": "public", + "values": [ + "ready", + "in_progress", + "error" + ] + }, + "public.issue_state": { + "name": "issue_state", + "schema": "public", + "values": [ + "OPEN", + "CLOSED" + ] + }, + "public.issue_state_reason": { + "name": "issue_state_reason", + "schema": "public", + "values": [ + "COMPLETED", + "REOPENED", + "NOT_PLANNED", + "DUPLICATE" + ] + }, + "public.init_status": { + "name": "init_status", + "schema": "public", + "values": [ + "pending", + "ready", + "in_progress", + "completed", + "error", + "no_issues" + ] + }, + "public.sync_status": { + "name": "sync_status", + "schema": "public", + "values": [ + "ready", + "queued", + "in_progress", + "error" + ] + }, + "public.subscription_status": { + "name": "subscription_status", + "schema": "public", + "values": [ + "active", + "inactive" + ] + } + }, + "schemas": {}, + "sequences": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/packages/core/migrations/meta/_journal.json b/packages/core/migrations/meta/_journal.json index a7d67e14..7b3cc073 100644 --- a/packages/core/migrations/meta/_journal.json +++ b/packages/core/migrations/meta/_journal.json @@ -309,6 +309,13 @@ "when": 1739172466880, "tag": "0043_lying_jimmy_woo", "breakpoints": true + }, + { + "idx": 44, + "version": "7", + "when": 1739244517513, + "tag": "0044_pale_black_panther", + "breakpoints": true } ] } \ No newline at end of file From cb674dbb3cc993d84e421760106552adff4e81db Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:17:37 +0800 Subject: [PATCH 05/12] prompt engineeringzzz --- packages/core/src/summary.ts | 23 +++++++--------- packages/scripts/src/script.ts | 27 +++++-------------- .../sync/embedding/embedding.workflow.ts | 26 ++++++++++++------ 3 files changed, 33 insertions(+), 43 deletions(-) diff --git a/packages/core/src/summary.ts b/packages/core/src/summary.ts index 17cf076f..cb7ab20f 100644 --- a/packages/core/src/summary.ts +++ b/packages/core/src/summary.ts @@ -14,13 +14,11 @@ async function summarize( textToSummarize, systemPrompt, userInstructions, - temperature = 0.2, reasoningEffort = "high", }: { textToSummarize: string; systemPrompt: string; userInstructions: string; - temperature?: number; reasoningEffort?: "low" | "medium" | "high"; }, openai: OpenAIClient, @@ -39,7 +37,6 @@ async function summarize( const response = await openai.chat.completions.create({ model: SUMMARY_MODEL, messages, - temperature, reasoning_effort: reasoningEffort, }); const result = chatCompletionSchema.parse(response); @@ -50,18 +47,18 @@ async function summarize( const PROMPTS = { issueBody: { system: - "You are a helpful assistant that generates concise summaries of GitHub issue descriptions. Focus on the problem, proposed solutions, and key technical details.", - user: "Please summarize this GitHub issue description:", + "You are a helpful assistant that generates concise summaries of GitHub issue descriptions. Describe the issue directly and focus on the problem, proposed solutions, and key technical details.", + user: "Please summarize this GitHub issue description in no more than 3 short paragraphs. Just provide the summary directly.", }, comments: { system: - "You are a helpful assistant that generates concise summaries of GitHub issue comments. Focus on key decisions, solutions proposed, and final outcomes.", - user: "Please summarize the discussion in these GitHub issue comments:", + "You are a helpful assistant that generates concise summaries of GitHub issue comments. Summarise the comments so that a human can capture the main points of discussion without reading the entire comment thread. If you include the name of the author, make sure to stick to the original casing and don't modify it.", + user: "Please summarize the discussion in these GitHub issue comments in no more than 3 short paragraphs. Just provide the summary directly.", }, overall: { system: - "You are a helpful assistant that generates concise overall summaries of GitHub issues. Synthesize the issue description and discussion into a clear summary.", - user: "Please provide a concise overall summary of this GitHub issue based on these summaries:", + "You are a helpful assistant that generates concise overall summaries of GitHub issues so that a human can understand the issue at a glance. You will be provided with information of the issue, a summary of of the issue body and a summary of the comments, and additional context. Don't use 'this issue' or 'this discussion', just provide the summary directly.", + user: "Please provide a direct summary of this issue based on the provided information in no more than 3 short paragraphs.", }, } as const; @@ -113,7 +110,6 @@ export async function generateOverallSummary( openai: OpenAIClient, ): Promise { const { - number, title, author, issueState: state, @@ -124,13 +120,13 @@ export async function generateOverallSummary( } = params.issue; const text = dedent` - Issue #${number}: ${title} + Issue: ${title} Description Summary: ${params.bodySummary} - Discussion Summary: - ${params.commentsSummary || "No discussion"} + Comments Summary: + ${params.commentsSummary || "No comments"} Additional Context: - State: ${state}${stateReason ? `, Reason: ${stateReason}` : ""} @@ -143,7 +139,6 @@ export async function generateOverallSummary( textToSummarize: text, systemPrompt: PROMPTS.overall.system, userInstructions: PROMPTS.overall.user, - temperature: 0.3, }, openai, ); diff --git a/packages/scripts/src/script.ts b/packages/scripts/src/script.ts index 796a43c9..13bc4a10 100644 --- a/packages/scripts/src/script.ts +++ b/packages/scripts/src/script.ts @@ -1,29 +1,14 @@ -import { eq } from "drizzle-orm"; - -import { repos } from "@/core/db/schema/entities/repo.sql"; +import { generateBodySummary } from "@/core/summary"; import { getDeps } from "./deps"; -const { db, closeConnection } = await getDeps(); -const repoId = "rep_01JEK73YA0FDWVBEN21R4ATTB4"; +const { openai } = await getDeps(); try { - const [result] = await db - .select({ - initStatus: repos.initStatus, - repoName: repos.name, - repoOwner: repos.ownerLogin, - isPrivate: repos.isPrivate, - repoSyncCursor: repos.syncCursor, - }) - .from(repos) - .where(eq(repos.id, repoId)) - .limit(1); - if (!result) { - throw new Error("Repo not found"); - } + const result = await generateBodySummary( + "We need basic workspace creation for our Alpha that will allow minimum needed functionality and so our team can use it for dogfooding. This epic focuses on the essential UI, CLI and API for Workspace Creation in V2 Alpha", + openai, + ); console.log(result); } catch (e) { console.error(e); -} finally { - await closeConnection(); } diff --git a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts index 62a46d80..4e4cb952 100644 --- a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts +++ b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts @@ -97,27 +97,36 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< const [bodySummaries, commentSummaries] = await Promise.all([ step.do( `generate body summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, - getStepDuration("medium"), + getStepDuration("long"), async () => { return await Promise.all( issues.map(async (issue) => ({ issueId: issue.id, - bodySummary: await generateBodySummary(issue.body, openai), + bodySummary: + issue.body.length > 1000 + ? await generateBodySummary(issue.body, openai) + : issue.body, })), ); }, ), step.do( `generate comment summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, - getStepDuration("medium"), + getStepDuration("long"), async () => { return await Promise.all( issues.map(async (issue) => ({ issueId: issue.id, - commentsSummary: await generateCommentsSummary( - issue.comments, - openai, - ), + commentsSummary: + issue.comments.reduce((acc, c) => acc + c.body, "").length > + 1000 + ? await generateCommentsSummary(issue.comments, openai) + : issue.comments + .map( + (c) => + `${c.author?.name ?? "Deleted User"}: ${c.body}`, + ) + .join("\n"), })), ); }, @@ -126,7 +135,7 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< const overallSummaries = await step.do( `generate overall summaries (batch ${idx + 1} of ${totalBatches})`, - getStepDuration("medium"), + getStepDuration("long"), async () => { return await Promise.all( issues.map(async (issue) => { @@ -176,6 +185,7 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< `create embeddings for selected issues from API (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), async () => { + // TODO: in the future, create embeddings using overall summary instead of issues return await createEmbeddings({ issues, openai, From 0e929b955efaecf35852a9fca4459cf424499dd8 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:41:44 +0800 Subject: [PATCH 06/12] frontend --- packages/core/src/semsearch/db.ts | 1 + packages/core/src/semsearch/index.ts | 1 + packages/core/src/semsearch/schema.output.ts | 1 + .../web/src/components/search/IssueCard.tsx | 24 +++++++++++++++++++ 4 files changed, 27 insertions(+) diff --git a/packages/core/src/semsearch/db.ts b/packages/core/src/semsearch/db.ts index 40ef394c..e5918d13 100644 --- a/packages/core/src/semsearch/db.ts +++ b/packages/core/src/semsearch/db.ts @@ -46,6 +46,7 @@ export function getBaseSelect() { issueUpdatedAt: issueTable.issueUpdatedAt, aggregateReactions: issueTable.aggregateReactions, topCommenters: issueTable.topCommenters, + overallSummary: issueTable.overallSummary, repoName: repos.name, repoUrl: sql`${repos.htmlUrl}`.as("repoUrl"), repoOwnerName: repos.ownerLogin, diff --git a/packages/core/src/semsearch/index.ts b/packages/core/src/semsearch/index.ts index bee182b4..072aa6e3 100644 --- a/packages/core/src/semsearch/index.ts +++ b/packages/core/src/semsearch/index.ts @@ -278,6 +278,7 @@ async function filterBeforeVectorSearch( repoOwnerName: vectorSearchSubquery.repoOwnerName, repoLastSyncedAt: vectorSearchSubquery.repoLastSyncedAt, commentCount: vectorSearchSubquery.commentCount, + overallSummary: vectorSearchSubquery.overallSummary, rankingScore, similarityScore, // Add window function to get total count in same query diff --git a/packages/core/src/semsearch/schema.output.ts b/packages/core/src/semsearch/schema.output.ts index 61efe7fc..b43df78a 100644 --- a/packages/core/src/semsearch/schema.output.ts +++ b/packages/core/src/semsearch/schema.output.ts @@ -59,6 +59,7 @@ const searchIssueSchema = createSelectSchema(issueTable, { issueCreatedAt: true, issueClosedAt: true, issueUpdatedAt: true, + overallSummary: true, }) .extend({ labels: z.array(selectLabelForSearchSchema), diff --git a/packages/web/src/components/search/IssueCard.tsx b/packages/web/src/components/search/IssueCard.tsx index 4c137766..3f93e323 100644 --- a/packages/web/src/components/search/IssueCard.tsx +++ b/packages/web/src/components/search/IssueCard.tsx @@ -5,6 +5,7 @@ import { CircleDotIcon, CircleSlashIcon, MessageSquareIcon, + ScanEyeIcon, } from "lucide-react"; import { NORMALIZATION_ANCHOR } from "@/core/constants/ranking.constant"; @@ -12,7 +13,13 @@ import type { AggregateReactions } from "@/core/db/schema/shared"; import type { PublicSearchIssuesResponse } from "@/lib/api/search"; import { formatLocalDateTime, getTimeAgo } from "@/lib/time"; import { Badge } from "@/components/ui/badge"; +import { Button } from "@/components/ui/button"; import { FastTooltip } from "@/components/ui/fast-tooltip"; +import { + Popover, + PopoverContent, + PopoverTrigger, +} from "@/components/ui/popover"; import { Tooltip, TooltipContent, @@ -35,6 +42,23 @@ export function IssueCard({ issue }: { issue: Issue }) {
+ {issue.overallSummary && ( + + + + + +
+
Summary
+
+ {issue.overallSummary} +
+
+
+
+ )}
From d86be1055137602d329fb10124709d2425e42406 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:45:52 +0800 Subject: [PATCH 07/12] make summary even shorter --- packages/core/src/summary.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/src/summary.ts b/packages/core/src/summary.ts index cb7ab20f..416c33aa 100644 --- a/packages/core/src/summary.ts +++ b/packages/core/src/summary.ts @@ -58,7 +58,7 @@ const PROMPTS = { overall: { system: "You are a helpful assistant that generates concise overall summaries of GitHub issues so that a human can understand the issue at a glance. You will be provided with information of the issue, a summary of of the issue body and a summary of the comments, and additional context. Don't use 'this issue' or 'this discussion', just provide the summary directly.", - user: "Please provide a direct summary of this issue based on the provided information in no more than 3 short paragraphs.", + user: "Please provide a direct summary of this issue based on the provided information in a single paragraph no more than 5 sentences.", }, } as const; From ebcdf817dbf7a794014ed692e81ac6278e27bf24 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 16:07:06 +0800 Subject: [PATCH 08/12] move preview button more --- .../web/src/components/search/IssueCard.tsx | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/packages/web/src/components/search/IssueCard.tsx b/packages/web/src/components/search/IssueCard.tsx index 3f93e323..61327c52 100644 --- a/packages/web/src/components/search/IssueCard.tsx +++ b/packages/web/src/components/search/IssueCard.tsx @@ -42,23 +42,6 @@ export function IssueCard({ issue }: { issue: Issue }) {
- {issue.overallSummary && ( - - - - - -
-
Summary
-
- {issue.overallSummary} -
-
-
-
- )}
@@ -237,6 +220,27 @@ function IssueTitleWithLabels({ issue }: { issue: Issue }) { rankingScore={issue.rankingScore} similarityScore={issue.similarityScore} /> + {issue.overallSummary && ( + + + + + +
+
Summary
+
+ {issue.overallSummary} +
+
+
+
+ )} Date: Tue, 11 Feb 2025 16:40:41 +0800 Subject: [PATCH 09/12] modify embedding to use summary --- packages/core/src/embedding.ts | 17 +++++++++++++++-- packages/core/src/summary.ts | 2 +- .../sync/embedding/embedding.workflow.ts | 5 +++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/packages/core/src/embedding.ts b/packages/core/src/embedding.ts index e8f1af27..1d8ef737 100644 --- a/packages/core/src/embedding.ts +++ b/packages/core/src/embedding.ts @@ -22,6 +22,7 @@ import { import { EMBEDDING_MODEL, type OpenAIClient } from "./openai"; import { isReducePromptError } from "./openai/errors"; import { embeddingsCreateSchema } from "./openai/schema"; +import type { IssueSummary } from "./summary"; export async function createEmbedding( { @@ -42,10 +43,12 @@ export async function createEmbedding( export async function createEmbeddings({ issues, + summaries, openai, concurrencyLimit, }: { issues: Awaited>; + summaries: IssueSummary[]; openai: OpenAIClient; concurrencyLimit?: number; }) { @@ -53,6 +56,7 @@ export async function createEmbeddings({ const processIssue = async (issue: (typeof issues)[number]) => { let attempt = 0; const labels = issue.labels; + const summary = summaries.find((s) => s.issueId === issue.id); while (attempt <= TRUNCATION_MAX_ATTEMPTS) { try { const embedding = await createEmbedding( @@ -61,6 +65,7 @@ export async function createEmbeddings({ issue, labels, attempt, + summary, }), }, openai, @@ -310,6 +315,7 @@ interface FormatIssueParams { attempt: number; issue: SelectIssueForEmbedding; labels: SelectLabelForEmbedding[]; + summary?: IssueSummary; } /* Alternate way to format issue for embedding */ @@ -318,6 +324,7 @@ function formatIssueForEmbedding({ issue, attempt = 0, labels, + summary, }: FormatIssueParams): string { const { number, @@ -329,8 +336,13 @@ function formatIssueForEmbedding({ issueCreatedAt, issueClosedAt, } = issue; - // Truncate body to roughly 6000 tokens to leave room for other fields - const truncatedBody = truncateText(body, attempt); + // If attempt > 0 and we have a summary, use it instead of truncating + const truncatedBody = + attempt > 0 && summary?.bodySummary + ? summary.bodySummary + : truncateText(body, attempt); + + const commentsSummary = summary?.commentsSummary; return ( dedent` @@ -341,6 +353,7 @@ function formatIssueForEmbedding({ // the following are "metadata" fields, but including them because conceivably // users may include them in their search dedent` + ${commentsSummary ? `Comments: ${commentsSummary}` : ""} State: ${issueState} State Reason: ${issueStateReason} ${author ? `Author: ${author.name}` : ""} diff --git a/packages/core/src/summary.ts b/packages/core/src/summary.ts index 416c33aa..4e95be03 100644 --- a/packages/core/src/summary.ts +++ b/packages/core/src/summary.ts @@ -144,7 +144,7 @@ export async function generateOverallSummary( ); } -interface IssueSummary { +export interface IssueSummary { issueId: string; bodySummary?: string | null; commentsSummary?: string | null; diff --git a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts index 4e4cb952..558d3fb5 100644 --- a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts +++ b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts @@ -162,7 +162,7 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< ); // Update issues with summaries - await step.do( + const summaries = await step.do( `bulk update issues with summaries in db (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), async () => { @@ -178,6 +178,7 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< )?.overallSummary, })); await bulkUpdateIssueSummaries(summaries, dbSession); + return summaries; }, ); @@ -185,9 +186,9 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< `create embeddings for selected issues from API (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), async () => { - // TODO: in the future, create embeddings using overall summary instead of issues return await createEmbeddings({ issues, + summaries, openai, }); }, From 6bb19d4354917edc5963c85a99fdae193e447e81 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Tue, 11 Feb 2025 17:08:55 +0800 Subject: [PATCH 10/12] modify your repo page --- packages/web/src/routes/r/your/repo.tsx | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/packages/web/src/routes/r/your/repo.tsx b/packages/web/src/routes/r/your/repo.tsx index b5e4ac3d..9aaffca8 100644 --- a/packages/web/src/routes/r/your/repo.tsx +++ b/packages/web/src/routes/r/your/repo.tsx @@ -32,20 +32,25 @@ function YourRepoPage() {
  • - Help users find answers faster with semantic search + ✨ Help users find answers faster with semantic search and + quick summaries
  • - Reduce duplicate issues by making existing ones discoverable + 🎯 Reduce duplicate issues by making existing ones + discoverable
  • - Simple setup - just add a badge to your README + + 🆓 Simple setup and free to use, just add a badge to your + README! +
  • - Search across pull requests and discussions + 💬 Search across pull requests and discussions Coming soon @@ -53,7 +58,7 @@ function YourRepoPage() {
  • - Search across a collection of multiple repos, including + 🔒 Search across a collection of multiple repos, including private repos Coming soon From 01a445da8bca5144ffe174b3894b9faf189c1624 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Wed, 12 Feb 2025 08:00:18 +0800 Subject: [PATCH 11/12] simplify embedding process --- packages/core/src/embedding.ts | 115 ++++++++---------- packages/core/src/summary.ts | 14 +-- .../sync/embedding/embedding.workflow.ts | 64 ++++++---- 3 files changed, 97 insertions(+), 96 deletions(-) diff --git a/packages/core/src/embedding.ts b/packages/core/src/embedding.ts index 1d8ef737..7038d959 100644 --- a/packages/core/src/embedding.ts +++ b/packages/core/src/embedding.ts @@ -1,8 +1,6 @@ import dedent from "dedent"; import pMap from "p-map"; -import { truncateCodeBlocks, truncateToByteSize } from "@/util/truncate"; - import type { DbClient } from "./db"; import { and, asc, eq, gt, inArray, isNull, lt, ne, or, sql } from "./db"; import { comments as commentTable } from "./db/schema/entities/comment.sql"; @@ -52,42 +50,34 @@ export async function createEmbeddings({ openai: OpenAIClient; concurrencyLimit?: number; }) { - const TRUNCATION_MAX_ATTEMPTS = 8; const processIssue = async (issue: (typeof issues)[number]) => { - let attempt = 0; - const labels = issue.labels; + const { labels, number } = issue; const summary = summaries.find((s) => s.issueId === issue.id); - while (attempt <= TRUNCATION_MAX_ATTEMPTS) { - try { - const embedding = await createEmbedding( - { - input: formatIssueForEmbedding({ - issue, - labels, - attempt, - summary, - }), - }, - openai, - ); - return { - issueId: issue.id, - embedding, - }; - } catch (error) { - if (isReducePromptError(error) && attempt < TRUNCATION_MAX_ATTEMPTS) { - console.warn( - `Retrying issue #${issue.number} with truncation attempt ${attempt + 1}`, - ); - attempt++; - } else { - throw error; - } + if (!summary) { + throw new Error(`No summary found for issue #${number}`); + } + try { + const embedding = await createEmbedding( + { + input: formatIssueForEmbedding({ + issue, + labels, + summary, + }), + }, + openai, + ); + return { + issueId: issue.id, + embedding, + }; + } catch (error) { + if (isReducePromptError(error)) { + // TODO: do something? + throw error; } } - throw new Error( - `Failed to create embedding for issue #${issue.number} after ${TRUNCATION_MAX_ATTEMPTS} attempts`, - ); + throw new Error(`Failed to create embedding for issue #${number}`); }; return await pMap(issues, processIssue, { concurrency: concurrencyLimit }); } @@ -312,17 +302,15 @@ export async function unstuckIssueEmbeddings(db: DbClient) { } interface FormatIssueParams { - attempt: number; issue: SelectIssueForEmbedding; labels: SelectLabelForEmbedding[]; - summary?: IssueSummary; + summary: IssueSummary; } /* Alternate way to format issue for embedding */ /* Instead of truncating the body repeatedly, we could pass the body into a LLM and obtain a summary. Then, we pass the summary into the embedding API instead. */ function formatIssueForEmbedding({ issue, - attempt = 0, labels, summary, }: FormatIssueParams): string { @@ -330,24 +318,17 @@ function formatIssueForEmbedding({ number, author, title, - body, issueState, issueStateReason, issueCreatedAt, issueClosedAt, } = issue; - // If attempt > 0 and we have a summary, use it instead of truncating - const truncatedBody = - attempt > 0 && summary?.bodySummary - ? summary.bodySummary - : truncateText(body, attempt); - - const commentsSummary = summary?.commentsSummary; + const { commentsSummary, bodySummary } = summary; return ( dedent` Issue #${number}: ${title} - Body: ${truncatedBody} + Body: ${bodySummary} ${labels ? `Labels: ${labels.map((label) => `${label.name}${label.description ? ` (${label.description})` : ""}`).join(", ")}` : ""} ` + // the following are "metadata" fields, but including them because conceivably @@ -363,24 +344,24 @@ function formatIssueForEmbedding({ ); } -function truncateText(text: string, attempt: number): string { - // currently, it seem like issues that have huge blocks of code and logs are being tokenized very differently from this heuristic - // we first truncate per the body schema - const MAX_BODY_SIZE_KB = 8; - const CODE_BLOCK_PREVIEW_LINES = 10; - text = truncateToByteSize( - truncateCodeBlocks(text, CODE_BLOCK_PREVIEW_LINES), - MAX_BODY_SIZE_KB * 1024, - ); - // DISCUSSION: - // - could use a tokenizer to more accurately measure token length, e.g. https://github.com/dqbd/tiktoken - // - alternatively, the error returned by OpenAI also tells you how many token it is and hence how much it needs to be reduced - const TRUNCATION_FACTOR = 0.75; // after 8x retry, will be 10% of original length - const TRUNCATION_MAX_TOKENS = 6000; // somewhat arbitrary - // Rough approximation: 1 token ≈ 4 characters - const maxChars = Math.floor( - TRUNCATION_MAX_TOKENS * 4 * Math.pow(TRUNCATION_FACTOR, attempt), - ); - if (text.length <= maxChars) return text; - return text.slice(0, maxChars); -} +// function truncateText(text: string, attempt: number): string { +// // currently, it seem like issues that have huge blocks of code and logs are being tokenized very differently from this heuristic +// // we first truncate per the body schema +// const MAX_BODY_SIZE_KB = 8; +// const CODE_BLOCK_PREVIEW_LINES = 10; +// text = truncateToByteSize( +// truncateCodeBlocks(text, CODE_BLOCK_PREVIEW_LINES), +// MAX_BODY_SIZE_KB * 1024, +// ); +// // DISCUSSION: +// // - could use a tokenizer to more accurately measure token length, e.g. https://github.com/dqbd/tiktoken +// // - alternatively, the error returned by OpenAI also tells you how many token it is and hence how much it needs to be reduced +// const TRUNCATION_FACTOR = 0.75; // after 8x retry, will be 10% of original length +// const TRUNCATION_MAX_TOKENS = 6000; // somewhat arbitrary +// // Rough approximation: 1 token ≈ 4 characters +// const maxChars = Math.floor( +// TRUNCATION_MAX_TOKENS * 4 * Math.pow(TRUNCATION_FACTOR, attempt), +// ); +// if (text.length <= maxChars) return text; +// return text.slice(0, maxChars); +// } diff --git a/packages/core/src/summary.ts b/packages/core/src/summary.ts index 4e95be03..da01cadc 100644 --- a/packages/core/src/summary.ts +++ b/packages/core/src/summary.ts @@ -104,7 +104,7 @@ export async function generateCommentsSummary( export async function generateOverallSummary( params: { bodySummary: string; - commentsSummary?: string; + commentsSummary: string | null; issue: SelectIssueForEmbedding; }, openai: OpenAIClient, @@ -125,9 +125,7 @@ export async function generateOverallSummary( Description Summary: ${params.bodySummary} - Comments Summary: - ${params.commentsSummary || "No comments"} - + ${params.commentsSummary ? `Comments Summary: ${params.commentsSummary}\n` : ""} Additional Context: - State: ${state}${stateReason ? `, Reason: ${stateReason}` : ""} - Author: ${author?.name || "Anonymous"} @@ -146,9 +144,9 @@ export async function generateOverallSummary( export interface IssueSummary { issueId: string; - bodySummary?: string | null; - commentsSummary?: string | null; - overallSummary?: string | null; + bodySummary: string; + commentsSummary: string | null; + overallSummary: string; } export async function bulkUpdateIssueSummaries( @@ -170,7 +168,7 @@ export async function bulkUpdateIssueSummaries( sql`when id = ${summary.issueId} then ${summary.bodySummary}`, ); sqlChunks.commentsSummary.push( - sql`when id = ${summary.issueId} then ${summary.commentsSummary}`, + sql`when id = ${summary.issueId} then ${summary.commentsSummary === null ? sql`null` : summary.commentsSummary}`, ); sqlChunks.overallSummary.push( sql`when id = ${summary.issueId} then ${summary.overallSummary}`, diff --git a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts index 558d3fb5..7a725442 100644 --- a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts +++ b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts @@ -118,15 +118,17 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< issues.map(async (issue) => ({ issueId: issue.id, commentsSummary: - issue.comments.reduce((acc, c) => acc + c.body, "").length > - 1000 - ? await generateCommentsSummary(issue.comments, openai) - : issue.comments - .map( - (c) => - `${c.author?.name ?? "Deleted User"}: ${c.body}`, - ) - .join("\n"), + issue.comments.length === 0 + ? null + : issue.comments.reduce((acc, c) => acc + c.body, "") + .length > 1000 + ? await generateCommentsSummary(issue.comments, openai) + : issue.comments + .map( + (c) => + `${c.author?.name ?? "Deleted User"}: ${c.body}`, + ) + .join("\n"), })), ); }, @@ -139,12 +141,19 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< async () => { return await Promise.all( issues.map(async (issue) => { - const bodySummary = - bodySummaries.find((s) => s.issueId === issue.id) - ?.bodySummary || ""; + const bodySummary = bodySummaries.find( + (s) => s.issueId === issue.id, + )?.bodySummary; const commentsSummary = commentSummaries.find( (s) => s.issueId === issue.id, )?.commentsSummary; + // this should never happen + if ( + bodySummary === undefined || + commentsSummary === undefined + ) { + throw new Error(`No summary found for issue #${issue.id}`); + } return { issueId: issue.id, overallSummary: await generateOverallSummary( @@ -166,17 +175,30 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< `bulk update issues with summaries in db (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), async () => { - const summaries = issues.map((issue) => ({ - issueId: issue.id, - bodySummary: bodySummaries.find((s) => s.issueId === issue.id) - ?.bodySummary, - commentsSummary: commentSummaries.find( + const summaries = issues.map((issue) => { + const bodySummary = bodySummaries.find( + (s) => s.issueId === issue.id, + )?.bodySummary; + const commentsSummary = commentSummaries.find( (s) => s.issueId === issue.id, - )?.commentsSummary, - overallSummary: overallSummaries.find( + )?.commentsSummary; + const overallSummary = overallSummaries.find( (s) => s.issueId === issue.id, - )?.overallSummary, - })); + )?.overallSummary; + if ( + bodySummary === undefined || + commentsSummary === undefined || + overallSummary === undefined + ) { + throw new Error(`No summary found for issue #${issue.id}`); + } + return { + issueId: issue.id, + bodySummary, + commentsSummary, + overallSummary, + }; + }); await bulkUpdateIssueSummaries(summaries, dbSession); return summaries; }, From 1d4fa7822e8b9e3c0d6992ad3bf23970ab307db6 Mon Sep 17 00:00:00 2001 From: zx <67887489+tan-zx@users.noreply.github.com> Date: Wed, 12 Feb 2025 08:29:00 +0800 Subject: [PATCH 12/12] use aggregate reaction in summary --- .../src/db/schema/entities/issue.schema.ts | 22 ++++++++----- packages/core/src/embedding.ts | 9 ++++-- packages/core/src/summary.ts | 32 +++++++++++++++++-- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/packages/core/src/db/schema/entities/issue.schema.ts b/packages/core/src/db/schema/entities/issue.schema.ts index 94a8dae1..3d0001cc 100644 --- a/packages/core/src/db/schema/entities/issue.schema.ts +++ b/packages/core/src/db/schema/entities/issue.schema.ts @@ -22,6 +22,7 @@ export type CreateIssue = z.infer; const selectIssueSchema = createSelectSchema(issueTable).extend({ author: authorSchema, + aggregateReactions: aggregateReactionsSchema.nullable(), }); export type SelectIssue = z.infer; @@ -37,16 +38,21 @@ const _selectIssueForEmbeddingSchema = selectIssueSchema issueStateReason: true, issueCreatedAt: true, issueClosedAt: true, + aggregateReactions: true, }) .extend({ - labels: z - .array( - z.object({ - name: z.string(), - description: z.string().nullable(), - }), - ) - .optional(), + labels: z.array( + z.object({ + name: z.string(), + description: z.string().nullable(), + }), + ), + comments: z.array( + z.object({ + body: z.string(), + author: authorSchema, + }), + ), }); export type SelectIssueForEmbedding = z.infer< diff --git a/packages/core/src/embedding.ts b/packages/core/src/embedding.ts index 7038d959..861015ce 100644 --- a/packages/core/src/embedding.ts +++ b/packages/core/src/embedding.ts @@ -45,7 +45,7 @@ export async function createEmbeddings({ openai, concurrencyLimit, }: { - issues: Awaited>; + issues: SelectIssueForEmbedding[]; summaries: IssueSummary[]; openai: OpenAIClient; concurrencyLimit?: number; @@ -85,7 +85,7 @@ export async function createEmbeddings({ export async function selectIssuesForEmbeddingInit( issueIds: string[], db: DbClient, -) { +): Promise { return await db .select({ id: issueTable.id, @@ -97,6 +97,7 @@ export async function selectIssuesForEmbeddingInit( issueStateReason: issueTable.issueStateReason, issueCreatedAt: issueTable.issueCreatedAt, issueClosedAt: issueTable.issueClosedAt, + aggregateReactions: issueTable.aggregateReactions, labels: jsonAggBuildObjectManyToMany( { name: labelTable.name, @@ -143,7 +144,7 @@ export async function selectIssuesForEmbeddingCron({ db: DbClient; numIssues: number; intervalInHours: number; -}) { +}): Promise { return await db.transaction(async (tx) => { const lockedIssues = tx.$with("locked_issues").as( tx @@ -158,6 +159,7 @@ export async function selectIssuesForEmbeddingCron({ issueCreatedAt: issueTable.issueCreatedAt, issueClosedAt: issueTable.issueClosedAt, issueUpdatedAt: issueTable.issueUpdatedAt, // needed for the WHERE clause later + aggregateReactions: issueTable.aggregateReactions, }) .from(issueTable) .innerJoin(repos, eq(repos.id, issueTable.repoId)) @@ -187,6 +189,7 @@ export async function selectIssuesForEmbeddingCron({ issueStateReason: lockedIssues.issueStateReason, issueCreatedAt: lockedIssues.issueCreatedAt, issueClosedAt: lockedIssues.issueClosedAt, + aggregateReactions: lockedIssues.aggregateReactions, labels: jsonAggBuildObjectManyToMany( { name: labelTable.name, diff --git a/packages/core/src/summary.ts b/packages/core/src/summary.ts index da01cadc..fbf4b092 100644 --- a/packages/core/src/summary.ts +++ b/packages/core/src/summary.ts @@ -117,8 +117,35 @@ export async function generateOverallSummary( issueCreatedAt: createdAt, issueClosedAt: closedAt, labels, + aggregateReactions, } = params.issue; + // Transform aggregate reactions into a human-readable string + // Format: "thumbs up (5), heart (3)" for reactions with count > 0 + const reactionsSummary = aggregateReactions + ? Object.entries(aggregateReactions) + // Only include reactions that have been used + .filter(([, count]) => count > 0) + // Format each reaction as "reaction_name (count)" + .map( + ([reaction, count]) => + `${reaction.toLowerCase().replace("_", " ")} (${count})`, + ) + // Join all reactions with commas + .join(", ") + : ""; + + // Transform labels into a human-readable string + // Format: "bug (needs triage), feature (high priority)" + const labelsSummary = labels?.length + ? labels + .map( + (label) => + `${label.name}${label.description ? ` (${label.description})` : ""}`, + ) + .join(", ") + : ""; + const text = dedent` Issue: ${title} @@ -128,9 +155,10 @@ export async function generateOverallSummary( ${params.commentsSummary ? `Comments Summary: ${params.commentsSummary}\n` : ""} Additional Context: - State: ${state}${stateReason ? `, Reason: ${stateReason}` : ""} - - Author: ${author?.name || "Anonymous"} + - Author: ${author?.name || "Deleted User"} - Created: ${createdAt.toISOString()}${closedAt ? `\n- Closed: ${closedAt.toISOString()}` : ""} - ${labels?.length ? `- Labels: ${labels.map((label) => `${label.name}${label.description ? ` (${label.description})` : ""}`).join(", ")}` : ""}`; + ${labelsSummary ? `- Labels: ${labelsSummary}` : ""} + ${reactionsSummary ? `- Reactions: ${reactionsSummary}` : ""}`; return await summarize( {