diff --git a/bun.lock b/bun.lock index 330a5a5a..1d375b0c 100644 --- a/bun.lock +++ b/bun.lock @@ -32,7 +32,7 @@ "gql.tada": "^1.8.10", "graphql": "^16.9.0", "octokit": "^4.0.2", - "openai": "^4.69.0", + "openai": "^4.83.0", "postgres": "^3.4.4", "resend": "^4.0.1", "tldts": "^6.1.68", @@ -1352,7 +1352,7 @@ "oidc-token-hash": ["oidc-token-hash@5.0.3", "", {}, "sha512-IF4PcGgzAr6XXSff26Sk/+P4KZFJVuHAJZj3wgO3vX2bMdNVp/QXTP3P7CEm9V1IdG8lDLY3HhiqpsE/nOwpPw=="], - "openai": ["openai@4.69.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" }, "peerDependencies": { "zod": "^3.23.8" }, "optionalPeers": ["zod"], "bin": { "openai": "bin/cli" } }, "sha512-S3hOHSkk609KqwgH+7dwFrSvO3Gm3Nk0YWGyPHNscoMH/Y2tH1qunMi7gtZnLbUv4/N1elqCp6bDior2401kCQ=="], + "openai": ["openai@4.83.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" }, "peerDependencies": { "ws": "^8.18.0", "zod": "^3.23.8" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-fmTsqud0uTtRKsPC7L8Lu55dkaTwYucqncDHzVvO64DKOpNTuiYwjbR/nVgpapXuYy8xSnhQQPUm+3jQaxICgw=="], "openid-client": ["openid-client@5.6.4", "", { "dependencies": { "jose": "^4.15.4", "lru-cache": "^6.0.0", "object-hash": "^2.2.0", "oidc-token-hash": "^5.0.3" } }, "sha512-T1h3B10BRPKfcObdBklX639tVz+xh34O7GjofqrqiAQdm7eHsQ00ih18x6wuJ/E6FxdtS2u3FmUGPDeEcMwzNA=="], diff --git a/packages/core/migrations/0044_pale_black_panther.sql b/packages/core/migrations/0044_pale_black_panther.sql new file mode 100644 index 00000000..ccccce7f --- /dev/null +++ b/packages/core/migrations/0044_pale_black_panther.sql @@ -0,0 +1,3 @@ +ALTER TABLE "issues" ADD COLUMN "overall_summary" text;--> statement-breakpoint +ALTER TABLE "issues" ADD COLUMN "body_summary" text;--> statement-breakpoint +ALTER TABLE "issues" ADD COLUMN "comments_summary" text; \ No newline at end of file diff --git a/packages/core/migrations/meta/0044_snapshot.json b/packages/core/migrations/meta/0044_snapshot.json new file mode 100644 index 00000000..ae3ba47b --- /dev/null +++ b/packages/core/migrations/meta/0044_snapshot.json @@ -0,0 +1,1747 @@ +{ + "id": "55636e9d-d940-4ddc-8bce-cd6f70d72f7c", + "prevId": "58d49aa7-c13d-417c-a3a4-edd243f9cdc5", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.comments": { + "name": "comments", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "issue_id": { + "name": "issue_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "author": { + "name": "author", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "comment_created_at": { + "name": "comment_created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "comment_updated_at": { + "name": "comment_updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "issue_id_idx": { + "name": "issue_id_idx", + "columns": [ + { + "expression": "issue_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "comments_issue_id_issues_id_fk": { + "name": "comments_issue_id_issues_id_fk", + "tableFrom": "comments", + "tableTo": "issues", + "columnsFrom": [ + "issue_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "comments_node_id_unique": { + "name": "comments_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.installations_to_repos": { + "name": "installations_to_repos", + "schema": "", + "columns": { + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "installation_id": { + "name": "installation_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "github_repo_id": { + "name": "github_repo_id", + "type": "bigint", + "primaryKey": false, + "notNull": true + }, + "repo_node_id": { + "name": "repo_node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "repo_id": { + "name": "repo_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "metadata": { + "name": "metadata", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "added_at": { + "name": "added_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "removed_at": { + "name": "removed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "installations_to_repos_installation_idx": { + "name": "installations_to_repos_installation_idx", + "columns": [ + { + "expression": "installation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_to_repos_repo_idx": { + "name": "installations_to_repos_repo_idx", + "columns": [ + { + "expression": "repo_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_to_repos_repo_node_idx": { + "name": "installations_to_repos_repo_node_idx", + "columns": [ + { + "expression": "repo_node_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_to_repos_active_idx": { + "name": "installations_to_repos_active_idx", + "columns": [ + { + "expression": "installation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "\"installations_to_repos\".\"removed_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "installations_to_repos_installation_id_installations_id_fk": { + "name": "installations_to_repos_installation_id_installations_id_fk", + "tableFrom": "installations_to_repos", + "tableTo": "installations", + "columnsFrom": [ + "installation_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + }, + "installations_to_repos_repo_id_repos_id_fk": { + "name": "installations_to_repos_repo_id_repos_id_fk", + "tableFrom": "installations_to_repos", + "tableTo": "repos", + "columnsFrom": [ + "repo_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "installations_to_repos_installation_id_github_repo_id_pk": { + "name": "installations_to_repos_installation_id_github_repo_id_pk", + "columns": [ + "installation_id", + "github_repo_id" + ] + } + }, + "uniqueConstraints": {} + }, + "public.installations": { + "name": "installations", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "github_installation_id": { + "name": "github_installation_id", + "type": "bigint", + "primaryKey": false, + "notNull": true + }, + "target_type": { + "name": "target_type", + "type": "target_type", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "target_id": { + "name": "target_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "target_github_id": { + "name": "target_github_id", + "type": "bigint", + "primaryKey": false, + "notNull": true + }, + "target_node_id": { + "name": "target_node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "repository_selection": { + "name": "repository_selection", + "type": "repository_selection", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "installed_by_user_id": { + "name": "installed_by_user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "installed_at": { + "name": "installed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "uninstalled_at": { + "name": "uninstalled_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "suspended_at": { + "name": "suspended_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "suspended_by": { + "name": "suspended_by", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "permissions": { + "name": "permissions", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "permissions_updated_at": { + "name": "permissions_updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "installations_target_idx": { + "name": "installations_target_idx", + "columns": [ + { + "expression": "target_type", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "target_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_installed_by_user_idx": { + "name": "installations_installed_by_user_idx", + "columns": [ + { + "expression": "installed_by_user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "installations_active_idx": { + "name": "installations_active_idx", + "columns": [ + { + "expression": "uninstalled_at", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "suspended_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "\"installations\".\"uninstalled_at\" IS NULL AND \"installations\".\"suspended_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "installations_installed_by_user_id_users_id_fk": { + "name": "installations_installed_by_user_id_users_id_fk", + "tableFrom": "installations", + "tableTo": "users", + "columnsFrom": [ + "installed_by_user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "installations_github_installation_id_unique": { + "name": "installations_github_installation_id_unique", + "nullsNotDistinct": false, + "columns": [ + "github_installation_id" + ] + } + } + }, + "public.issue_embeddings": { + "name": "issue_embeddings", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "issue_id": { + "name": "issue_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "embedding_model": { + "name": "embedding_model", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "embedding": { + "name": "embedding", + "type": "vector(256)", + "primaryKey": false, + "notNull": false + }, + "embedding_generated_at": { + "name": "embedding_generated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "issue_embedding_sync_status": { + "name": "issue_embedding_sync_status", + "type": "issue_embedding_sync_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'ready'" + } + }, + "indexes": { + "issue_embeddings_issue_id_idx": { + "name": "issue_embeddings_issue_id_idx", + "columns": [ + { + "expression": "issue_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_embeddings_sync_status_idx": { + "name": "issue_embeddings_sync_status_idx", + "columns": [ + { + "expression": "issue_embedding_sync_status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_embeddings_embedding_idx": { + "name": "issue_embeddings_embedding_idx", + "columns": [ + { + "expression": "embedding", + "isExpression": false, + "asc": true, + "nulls": "last", + "opclass": "vector_cosine_ops" + } + ], + "isUnique": false, + "concurrently": false, + "method": "hnsw", + "with": {} + }, + "issue_embeddings_status_generated_at_idx": { + "name": "issue_embeddings_status_generated_at_idx", + "columns": [ + { + "expression": "issue_embedding_sync_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "embedding_generated_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_embeddings_null_idx": { + "name": "issue_embeddings_null_idx", + "columns": [ + { + "expression": "issue_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "embedding IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "issue_embeddings_issue_id_issues_id_fk": { + "name": "issue_embeddings_issue_id_issues_id_fk", + "tableFrom": "issue_embeddings", + "tableTo": "issues", + "columnsFrom": [ + "issue_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.issues_to_labels": { + "name": "issues_to_labels", + "schema": "", + "columns": { + "issue_id": { + "name": "issue_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "label_id": { + "name": "label_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": { + "issues_to_labels_issue_id_issues_id_fk": { + "name": "issues_to_labels_issue_id_issues_id_fk", + "tableFrom": "issues_to_labels", + "tableTo": "issues", + "columnsFrom": [ + "issue_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "issues_to_labels_label_id_labels_id_fk": { + "name": "issues_to_labels_label_id_labels_id_fk", + "tableFrom": "issues_to_labels", + "tableTo": "labels", + "columnsFrom": [ + "label_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "issues_to_labels_issue_id_label_id_pk": { + "name": "issues_to_labels_issue_id_label_id_pk", + "columns": [ + "issue_id", + "label_id" + ] + } + }, + "uniqueConstraints": { + "issues_to_labels_label_id_issue_id_unique": { + "name": "issues_to_labels_label_id_issue_id_unique", + "nullsNotDistinct": false, + "columns": [ + "label_id", + "issue_id" + ] + } + } + }, + "public.issues": { + "name": "issues", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "repo_id": { + "name": "repo_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "number": { + "name": "number", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "author": { + "name": "author", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "issue_state": { + "name": "issue_state", + "type": "issue_state", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "issue_state_reason": { + "name": "issue_state_reason", + "type": "issue_state_reason", + "typeSchema": "public", + "primaryKey": false, + "notNull": false + }, + "html_url": { + "name": "html_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "overall_summary": { + "name": "overall_summary", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "body_summary": { + "name": "body_summary", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "comments_summary": { + "name": "comments_summary", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "aggregate_reactions": { + "name": "aggregate_reactions", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "top_commenters": { + "name": "top_commenters", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "issue_created_at": { + "name": "issue_created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "issue_updated_at": { + "name": "issue_updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true + }, + "issue_closed_at": { + "name": "issue_closed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "repo_id_idx": { + "name": "repo_id_idx", + "columns": [ + { + "expression": "repo_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "title_substring_idx": { + "name": "title_substring_idx", + "columns": [ + { + "expression": "\"title\" gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "body_substring_idx": { + "name": "body_substring_idx", + "columns": [ + { + "expression": "\"body\" gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "author_name_idx": { + "name": "author_name_idx", + "columns": [ + { + "expression": "lower((\"author\"->>'name'::text))", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_state_open_idx": { + "name": "issue_state_open_idx", + "columns": [ + { + "expression": "issue_state", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "issue_state = 'OPEN'", + "concurrently": false, + "method": "btree", + "with": {} + }, + "issue_updated_at_idx": { + "name": "issue_updated_at_idx", + "columns": [ + { + "expression": "issue_updated_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "issues_repo_id_repos_id_fk": { + "name": "issues_repo_id_repos_id_fk", + "tableFrom": "issues", + "tableTo": "repos", + "columnsFrom": [ + "repo_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "issues_node_id_unique": { + "name": "issues_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.labels": { + "name": "labels", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "color": { + "name": "color", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "labels_node_id_unique": { + "name": "labels_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.organizations": { + "name": "organizations", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "login": { + "name": "login", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "avatar_url": { + "name": "avatar_url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "html_url": { + "name": "html_url", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "organizations_node_id_unique": { + "name": "organizations_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.public_collections_to_repos": { + "name": "public_collections_to_repos", + "schema": "", + "columns": { + "collection_id": { + "name": "collection_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "repo_id": { + "name": "repo_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": { + "public_collections_to_repos_collection_id_public_collections_id_fk": { + "name": "public_collections_to_repos_collection_id_public_collections_id_fk", + "tableFrom": "public_collections_to_repos", + "tableTo": "public_collections", + "columnsFrom": [ + "collection_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "public_collections_to_repos_repo_id_repos_id_fk": { + "name": "public_collections_to_repos_repo_id_repos_id_fk", + "tableFrom": "public_collections_to_repos", + "tableTo": "repos", + "columnsFrom": [ + "repo_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "public_collections_to_repos_collection_id_repo_id_pk": { + "name": "public_collections_to_repos_collection_id_repo_id_pk", + "columns": [ + "collection_id", + "repo_id" + ] + } + }, + "uniqueConstraints": { + "public_collections_to_repos_repo_id_collection_id_unique": { + "name": "public_collections_to_repos_repo_id_collection_id_unique", + "nullsNotDistinct": false, + "columns": [ + "repo_id", + "collection_id" + ] + } + } + }, + "public.public_collections": { + "name": "public_collections", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "public_collections_name_unique": { + "name": "public_collections_name_unique", + "nullsNotDistinct": false, + "columns": [ + "name" + ] + } + } + }, + "public.repos": { + "name": "repos", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "owner_login": { + "name": "owner_login", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "owner_avatar_url": { + "name": "owner_avatar_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "html_url": { + "name": "html_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "is_private": { + "name": "is_private", + "type": "boolean", + "primaryKey": false, + "notNull": true + }, + "sync_status": { + "name": "sync_status", + "type": "sync_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'ready'" + }, + "last_synced_at": { + "name": "last_synced_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "init_status": { + "name": "init_status", + "type": "init_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'ready'" + }, + "initialized_at": { + "name": "initialized_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "sync_cursor": { + "name": "sync_cursor", + "type": "jsonb", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "owner_name_idx": { + "name": "owner_name_idx", + "columns": [ + { + "expression": "owner_login", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "owner_idx": { + "name": "owner_idx", + "columns": [ + { + "expression": "owner_login", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "created_at_idx": { + "name": "created_at_idx", + "columns": [ + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "repo_sync_idx": { + "name": "repo_sync_idx", + "columns": [ + { + "expression": "init_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "sync_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "last_synced_at", + "isExpression": false, + "asc": true, + "nulls": "first" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "repo_init_idx": { + "name": "repo_init_idx", + "columns": [ + { + "expression": "init_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "repos_node_id_unique": { + "name": "repos_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + }, + "public.users_to_repos": { + "name": "users_to_repos", + "schema": "", + "columns": { + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "repo_id": { + "name": "repo_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "subscription_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'active'" + }, + "subscribed_at": { + "name": "subscribed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "unsubscribed_at": { + "name": "unsubscribed_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "user_status_idx": { + "name": "user_status_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "subscribed_at", + "isExpression": false, + "asc": false, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "users_to_repos_user_id_users_id_fk": { + "name": "users_to_repos_user_id_users_id_fk", + "tableFrom": "users_to_repos", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "users_to_repos_repo_id_repos_id_fk": { + "name": "users_to_repos_repo_id_repos_id_fk", + "tableFrom": "users_to_repos", + "tableTo": "repos", + "columnsFrom": [ + "repo_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "users_to_repos_user_id_repo_id_pk": { + "name": "users_to_repos_user_id_repo_id_pk", + "columns": [ + "user_id", + "repo_id" + ] + } + }, + "uniqueConstraints": { + "users_to_repos_repo_id_user_id_unique": { + "name": "users_to_repos_repo_id_user_id_unique", + "nullsNotDistinct": false, + "columns": [ + "repo_id", + "user_id" + ] + } + } + }, + "public.users": { + "name": "users", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "node_id": { + "name": "node_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "login": { + "name": "login", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "avatar_url": { + "name": "avatar_url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "html_url": { + "name": "html_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "github_scopes": { + "name": "github_scopes", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "auth_revoked_at": { + "name": "auth_revoked_at", + "type": "timestamp (6) with time zone", + "primaryKey": false, + "notNull": false + }, + "access_token": { + "name": "access_token", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "metadata": { + "name": "metadata", + "type": "jsonb", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "email_idx": { + "name": "email_idx", + "columns": [ + { + "expression": "email", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "users_node_id_unique": { + "name": "users_node_id_unique", + "nullsNotDistinct": false, + "columns": [ + "node_id" + ] + } + } + } + }, + "enums": { + "public.repository_selection": { + "name": "repository_selection", + "schema": "public", + "values": [ + "all", + "selected" + ] + }, + "public.target_type": { + "name": "target_type", + "schema": "public", + "values": [ + "user", + "organization" + ] + }, + "public.issue_embedding_sync_status": { + "name": "issue_embedding_sync_status", + "schema": "public", + "values": [ + "ready", + "in_progress", + "error" + ] + }, + "public.issue_state": { + "name": "issue_state", + "schema": "public", + "values": [ + "OPEN", + "CLOSED" + ] + }, + "public.issue_state_reason": { + "name": "issue_state_reason", + "schema": "public", + "values": [ + "COMPLETED", + "REOPENED", + "NOT_PLANNED", + "DUPLICATE" + ] + }, + "public.init_status": { + "name": "init_status", + "schema": "public", + "values": [ + "pending", + "ready", + "in_progress", + "completed", + "error", + "no_issues" + ] + }, + "public.sync_status": { + "name": "sync_status", + "schema": "public", + "values": [ + "ready", + "queued", + "in_progress", + "error" + ] + }, + "public.subscription_status": { + "name": "subscription_status", + "schema": "public", + "values": [ + "active", + "inactive" + ] + } + }, + "schemas": {}, + "sequences": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/packages/core/migrations/meta/_journal.json b/packages/core/migrations/meta/_journal.json index a7d67e14..7b3cc073 100644 --- a/packages/core/migrations/meta/_journal.json +++ b/packages/core/migrations/meta/_journal.json @@ -309,6 +309,13 @@ "when": 1739172466880, "tag": "0043_lying_jimmy_woo", "breakpoints": true + }, + { + "idx": 44, + "version": "7", + "when": 1739244517513, + "tag": "0044_pale_black_panther", + "breakpoints": true } ] } \ No newline at end of file diff --git a/packages/core/package.json b/packages/core/package.json index 915c90af..d5af2f27 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -28,7 +28,7 @@ "gql.tada": "^1.8.10", "graphql": "^16.9.0", "octokit": "^4.0.2", - "openai": "^4.69.0", + "openai": "^4.83.0", "postgres": "^3.4.4", "resend": "^4.0.1", "tldts": "^6.1.68", diff --git a/packages/core/src/db/schema/entities/issue.schema.ts b/packages/core/src/db/schema/entities/issue.schema.ts index 756a4fcd..3d0001cc 100644 --- a/packages/core/src/db/schema/entities/issue.schema.ts +++ b/packages/core/src/db/schema/entities/issue.schema.ts @@ -1,7 +1,7 @@ // putting these in a separate file so that migrations can be generated as is // currently a bug in drizzle-zod vs drizzle-kit interaction import { createInsertSchema, createSelectSchema } from "drizzle-zod"; -import type { z } from "zod"; +import { z } from "zod"; import { aggregateReactionsSchema, @@ -22,21 +22,38 @@ export type CreateIssue = z.infer; const selectIssueSchema = createSelectSchema(issueTable).extend({ author: authorSchema, + aggregateReactions: aggregateReactionsSchema.nullable(), }); export type SelectIssue = z.infer; -const _selectIssueForEmbeddingSchema = selectIssueSchema.pick({ - id: true, - number: true, - author: true, - title: true, - body: true, - issueState: true, - issueStateReason: true, - issueCreatedAt: true, - issueClosedAt: true, -}); +const _selectIssueForEmbeddingSchema = selectIssueSchema + .pick({ + id: true, + number: true, + author: true, + title: true, + body: true, + issueState: true, + issueStateReason: true, + issueCreatedAt: true, + issueClosedAt: true, + aggregateReactions: true, + }) + .extend({ + labels: z.array( + z.object({ + name: z.string(), + description: z.string().nullable(), + }), + ), + comments: z.array( + z.object({ + body: z.string(), + author: authorSchema, + }), + ), + }); export type SelectIssueForEmbedding = z.infer< typeof _selectIssueForEmbeddingSchema diff --git a/packages/core/src/db/schema/entities/issue.sql.ts b/packages/core/src/db/schema/entities/issue.sql.ts index 33a4fb80..3b474f62 100644 --- a/packages/core/src/db/schema/entities/issue.sql.ts +++ b/packages/core/src/db/schema/entities/issue.sql.ts @@ -38,6 +38,9 @@ export const issueTable = pgTable( htmlUrl: text("html_url").notNull(), title: text("title").notNull(), body: text("body").notNull(), + overallSummary: text("overall_summary"), + bodySummary: text("body_summary"), + commentsSummary: text("comments_summary"), aggregateReactions: jsonb( "aggregate_reactions", ).$type(), diff --git a/packages/core/src/db/utils/json.ts b/packages/core/src/db/utils/json.ts index 9be03b39..aafb0c02 100644 --- a/packages/core/src/db/utils/json.ts +++ b/packages/core/src/db/utils/json.ts @@ -63,8 +63,7 @@ export function jsonArrayContains< )`; } -// improvised somewhat, probably not the best way to do this -export function jsonAggBuildObjectFromJoin< +export function jsonAggBuildObjectManyToMany< T extends SelectedFields, Column extends AnyColumn, >( @@ -100,3 +99,35 @@ export function jsonAggBuildObjectFromJoin< '[]'::json )`; } + +export function jsonAggBuildObjectOneToMany< + T extends SelectedFields, + Column extends AnyColumn, +>( + shape: T, + { + from, + foreignKeyEquals, + orderBy, + }: { + from: Table; + foreignKeyEquals: SQL; + orderBy?: { colName: Column; direction: "ASC" | "DESC" }; + }, +) { + return sql[]>` + COALESCE( + ( + SELECT json_agg(${jsonBuildObject(shape)} + ${ + orderBy + ? sql`ORDER BY ${orderBy.colName} ${sql.raw(orderBy.direction)}` + : undefined + } + ) + FROM ${sql`${from}`} + WHERE ${foreignKeyEquals} + ), + '[]'::json + )`; +} diff --git a/packages/core/src/embedding.ts b/packages/core/src/embedding.ts index b6c6b2a4..861015ce 100644 --- a/packages/core/src/embedding.ts +++ b/packages/core/src/embedding.ts @@ -1,10 +1,9 @@ import dedent from "dedent"; import pMap from "p-map"; -import { truncateCodeBlocks, truncateToByteSize } from "@/util/truncate"; - import type { DbClient } from "./db"; import { and, asc, eq, gt, inArray, isNull, lt, ne, or, sql } from "./db"; +import { comments as commentTable } from "./db/schema/entities/comment.sql"; import { issueEmbeddings } from "./db/schema/entities/issue-embedding.sql"; import { issuesToLabels } from "./db/schema/entities/issue-to-label.sql"; import type { SelectIssueForEmbedding } from "./db/schema/entities/issue.schema"; @@ -14,10 +13,14 @@ import { labels as labelTable } from "./db/schema/entities/label.sql"; import { repos } from "./db/schema/entities/repo.sql"; import { conflictUpdateOnly } from "./db/utils/conflict"; import { convertToSqlRaw } from "./db/utils/general"; -import { jsonAggBuildObjectFromJoin } from "./db/utils/json"; +import { + jsonAggBuildObjectManyToMany, + jsonAggBuildObjectOneToMany, +} from "./db/utils/json"; import { EMBEDDING_MODEL, type OpenAIClient } from "./openai"; import { isReducePromptError } from "./openai/errors"; import { embeddingsCreateSchema } from "./openai/schema"; +import type { IssueSummary } from "./summary"; export async function createEmbedding( { @@ -38,47 +41,43 @@ export async function createEmbedding( export async function createEmbeddings({ issues, + summaries, openai, concurrencyLimit, }: { - issues: Awaited>; + issues: SelectIssueForEmbedding[]; + summaries: IssueSummary[]; openai: OpenAIClient; concurrencyLimit?: number; }) { - const TRUNCATION_MAX_ATTEMPTS = 8; const processIssue = async (issue: (typeof issues)[number]) => { - let attempt = 0; - const labels = issue.labels; - while (attempt <= TRUNCATION_MAX_ATTEMPTS) { - try { - const embedding = await createEmbedding( - { - input: formatIssueForEmbedding({ - issue, - labels, - attempt, - }), - }, - openai, - ); - return { - issueId: issue.id, - embedding, - }; - } catch (error) { - if (isReducePromptError(error) && attempt < TRUNCATION_MAX_ATTEMPTS) { - console.warn( - `Retrying issue #${issue.number} with truncation attempt ${attempt + 1}`, - ); - attempt++; - } else { - throw error; - } + const { labels, number } = issue; + const summary = summaries.find((s) => s.issueId === issue.id); + if (!summary) { + throw new Error(`No summary found for issue #${number}`); + } + try { + const embedding = await createEmbedding( + { + input: formatIssueForEmbedding({ + issue, + labels, + summary, + }), + }, + openai, + ); + return { + issueId: issue.id, + embedding, + }; + } catch (error) { + if (isReducePromptError(error)) { + // TODO: do something? + throw error; } } - throw new Error( - `Failed to create embedding for issue #${issue.number} after ${TRUNCATION_MAX_ATTEMPTS} attempts`, - ); + throw new Error(`Failed to create embedding for issue #${number}`); }; return await pMap(issues, processIssue, { concurrency: concurrencyLimit }); } @@ -86,7 +85,7 @@ export async function createEmbeddings({ export async function selectIssuesForEmbeddingInit( issueIds: string[], db: DbClient, -) { +): Promise { return await db .select({ id: issueTable.id, @@ -98,7 +97,8 @@ export async function selectIssuesForEmbeddingInit( issueStateReason: issueTable.issueStateReason, issueCreatedAt: issueTable.issueCreatedAt, issueClosedAt: issueTable.issueClosedAt, - labels: jsonAggBuildObjectFromJoin( + aggregateReactions: issueTable.aggregateReactions, + labels: jsonAggBuildObjectManyToMany( { name: labelTable.name, description: labelTable.description, @@ -110,6 +110,16 @@ export async function selectIssuesForEmbeddingInit( whereCondition: eq(issuesToLabels.issueId, issueTable.id), }, ), + comments: jsonAggBuildObjectOneToMany( + { + body: commentTable.body, + author: commentTable.author, + }, + { + from: commentTable, + foreignKeyEquals: eq(commentTable.issueId, issueTable.id), + }, + ), }) .from(issueTable) .leftJoin(issueEmbeddings, eq(issueEmbeddings.issueId, issueTable.id)) @@ -134,7 +144,7 @@ export async function selectIssuesForEmbeddingCron({ db: DbClient; numIssues: number; intervalInHours: number; -}) { +}): Promise { return await db.transaction(async (tx) => { const lockedIssues = tx.$with("locked_issues").as( tx @@ -149,6 +159,7 @@ export async function selectIssuesForEmbeddingCron({ issueCreatedAt: issueTable.issueCreatedAt, issueClosedAt: issueTable.issueClosedAt, issueUpdatedAt: issueTable.issueUpdatedAt, // needed for the WHERE clause later + aggregateReactions: issueTable.aggregateReactions, }) .from(issueTable) .innerJoin(repos, eq(repos.id, issueTable.repoId)) @@ -178,7 +189,8 @@ export async function selectIssuesForEmbeddingCron({ issueStateReason: lockedIssues.issueStateReason, issueCreatedAt: lockedIssues.issueCreatedAt, issueClosedAt: lockedIssues.issueClosedAt, - labels: jsonAggBuildObjectFromJoin( + aggregateReactions: lockedIssues.aggregateReactions, + labels: jsonAggBuildObjectManyToMany( { name: labelTable.name, description: labelTable.description, @@ -190,6 +202,16 @@ export async function selectIssuesForEmbeddingCron({ whereCondition: eq(issuesToLabels.issueId, lockedIssues.id), }, ), + comments: jsonAggBuildObjectOneToMany( + { + body: commentTable.body, + author: commentTable.author, + }, + { + from: commentTable, + foreignKeyEquals: eq(commentTable.issueId, lockedIssues.id), + }, + ), }) .from(lockedIssues) .leftJoin(issueEmbeddings, eq(issueEmbeddings.issueId, lockedIssues.id)) @@ -283,40 +305,39 @@ export async function unstuckIssueEmbeddings(db: DbClient) { } interface FormatIssueParams { - attempt: number; issue: SelectIssueForEmbedding; labels: SelectLabelForEmbedding[]; + summary: IssueSummary; } /* Alternate way to format issue for embedding */ /* Instead of truncating the body repeatedly, we could pass the body into a LLM and obtain a summary. Then, we pass the summary into the embedding API instead. */ function formatIssueForEmbedding({ issue, - attempt = 0, labels, + summary, }: FormatIssueParams): string { const { number, author, title, - body, issueState, issueStateReason, issueCreatedAt, issueClosedAt, } = issue; - // Truncate body to roughly 6000 tokens to leave room for other fields - const truncatedBody = truncateText(body, attempt); + const { commentsSummary, bodySummary } = summary; return ( dedent` Issue #${number}: ${title} - Body: ${truncatedBody} + Body: ${bodySummary} ${labels ? `Labels: ${labels.map((label) => `${label.name}${label.description ? ` (${label.description})` : ""}`).join(", ")}` : ""} ` + // the following are "metadata" fields, but including them because conceivably // users may include them in their search dedent` + ${commentsSummary ? `Comments: ${commentsSummary}` : ""} State: ${issueState} State Reason: ${issueStateReason} ${author ? `Author: ${author.name}` : ""} @@ -326,24 +347,24 @@ function formatIssueForEmbedding({ ); } -function truncateText(text: string, attempt: number): string { - // currently, it seem like issues that have huge blocks of code and logs are being tokenized very differently from this heuristic - // we first truncate per the body schema - const MAX_BODY_SIZE_KB = 8; - const CODE_BLOCK_PREVIEW_LINES = 10; - text = truncateToByteSize( - truncateCodeBlocks(text, CODE_BLOCK_PREVIEW_LINES), - MAX_BODY_SIZE_KB * 1024, - ); - // DISCUSSION: - // - could use a tokenizer to more accurately measure token length, e.g. https://github.com/dqbd/tiktoken - // - alternatively, the error returned by OpenAI also tells you how many token it is and hence how much it needs to be reduced - const TRUNCATION_FACTOR = 0.75; // after 8x retry, will be 10% of original length - const TRUNCATION_MAX_TOKENS = 6000; // somewhat arbitrary - // Rough approximation: 1 token ≈ 4 characters - const maxChars = Math.floor( - TRUNCATION_MAX_TOKENS * 4 * Math.pow(TRUNCATION_FACTOR, attempt), - ); - if (text.length <= maxChars) return text; - return text.slice(0, maxChars); -} +// function truncateText(text: string, attempt: number): string { +// // currently, it seem like issues that have huge blocks of code and logs are being tokenized very differently from this heuristic +// // we first truncate per the body schema +// const MAX_BODY_SIZE_KB = 8; +// const CODE_BLOCK_PREVIEW_LINES = 10; +// text = truncateToByteSize( +// truncateCodeBlocks(text, CODE_BLOCK_PREVIEW_LINES), +// MAX_BODY_SIZE_KB * 1024, +// ); +// // DISCUSSION: +// // - could use a tokenizer to more accurately measure token length, e.g. https://github.com/dqbd/tiktoken +// // - alternatively, the error returned by OpenAI also tells you how many token it is and hence how much it needs to be reduced +// const TRUNCATION_FACTOR = 0.75; // after 8x retry, will be 10% of original length +// const TRUNCATION_MAX_TOKENS = 6000; // somewhat arbitrary +// // Rough approximation: 1 token ≈ 4 characters +// const maxChars = Math.floor( +// TRUNCATION_MAX_TOKENS * 4 * Math.pow(TRUNCATION_FACTOR, attempt), +// ); +// if (text.length <= maxChars) return text; +// return text.slice(0, maxChars); +// } diff --git a/packages/core/src/openai/index.ts b/packages/core/src/openai/index.ts index 86e8af54..59fbeeb9 100644 --- a/packages/core/src/openai/index.ts +++ b/packages/core/src/openai/index.ts @@ -9,3 +9,5 @@ export function createOpenAIClient(apiKey: string) { export type OpenAIClient = ReturnType; export const EMBEDDING_MODEL = "text-embedding-3-small"; + +export const SUMMARY_MODEL = "o3-mini-2025-01-31"; diff --git a/packages/core/src/openai/schema.ts b/packages/core/src/openai/schema.ts index 6564ac2c..6c920ab6 100644 --- a/packages/core/src/openai/schema.ts +++ b/packages/core/src/openai/schema.ts @@ -12,3 +12,11 @@ export const embeddingsCreateSchema = z model: z.string(), }) .strip(); + +export const chatCompletionSchema = z.object({ + choices: z.array( + z.object({ + message: z.object({ content: z.string() }), + }), + ), +}); diff --git a/packages/core/src/semsearch/db.ts b/packages/core/src/semsearch/db.ts index 41f9a388..e5918d13 100644 --- a/packages/core/src/semsearch/db.ts +++ b/packages/core/src/semsearch/db.ts @@ -14,7 +14,7 @@ import { publicCollections } from "@/db/schema/entities/public-collection.sql"; import { repos } from "@/db/schema/entities/repo.sql"; import { usersToRepos } from "@/db/schema/entities/user-to-repo.sql"; import { lower } from "@/db/utils/general"; -import { jsonAggBuildObjectFromJoin, jsonContains } from "@/db/utils/json"; +import { jsonAggBuildObjectManyToMany, jsonContains } from "@/db/utils/json"; import type { SearchParams } from "./schema.output"; import { parseSearchQuery } from "./util"; @@ -24,7 +24,7 @@ export function getBaseSelect() { id: issueTable.id, number: issueTable.number, title: issueTable.title, - labels: jsonAggBuildObjectFromJoin( + labels: jsonAggBuildObjectManyToMany( { name: labels.name, color: labels.color, @@ -46,6 +46,7 @@ export function getBaseSelect() { issueUpdatedAt: issueTable.issueUpdatedAt, aggregateReactions: issueTable.aggregateReactions, topCommenters: issueTable.topCommenters, + overallSummary: issueTable.overallSummary, repoName: repos.name, repoUrl: sql`${repos.htmlUrl}`.as("repoUrl"), repoOwnerName: repos.ownerLogin, diff --git a/packages/core/src/semsearch/index.ts b/packages/core/src/semsearch/index.ts index bee182b4..072aa6e3 100644 --- a/packages/core/src/semsearch/index.ts +++ b/packages/core/src/semsearch/index.ts @@ -278,6 +278,7 @@ async function filterBeforeVectorSearch( repoOwnerName: vectorSearchSubquery.repoOwnerName, repoLastSyncedAt: vectorSearchSubquery.repoLastSyncedAt, commentCount: vectorSearchSubquery.commentCount, + overallSummary: vectorSearchSubquery.overallSummary, rankingScore, similarityScore, // Add window function to get total count in same query diff --git a/packages/core/src/semsearch/schema.output.ts b/packages/core/src/semsearch/schema.output.ts index 61efe7fc..b43df78a 100644 --- a/packages/core/src/semsearch/schema.output.ts +++ b/packages/core/src/semsearch/schema.output.ts @@ -59,6 +59,7 @@ const searchIssueSchema = createSelectSchema(issueTable, { issueCreatedAt: true, issueClosedAt: true, issueUpdatedAt: true, + overallSummary: true, }) .extend({ labels: z.array(selectLabelForSearchSchema), diff --git a/packages/core/src/summary.ts b/packages/core/src/summary.ts new file mode 100644 index 00000000..fbf4b092 --- /dev/null +++ b/packages/core/src/summary.ts @@ -0,0 +1,221 @@ +import dedent from "dedent"; + +import { inArray, sql } from "./db"; +import type { DbClient } from "./db"; +import type { SelectIssueForEmbedding } from "./db/schema/entities/issue.schema"; +import { issueTable } from "./db/schema/entities/issue.sql"; +import type { Author } from "./db/schema/shared"; +import type { OpenAIClient } from "./openai"; +import { SUMMARY_MODEL } from "./openai"; +import { chatCompletionSchema } from "./openai/schema"; + +async function summarize( + { + textToSummarize, + systemPrompt, + userInstructions, + reasoningEffort = "high", + }: { + textToSummarize: string; + systemPrompt: string; + userInstructions: string; + reasoningEffort?: "low" | "medium" | "high"; + }, + openai: OpenAIClient, +): Promise { + const messages = [ + { + role: "system" as const, + content: systemPrompt, + }, + { + role: "user" as const, + content: `${userInstructions}\n\n${textToSummarize}`, + }, + ]; + + const response = await openai.chat.completions.create({ + model: SUMMARY_MODEL, + messages, + reasoning_effort: reasoningEffort, + }); + const result = chatCompletionSchema.parse(response); + return result.choices[0]!.message.content; +} + +// Predefined prompts and instructions +const PROMPTS = { + issueBody: { + system: + "You are a helpful assistant that generates concise summaries of GitHub issue descriptions. Describe the issue directly and focus on the problem, proposed solutions, and key technical details.", + user: "Please summarize this GitHub issue description in no more than 3 short paragraphs. Just provide the summary directly.", + }, + comments: { + system: + "You are a helpful assistant that generates concise summaries of GitHub issue comments. Summarise the comments so that a human can capture the main points of discussion without reading the entire comment thread. If you include the name of the author, make sure to stick to the original casing and don't modify it.", + user: "Please summarize the discussion in these GitHub issue comments in no more than 3 short paragraphs. Just provide the summary directly.", + }, + overall: { + system: + "You are a helpful assistant that generates concise overall summaries of GitHub issues so that a human can understand the issue at a glance. You will be provided with information of the issue, a summary of of the issue body and a summary of the comments, and additional context. Don't use 'this issue' or 'this discussion', just provide the summary directly.", + user: "Please provide a direct summary of this issue based on the provided information in a single paragraph no more than 5 sentences.", + }, +} as const; + +export async function generateBodySummary( + body: string, + openai: OpenAIClient, +): Promise { + return await summarize( + { + textToSummarize: body, + systemPrompt: PROMPTS.issueBody.system, + userInstructions: PROMPTS.issueBody.user, + }, + openai, + ); +} + +export async function generateCommentsSummary( + comments: Array<{ body: string; author: Author }>, + openai: OpenAIClient, +): Promise { + if (!comments.length) { + return ""; + } + + return await summarize( + { + textToSummarize: comments + .map((c) => { + const authorName = c.author?.name || "Deleted User"; + return dedent` + ${authorName} wrote: + ${c.body}`; + }) + .join("\n\n---\n\n"), + systemPrompt: PROMPTS.comments.system, + userInstructions: PROMPTS.comments.user, + }, + openai, + ); +} + +export async function generateOverallSummary( + params: { + bodySummary: string; + commentsSummary: string | null; + issue: SelectIssueForEmbedding; + }, + openai: OpenAIClient, +): Promise { + const { + title, + author, + issueState: state, + issueStateReason: stateReason, + issueCreatedAt: createdAt, + issueClosedAt: closedAt, + labels, + aggregateReactions, + } = params.issue; + + // Transform aggregate reactions into a human-readable string + // Format: "thumbs up (5), heart (3)" for reactions with count > 0 + const reactionsSummary = aggregateReactions + ? Object.entries(aggregateReactions) + // Only include reactions that have been used + .filter(([, count]) => count > 0) + // Format each reaction as "reaction_name (count)" + .map( + ([reaction, count]) => + `${reaction.toLowerCase().replace("_", " ")} (${count})`, + ) + // Join all reactions with commas + .join(", ") + : ""; + + // Transform labels into a human-readable string + // Format: "bug (needs triage), feature (high priority)" + const labelsSummary = labels?.length + ? labels + .map( + (label) => + `${label.name}${label.description ? ` (${label.description})` : ""}`, + ) + .join(", ") + : ""; + + const text = dedent` + Issue: ${title} + + Description Summary: + ${params.bodySummary} + + ${params.commentsSummary ? `Comments Summary: ${params.commentsSummary}\n` : ""} + Additional Context: + - State: ${state}${stateReason ? `, Reason: ${stateReason}` : ""} + - Author: ${author?.name || "Deleted User"} + - Created: ${createdAt.toISOString()}${closedAt ? `\n- Closed: ${closedAt.toISOString()}` : ""} + ${labelsSummary ? `- Labels: ${labelsSummary}` : ""} + ${reactionsSummary ? `- Reactions: ${reactionsSummary}` : ""}`; + + return await summarize( + { + textToSummarize: text, + systemPrompt: PROMPTS.overall.system, + userInstructions: PROMPTS.overall.user, + }, + openai, + ); +} + +export interface IssueSummary { + issueId: string; + bodySummary: string; + commentsSummary: string | null; + overallSummary: string; +} + +export async function bulkUpdateIssueSummaries( + summaries: IssueSummary[], + db: DbClient, +): Promise { + if (summaries.length === 0) return; + + const sqlChunks = { + bodySummary: [sql`(case`], + commentsSummary: [sql`(case`], + overallSummary: [sql`(case`], + }; + + const issueIds = summaries.map((s) => s.issueId); + + for (const summary of summaries) { + sqlChunks.bodySummary.push( + sql`when id = ${summary.issueId} then ${summary.bodySummary}`, + ); + sqlChunks.commentsSummary.push( + sql`when id = ${summary.issueId} then ${summary.commentsSummary === null ? sql`null` : summary.commentsSummary}`, + ); + sqlChunks.overallSummary.push( + sql`when id = ${summary.issueId} then ${summary.overallSummary}`, + ); + } + + for (const key of Object.keys(sqlChunks) as Array) { + sqlChunks[key].push(sql`end)`); + } + + await db + .update(issueTable) + .set({ + bodySummary: sql.join(sqlChunks.bodySummary, sql.raw(" ")), + commentsSummary: sql.join(sqlChunks.commentsSummary, sql.raw(" ")), + overallSummary: sql.join(sqlChunks.overallSummary, sql.raw(" ")), + }) + .where(inArray(issueTable.id, issueIds)); +} + +// Export for testing or custom usage +export { summarize, PROMPTS }; diff --git a/packages/scripts/src/script.ts b/packages/scripts/src/script.ts index 796a43c9..13bc4a10 100644 --- a/packages/scripts/src/script.ts +++ b/packages/scripts/src/script.ts @@ -1,29 +1,14 @@ -import { eq } from "drizzle-orm"; - -import { repos } from "@/core/db/schema/entities/repo.sql"; +import { generateBodySummary } from "@/core/summary"; import { getDeps } from "./deps"; -const { db, closeConnection } = await getDeps(); -const repoId = "rep_01JEK73YA0FDWVBEN21R4ATTB4"; +const { openai } = await getDeps(); try { - const [result] = await db - .select({ - initStatus: repos.initStatus, - repoName: repos.name, - repoOwner: repos.ownerLogin, - isPrivate: repos.isPrivate, - repoSyncCursor: repos.syncCursor, - }) - .from(repos) - .where(eq(repos.id, repoId)) - .limit(1); - if (!result) { - throw new Error("Repo not found"); - } + const result = await generateBodySummary( + "We need basic workspace creation for our Alpha that will allow minimum needed functionality and so our team can use it for dogfooding. This epic focuses on the essential UI, CLI and API for Workspace Creation in V2 Alpha", + openai, + ); console.log(result); } catch (e) { console.error(e); -} finally { - await closeConnection(); } diff --git a/packages/web/src/components/search/IssueCard.tsx b/packages/web/src/components/search/IssueCard.tsx index 4c137766..61327c52 100644 --- a/packages/web/src/components/search/IssueCard.tsx +++ b/packages/web/src/components/search/IssueCard.tsx @@ -5,6 +5,7 @@ import { CircleDotIcon, CircleSlashIcon, MessageSquareIcon, + ScanEyeIcon, } from "lucide-react"; import { NORMALIZATION_ANCHOR } from "@/core/constants/ranking.constant"; @@ -12,7 +13,13 @@ import type { AggregateReactions } from "@/core/db/schema/shared"; import type { PublicSearchIssuesResponse } from "@/lib/api/search"; import { formatLocalDateTime, getTimeAgo } from "@/lib/time"; import { Badge } from "@/components/ui/badge"; +import { Button } from "@/components/ui/button"; import { FastTooltip } from "@/components/ui/fast-tooltip"; +import { + Popover, + PopoverContent, + PopoverTrigger, +} from "@/components/ui/popover"; import { Tooltip, TooltipContent, @@ -213,6 +220,27 @@ function IssueTitleWithLabels({ issue }: { issue: Issue }) { rankingScore={issue.rankingScore} similarityScore={issue.similarityScore} /> + {issue.overallSummary && ( + + + + + +
+
Summary
+
+ {issue.overallSummary} +
+
+
+
+ )}
  • - Help users find answers faster with semantic search + ✨ Help users find answers faster with semantic search and + quick summaries
  • - Reduce duplicate issues by making existing ones discoverable + 🎯 Reduce duplicate issues by making existing ones + discoverable
  • - Simple setup - just add a badge to your README + + 🆓 Simple setup and free to use, just add a badge to your + README! +
  • - Search across pull requests and discussions + 💬 Search across pull requests and discussions Coming soon @@ -53,7 +58,7 @@ function YourRepoPage() {
  • - Search across a collection of multiple repos, including + 🔒 Search across a collection of multiple repos, including private repos Coming soon diff --git a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts index c4b16b26..7a725442 100644 --- a/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts +++ b/packages/wrangler/src/workflows/sync/embedding/embedding.workflow.ts @@ -13,6 +13,12 @@ import { selectIssuesForEmbeddingInit, upsertIssueEmbeddings, } from "@/core/embedding"; +import { + bulkUpdateIssueSummaries, + generateBodySummary, + generateCommentsSummary, + generateOverallSummary, +} from "@/core/summary"; import { chunkArray } from "@/core/util/truncate"; import { getDeps } from "@/deps"; import { getEnvPrefix } from "@/util"; @@ -30,8 +36,9 @@ interface Env extends WranglerEnv { } /* two modes -1. as part of repo init. takes an array of issueIds (100 at a time), calls DB, creates embeddings, update DB -2. as part of cron sync. no parameter. just query all out-of-sync issueIds 100 at a time, create embeddings, update DB, calls itself recursively until no more such issues are found +1. as part of repo init. takes an array of issueIds (100 at a time), calls DB, creates embeddings and generate summaries, update DB +2. as part of cron sync. no parameter. just query all out-of-sync issueIds 100 at a time, create embeddings and generate summaries, update DB +calls itself recursively until no more such issues are found */ export type EmbeddingParams = | { @@ -86,16 +93,129 @@ export class EmbeddingWorkflow extends WorkflowEntrypoint< idx: number, totalBatches: number, ): Promise => { + // Generate both summaries in parallel + const [bodySummaries, commentSummaries] = await Promise.all([ + step.do( + `generate body summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("long"), + async () => { + return await Promise.all( + issues.map(async (issue) => ({ + issueId: issue.id, + bodySummary: + issue.body.length > 1000 + ? await generateBodySummary(issue.body, openai) + : issue.body, + })), + ); + }, + ), + step.do( + `generate comment summaries for selected issues (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("long"), + async () => { + return await Promise.all( + issues.map(async (issue) => ({ + issueId: issue.id, + commentsSummary: + issue.comments.length === 0 + ? null + : issue.comments.reduce((acc, c) => acc + c.body, "") + .length > 1000 + ? await generateCommentsSummary(issue.comments, openai) + : issue.comments + .map( + (c) => + `${c.author?.name ?? "Deleted User"}: ${c.body}`, + ) + .join("\n"), + })), + ); + }, + ), + ]); + + const overallSummaries = await step.do( + `generate overall summaries (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("long"), + async () => { + return await Promise.all( + issues.map(async (issue) => { + const bodySummary = bodySummaries.find( + (s) => s.issueId === issue.id, + )?.bodySummary; + const commentsSummary = commentSummaries.find( + (s) => s.issueId === issue.id, + )?.commentsSummary; + // this should never happen + if ( + bodySummary === undefined || + commentsSummary === undefined + ) { + throw new Error(`No summary found for issue #${issue.id}`); + } + return { + issueId: issue.id, + overallSummary: await generateOverallSummary( + { + bodySummary, + commentsSummary, + issue, + }, + openai, + ), + }; + }), + ); + }, + ); + + // Update issues with summaries + const summaries = await step.do( + `bulk update issues with summaries in db (batch ${idx + 1} of ${totalBatches})`, + getStepDuration("medium"), + async () => { + const summaries = issues.map((issue) => { + const bodySummary = bodySummaries.find( + (s) => s.issueId === issue.id, + )?.bodySummary; + const commentsSummary = commentSummaries.find( + (s) => s.issueId === issue.id, + )?.commentsSummary; + const overallSummary = overallSummaries.find( + (s) => s.issueId === issue.id, + )?.overallSummary; + if ( + bodySummary === undefined || + commentsSummary === undefined || + overallSummary === undefined + ) { + throw new Error(`No summary found for issue #${issue.id}`); + } + return { + issueId: issue.id, + bodySummary, + commentsSummary, + overallSummary, + }; + }); + await bulkUpdateIssueSummaries(summaries, dbSession); + return summaries; + }, + ); + const embeddings = await step.do( `create embeddings for selected issues from API (batch ${idx + 1} of ${totalBatches})`, getStepDuration("medium"), async () => { return await createEmbeddings({ issues, + summaries, openai, }); }, ); + await step.do( `upsert issue embeddings in db (batch ${idx + 1})`, getStepDuration("medium"), diff --git a/packages/wrangler/src/workflows/sync/issue/issue.workflow.ts b/packages/wrangler/src/workflows/sync/issue/issue.workflow.ts index f84d6849..684528c0 100644 --- a/packages/wrangler/src/workflows/sync/issue/issue.workflow.ts +++ b/packages/wrangler/src/workflows/sync/issue/issue.workflow.ts @@ -70,8 +70,6 @@ export class IssueWorkflow extends WorkflowEntrypoint { const name = `${repoOwner}/${repoName}`; caughtName = name; caughtRepoId = repoId; - // don't have to worry about getting same issues twice because - // we are using hasNextPage to determine if we should continue let syncCursor = repoSyncCursor; while (true) { diff --git a/packages/wrangler/src/workflows/sync/repo-init/init.workflow.ts b/packages/wrangler/src/workflows/sync/repo-init/init.workflow.ts index d71ef3c6..8d53f32c 100644 --- a/packages/wrangler/src/workflows/sync/repo-init/init.workflow.ts +++ b/packages/wrangler/src/workflows/sync/repo-init/init.workflow.ts @@ -135,7 +135,7 @@ export class RepoInitWorkflow extends WorkflowEntrypoint { attempt++ ) { const numIssues = getNumIssues(attempt); - // only use queryCursor's after if its since is the same as the previous + // only use syncCursor's after if its since is the same as the previous // else, just use null and use the new since const result = await getLatestGithubRepoIssues({ repoId,