diff --git a/extensions/diffbot/src/knowledge-connectors/diffbotCrawlerConnector.test.ts b/extensions/diffbot/src/knowledge-connectors/diffbotCrawlerConnector.test.ts new file mode 100644 index 00000000..6ebda50a --- /dev/null +++ b/extensions/diffbot/src/knowledge-connectors/diffbotCrawlerConnector.test.ts @@ -0,0 +1,484 @@ +import * as assert from "node:assert"; +import { beforeEach, describe, it, mock } from "node:test"; +import type { + CreateKnowledgeChunkReturnValue, + CreateKnowledgeSourceReturnValue, + DeleteKnowledgeSourceReturnValue, + KnowledgeApi, + UpsertKnowledgeSourceReturnValue, +} from "@cognigy/extension-tools/build/interfaces/knowledgeConnector"; +import { diffbotCrawlerConnector } from "./diffbotCrawlerConnector"; +import type { DiffbotJobStatusResponse, DiffbotResult } from "./types"; + +type DeepPartial = { + [K in keyof T]?: T[K] extends object ? DeepPartial : Partial; +}; + +describe("diffbotCrawlerConnector", () => { + // ── KnowledgeApi Mock Setup ────────────────────────────── + let mockApi: { [K in keyof KnowledgeApi]?: it.Mock } = {}; + let createKnowledgeSourceResult: CreateKnowledgeSourceReturnValue; + let deleteKnowledgeSourceResult: DeleteKnowledgeSourceReturnValue; + let upsertKnowledgeSourceResult: UpsertKnowledgeSourceReturnValue; + let createKnowledgeChunkResult: CreateKnowledgeChunkReturnValue; + + beforeEach(() => { + mockApi = { + createKnowledgeSource: mock.fn( + () => Promise.resolve(createKnowledgeSourceResult) as any, + ), + deleteKnowledgeSource: mock.fn( + () => Promise.resolve(deleteKnowledgeSourceResult) as any, + ), + upsertKnowledgeSource: mock.fn( + () => Promise.resolve(upsertKnowledgeSourceResult) as any, + ), + createKnowledgeChunk: mock.fn( + () => Promise.resolve(createKnowledgeChunkResult) as any, + ), + }; + }); + + // ── Diffbot API Mock Helpers ────────────────────────────── + + const defaultCrawlerResult: DiffbotResult = { + title: "Test Page", + type: "article", + pageUrl: "https://example.com/page1", + 
text: "This is the page content.", + humanLanguage: "en", + html: "

Should be filtered

", + }; + + /** + * Creates a complete config object with all required and optional fields + */ + const createTestConfig = (overrides: Record = {}) => ({ + connection: { accessToken: "test-token" }, + seeds: ["https://example.com"], + extractApiType: "analyze", + querystring: "", + maxToCrawl: 100, + maxToCrawlPerSubdomain: -1, + maxHops: -1, + crawlDelay: 0.25, + obeyRobots: true, + restrictDomain: true, + restrictSubdomain: false, + useProxies: false, + useCanonical: true, + maxToProcess: 100, + maxToProcessPerSubdomain: 100, + sourceTags: ["Web Page"], + retainCrawler: false, + urlCrawlPattern: [] as string[], + urlCrawlRegEx: "", + urlProcessPattern: [] as string[], + urlProcessRegEx: "", + pageProcessPattern: [] as string[], + userAgent: "", + referer: "", + cookie: "", + acceptLanguage: "", + ...overrides, + }); + + /** + * Mocks the POST request to create a crawl job + */ + const mockCreateCrawlJobResponse = () => + mock.method( + global, + "fetch", + async () => ({ + ok: true, + json: async () => ({ response: "Job created successfully" }), + }), + { times: 1 }, + ); + + /** + * Mocks the GET request to check job status + * @param response - Partial job-status payload that the mocked fetch resolves with from json(); + * defaults to a single job with status 9 (complete) and a downloadJson URL + */ + const mockGetJobStatusResponse = ( + response: DeepPartial = { + jobs: [ + { + jobStatus: { + status: 9, // Complete + message: "Job completed successfully", + }, + downloadJson: "https://api.diffbot.com/v3/crawl/download/test-job", + }, + ], + }, + ) => + mock.method( + global, + "fetch", + async () => ({ + ok: true, + json: async () => response, + }), + { times: 1 }, + ); + + /** + * Mocks the GET request to download job data + */ + const mockGetJobDataResponse = ( + results: DiffbotResult[] = [defaultCrawlerResult], + ) => + mock.method( + global, + "fetch", + async () => ({ + ok: true, + json: async () => results, + }), + { times: 1 }, + ); + + /** + * Mocks the DELETE request to delete a crawl job + */ + const 
mockDeleteJobResponse = () => + mock.method( + global, + "fetch", + async () => ({ + ok: true, + json: async () => ({ response: "Job deleted successfully" }), + }), + { times: 1 }, + ); + + // ── Test Cases ─────────────────────────────────────────── + + it("should create a new knowledge source with chunks from crawled pages", async () => { + // Arrange: Stack mock responses in reverse call order + mockDeleteJobResponse(); // Consumed last (delete job) + mockGetJobDataResponse([defaultCrawlerResult]); // Consumed fourth (get results from download URL) + mockGetJobStatusResponse(); // Consumed third (check status within getJobData) + mockGetJobStatusResponse(); // Consumed second (check status in monitor loop) + mockCreateCrawlJobResponse(); // Consumed first (create job) + upsertKnowledgeSourceResult = { knowledgeSourceId: "source-123" }; + + // Act + await diffbotCrawlerConnector.function({ + api: mockApi as KnowledgeApi, + config: createTestConfig(), + sources: [], + }); + + // Assert: Knowledge source upsert + assert.strictEqual(mockApi.upsertKnowledgeSource.mock.calls.length, 1); + assert.partialDeepStrictEqual( + mockApi.upsertKnowledgeSource.mock.calls[0].arguments[0], + { + name: "Test Page", + description: "Content from web page at https://example.com/page1", + tags: ["Web Page"], + externalIdentifier: "https://example.com/page1", + }, + ); + assert.ok( + mockApi.upsertKnowledgeSource.mock.calls[0].arguments[0] + .contentHashOrTimestamp, + ); + assert.ok( + mockApi.upsertKnowledgeSource.mock.calls[0].arguments[0].chunkCount > 0, + ); + + // Assert: Chunk creation + assert.ok(mockApi.createKnowledgeChunk.mock.calls.length > 0); + assert.partialDeepStrictEqual( + mockApi.createKnowledgeChunk.mock.calls[0].arguments[0], + { + knowledgeSourceId: "source-123", + data: { + url: "https://example.com/page1", + title: "Test Page", + language: "en", + type: "article", + }, + }, + ); + + // Assert: No deletions + 
assert.strictEqual(mockApi.deleteKnowledgeSource.mock.calls.length, 0); + }); + + it("should create knowledge sources for multiple crawled pages", async () => { + // Arrange: Multiple crawl results + const crawlResults = [ + { + title: "First Page", + type: "article", + pageUrl: "https://example.com/page1", + text: "Content of first page.", + humanLanguage: "en", + }, + { + title: "Second Page", + type: "article", + pageUrl: "https://example.com/page2", + text: "Content of second page.", + humanLanguage: "en", + }, + ]; + + mockDeleteJobResponse(); + mockGetJobDataResponse(crawlResults); + mockGetJobStatusResponse(); // For getJobData + mockGetJobStatusResponse(); // For monitor loop + mockCreateCrawlJobResponse(); + + // Mock to return different IDs for each call + mockApi.upsertKnowledgeSource.mock.mockImplementation(() => + Promise.resolve( + mockApi.upsertKnowledgeSource.mock.callCount() > 0 + ? { knowledgeSourceId: "source-456" } + : { knowledgeSourceId: "source-123" }, + ), + ); + + // Act + await diffbotCrawlerConnector.function({ + api: mockApi as KnowledgeApi, + config: createTestConfig(), + sources: [], + }); + + // Assert: Two sources created + assert.strictEqual(mockApi.upsertKnowledgeSource.mock.calls.length, 2); + assert.partialDeepStrictEqual( + mockApi.upsertKnowledgeSource.mock.calls[0].arguments[0], + { + name: "First Page", + description: "Content from web page at https://example.com/page1", + tags: ["Web Page"], + externalIdentifier: "https://example.com/page1", + }, + ); + assert.partialDeepStrictEqual( + mockApi.upsertKnowledgeSource.mock.calls[1].arguments[0], + { + name: "Second Page", + description: "Content from web page at https://example.com/page2", + tags: ["Web Page"], + externalIdentifier: "https://example.com/page2", + }, + ); + + // Assert: Chunks created for both sources + assert.ok(mockApi.createKnowledgeChunk.mock.calls.length >= 2); + + // Assert: No deletions + assert.strictEqual(mockApi.deleteKnowledgeSource.mock.calls.length, 
0); + }); + + it("should upsert an existing knowledge source with chunks", async () => { + // Arrange + mockDeleteJobResponse(); + mockGetJobDataResponse([defaultCrawlerResult]); + mockGetJobStatusResponse(); // For getJobData + mockGetJobStatusResponse(); // For monitor loop + mockCreateCrawlJobResponse(); + upsertKnowledgeSourceResult = { knowledgeSourceId: "source-123" }; + + // Act: Call with existing source + await diffbotCrawlerConnector.function({ + api: mockApi as KnowledgeApi, + config: createTestConfig(), + sources: [ + { + knowledgeSourceId: "source-123", + name: "Test Page", + description: "Content from web page at https://example.com/page1", + chunkCount: 1, + externalIdentifier: "https://example.com/page1", + tags: ["Web Page"], + contentHashOrTimestamp: "previous-hash", + }, + ], + }); + + // Assert: Source upserted + assert.strictEqual(mockApi.upsertKnowledgeSource.mock.calls.length, 1); + assert.partialDeepStrictEqual( + mockApi.upsertKnowledgeSource.mock.calls[0].arguments[0], + { + name: "Test Page", + description: "Content from web page at https://example.com/page1", + externalIdentifier: "https://example.com/page1", + }, + ); + + // Assert: Chunks created + assert.ok(mockApi.createKnowledgeChunk.mock.calls.length > 0); + + // Assert: No deletions (same externalIdentifier) + assert.strictEqual(mockApi.deleteKnowledgeSource.mock.calls.length, 0); + }); + + it("should delete outdated sources when crawl returns different pages", async () => { + // Arrange: Crawl returns new page + const newCrawlResult = { + title: "New Page", + type: "article", + pageUrl: "https://example.com/new-page", + text: "Content of new page.", + humanLanguage: "en", + }; + + mockDeleteJobResponse(); + mockGetJobDataResponse([newCrawlResult]); + mockGetJobStatusResponse(); // For getJobData + mockGetJobStatusResponse(); // For monitor loop + mockCreateCrawlJobResponse(); + upsertKnowledgeSourceResult = { knowledgeSourceId: "new-source-123" }; + + // Act: Existing sources 
have different externalIdentifier + await diffbotCrawlerConnector.function({ + api: mockApi as KnowledgeApi, + config: createTestConfig(), + sources: [ + { + knowledgeSourceId: "old-source-123", + name: "Old Page", + description: "Content from web page at https://example.com/old-page", + chunkCount: 1, + externalIdentifier: "https://example.com/old-page", + tags: ["Web Page"], + contentHashOrTimestamp: "old-hash", + }, + ], + }); + + // Assert: New source created + assert.strictEqual(mockApi.upsertKnowledgeSource.mock.calls.length, 1); + assert.ok(mockApi.createKnowledgeChunk.mock.calls.length > 0); + + // Assert: Old source deleted + assert.strictEqual(mockApi.deleteKnowledgeSource.mock.calls.length, 1); + assert.strictEqual( + mockApi.deleteKnowledgeSource.mock.calls[0].arguments[0] + .knowledgeSourceId, + "old-source-123", + ); + }); + + it("should skip chunk ingestion when upsert returns null", async () => { + // Arrange + mockDeleteJobResponse(); + mockGetJobDataResponse([defaultCrawlerResult]); + mockGetJobStatusResponse(); // For getJobData + mockGetJobStatusResponse(); // For monitor loop + mockCreateCrawlJobResponse(); + upsertKnowledgeSourceResult = null; // Content unchanged + + // Act + await diffbotCrawlerConnector.function({ + api: mockApi as KnowledgeApi, + config: createTestConfig(), + sources: [ + { + knowledgeSourceId: "source-123", + name: "Test Page", + description: "Content from web page at https://example.com/page1", + chunkCount: 1, + externalIdentifier: "https://example.com/page1", + tags: ["Web Page"], + contentHashOrTimestamp: "hash", + }, + ], + }); + + // Assert: Upsert was called + assert.strictEqual(mockApi.upsertKnowledgeSource.mock.calls.length, 1); + + // Assert: No chunks created (content unchanged) + assert.strictEqual(mockApi.createKnowledgeChunk.mock.calls.length, 0); + + // Assert: No deletions + assert.strictEqual(mockApi.deleteKnowledgeSource.mock.calls.length, 0); + }); + + it("should retain crawler when retainCrawler is 
true", async () => { + // Arrange: No delete job mock added + mockGetJobDataResponse([defaultCrawlerResult]); + mockGetJobStatusResponse(); // For getJobData + mockGetJobStatusResponse(); // For monitor loop + mockCreateCrawlJobResponse(); + upsertKnowledgeSourceResult = { knowledgeSourceId: "source-123" }; + + // Act + await diffbotCrawlerConnector.function({ + api: mockApi as KnowledgeApi, + config: createTestConfig({ retainCrawler: true }), + sources: [], + }); + + // Assert: Source created successfully + assert.strictEqual(mockApi.upsertKnowledgeSource.mock.calls.length, 1); + assert.ok(mockApi.createKnowledgeChunk.mock.calls.length > 0); + + // Note: No assertion on deleteJob since we didn't mock it + // The test passes because the connector doesn't call deleteJob when retainCrawler is true + }); + + it("should skip pages without pageUrl", async () => { + // Arrange: Result without pageUrl + const resultWithoutUrl = { + title: "Test Page", + type: "article", + text: "Content without URL.", + humanLanguage: "en", + // pageUrl is missing + }; + + mockDeleteJobResponse(); + mockGetJobDataResponse([resultWithoutUrl]); + mockGetJobStatusResponse(); // For getJobData + mockGetJobStatusResponse(); // For monitor loop + mockCreateCrawlJobResponse(); + + // Act + await diffbotCrawlerConnector.function({ + api: mockApi as KnowledgeApi, + config: createTestConfig(), + sources: [], + }); + + // Assert: No sources created (pageUrl is required) + assert.strictEqual(mockApi.upsertKnowledgeSource.mock.calls.length, 0); + assert.strictEqual(mockApi.createKnowledgeChunk.mock.calls.length, 0); + }); + + it("should handle query string parameters", async () => { + // Arrange + mockDeleteJobResponse(); + mockGetJobDataResponse([defaultCrawlerResult]); + mockGetJobStatusResponse(); // For getJobData + mockGetJobStatusResponse(); // For monitor loop + mockCreateCrawlJobResponse(); + upsertKnowledgeSourceResult = { knowledgeSourceId: "source-123" }; + + // Act: With query string 
parameters + await diffbotCrawlerConnector.function({ + api: mockApi as KnowledgeApi, + config: createTestConfig({ + extractApiType: "article", + querystring: "fields=title,text&timeout=10", + }), + sources: [], + }); + + // Assert: Source created successfully (query string was processed) + assert.strictEqual(mockApi.upsertKnowledgeSource.mock.calls.length, 1); + assert.ok(mockApi.createKnowledgeChunk.mock.calls.length > 0); + }); +}); diff --git a/extensions/diffbot/src/knowledge-connectors/helper/crawler.ts b/extensions/diffbot/src/knowledge-connectors/helper/crawler.ts index 8b4b48a3..f86fe157 100644 --- a/extensions/diffbot/src/knowledge-connectors/helper/crawler.ts +++ b/extensions/diffbot/src/knowledge-connectors/helper/crawler.ts @@ -1,50 +1,15 @@ +import type { + CrawlJobSettings, + DiffbotCreateCrawlResponse, + DiffbotJobStatusResponse, + DiffbotResult, +} from "../types"; import { fetchWithRetry, logMessage } from "./utils"; -interface CrawlJobSettings { - name: string; - seeds: string[]; - apiUrl: string; - - // Crawling Settings - urlCrawlPattern?: string[]; - urlCrawlRegEx?: string; - maxToCrawl?: number; - maxToCrawlPerSubdomain?: number; - maxHops?: number; - crawlDelay?: number; - obeyRobots?: boolean; - restrictDomain?: boolean; - restrictSubdomain?: boolean; - useProxies?: boolean; - useCanonical?: boolean; - - // Processing Settings - urlProcessPattern?: string[]; - urlProcessRegEx?: string; - pageProcessPattern?: string[]; - maxToProcess?: number; - maxToProcessPerSubdomain?: number; - - // Custom Headers - userAgent?: string; - referer?: string; - cookie?: string; - acceptLanguage?: string; -} - -interface DiffbotResult { - title?: string; - type?: string; - pageUrl?: string; - html?: string; - humanLanguage?: string; - [key: string]: any; // Allow for additional properties -} - const SUCCESS_JOB_STATUS_CODES = [1, 2, 3, 5, 9]; const FAILED_JOB_STATUS_CODES = [6, 8, 10, 11]; // If Job is paused, it will be considered failed -class 
DiffbotCrawler { +export class DiffbotCrawler { private token: string; private crawlUrl: string; @@ -56,14 +21,20 @@ class DiffbotCrawler { /** * Create a new crawl job */ - async createCrawlJob(settings: CrawlJobSettings): Promise { + async createCrawlJob( + settings: CrawlJobSettings, + ): Promise { const payload = await this.getPayload(settings); const params = new URLSearchParams({ token: this.token }); - await fetchWithRetry(`${this.crawlUrl}?${params}`, { - method: "POST", - headers: { "Content-Type": "application/x-www-form-urlencoded" }, - body: payload.toString(), - }); + const response = await fetchWithRetry( + `${this.crawlUrl}?${params}`, + { + method: "POST", + headers: { "Content-Type": "application/x-www-form-urlencoded" }, + body: payload.toString(), + }, + ); + return response; } /** @@ -105,9 +76,11 @@ class DiffbotCrawler { /** * Get status of a crawl job */ - async getJobStatus(jobName: string): Promise { + async getJobStatus(jobName: string): Promise { const params = new URLSearchParams({ token: this.token, name: jobName }); - const response = await fetchWithRetry(`${this.crawlUrl}?${params}`); + const response = await fetchWithRetry( + `${this.crawlUrl}?${params}`, + ); return response; } @@ -117,22 +90,25 @@ class DiffbotCrawler { async getJobData(jobName: string): Promise { const params = new URLSearchParams({ token: this.token }); const status = await this.getJobStatus(jobName); - const downloadUrl = status.jobs?.[0].downloadJson; + const downloadUrl = status.jobs?.[0]?.downloadJson; return downloadUrl - ? ((await fetchWithRetry(`${downloadUrl}?${params}`)) as DiffbotResult[]) + ? 
await fetchWithRetry(`${downloadUrl}?${params}`) : []; } /** * Delete crawl job */ - async deleteJob(jobName: string): Promise { + async deleteJob(jobName: string): Promise { const params = new URLSearchParams({ token: this.token, name: jobName, delete: "1", }); - await fetchWithRetry(`${this.crawlUrl}?${params}`); + const response = await fetchWithRetry( + `${this.crawlUrl}?${params}`, + ); + return response; } /** @@ -182,5 +158,3 @@ class DiffbotCrawler { return params; } } - -export { DiffbotCrawler, type CrawlJobSettings }; diff --git a/extensions/diffbot/src/knowledge-connectors/types.ts b/extensions/diffbot/src/knowledge-connectors/types.ts index 743131d5..8045b36a 100644 --- a/extensions/diffbot/src/knowledge-connectors/types.ts +++ b/extensions/diffbot/src/knowledge-connectors/types.ts @@ -60,3 +60,141 @@ export interface DiffbotV3AnalyzeResponse { type: string; title: string; } + +// Crawler API Types + +/** + * Settings for creating a Diffbot crawl job + */ +export interface CrawlJobSettings { + name: string; + seeds: string[]; + apiUrl: string; + + // Crawling Settings + urlCrawlPattern?: string[]; + urlCrawlRegEx?: string; + maxToCrawl?: number; + maxToCrawlPerSubdomain?: number; + maxHops?: number; + crawlDelay?: number; + obeyRobots?: boolean; + restrictDomain?: boolean; + restrictSubdomain?: boolean; + useProxies?: boolean; + useCanonical?: boolean; + + // Processing Settings + urlProcessPattern?: string[]; + urlProcessRegEx?: string; + pageProcessPattern?: string[]; + maxToProcess?: number; + maxToProcessPerSubdomain?: number; + + // Custom Headers + userAgent?: string; + referer?: string; + cookie?: string; + acceptLanguage?: string; +} + +/** + * Represents a single result object extracted from a crawled page. + * The shape matches the output from Diffbot Extract APIs (Article, Product, etc.) 
+ */ +export interface DiffbotResult { + title?: string; + type?: string; + pageUrl?: string; + html?: string; + humanLanguage?: string; + text?: string; + [key: string]: any; // Allow for additional properties from various Extract API types +} + +/** + * Job status information returned by the Crawl API + * Status codes: + * 0 = Job is initializing + * 1 = Job has reached maxRounds limit + * 2 = Job has reached maxToCrawl limit + * 3 = Job has reached maxToProcess limit + * 4 = Next round to start in _ seconds + * 5 = No URLs were added to the crawl + * 6 = Job paused + * 7 = Job in progress + * 8 = All crawling temporarily paused by root administrator for maintenance + * 9 = Job has completed and no repeat is scheduled + * 10 = Failed to crawl any seed + * 11 = Job automatically paused because crawl is inefficient + */ +export interface DiffbotJobStatus { + status: number; + message: string; +} + +/** + * Detailed information about a crawl job returned by the Crawl API + */ +export interface DiffbotCrawlJob { + name: string; + type: string; + jobCreationTimeUTC: number; + jobCompletionTimeUTC: number; + jobStatus: DiffbotJobStatus; + sentJobDoneNotification: number; + objectsFound: number; + urlsHarvested: number; + pageCrawlAttempts: number; + pageCrawlSuccesses: number; + pageCrawlSuccessesThisRound: number; + pageProcessAttempts: number; + pageProcessSuccesses: number; + pageProcessSuccessesThisRound: number; + maxRounds: number; + repeat: number; + crawlDelay: number; + obeyRobots: number; + maxToCrawl: number; + maxToProcess: number; + onlyProcessIfNew?: number; + seeds: string; + roundsCompleted: number; + roundStartTime: number; + currentTime: number; + currentTimeUTC: number; + apiUrl: string; + urlCrawlPattern: string; + urlProcessPattern: string; + pageProcessPattern: string; + urlCrawlRegEx: string; + urlProcessRegEx: string; + maxHops: number; + downloadJson: string; + downloadUrls: string; + notifyEmail?: string; + notifyWebhook?: string; + 
useCanonical?: number; + useProxies?: number; + customHeaders?: Record; + seedRecrawlFrequency?: number; + maxToCrawlPerSubdomain?: number; + maxToProcessPerSubdomain?: number; + restrictDomain?: number; + restrictSubdomain?: number; +} + +/** + * Response from creating a crawl job (POST /v3/crawl) + */ +export interface DiffbotCreateCrawlResponse { + response: string; + jobs: DiffbotCrawlJob[]; +} + +/** + * Response from getting crawl job status (GET /v3/crawl) + */ +export interface DiffbotJobStatusResponse { + jobs: DiffbotCrawlJob[]; +}