From a7eb2f7c6afe1e4777444bce7b5a3950bfa56e81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= <mo.geryy@gmail.com>
Date: Thu, 30 Jan 2025 08:16:51 +0100
Subject: [PATCH 1/2] fix(crawler/rust): dedupe

---
 apps/api/src/scraper/WebScraper/crawler.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index ea606f4409..14ae5d7104 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -401,13 +401,13 @@ export class WebCrawler {
 
   public async extractLinksFromHTML(html: string, url: string) {
     try {
-      return (await this.extractLinksFromHTMLRust(html, url)).map(x => {
+      return [...new Set((await this.extractLinksFromHTMLRust(html, url)).map(x => {
         try {
           return new URL(x, url).href
         } catch (e) {
           return null;
         }
-      }).filter(x => x !== null) as string[];
+      }).filter(x => x !== null) as string[])];
     } catch (error) {
       this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
         error,

From 71878cf4d9c053dfc8a494bb1688b91f32ffd78f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= <mo.geryy@gmail.com>
Date: Thu, 30 Jan 2025 16:07:36 +0100
Subject: [PATCH 2/2] fix(cc): hotfix

---
 apps/api/src/lib/concurrency-limit.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/api/src/lib/concurrency-limit.ts b/apps/api/src/lib/concurrency-limit.ts
index 8205113ffe..a2d2707753 100644
--- a/apps/api/src/lib/concurrency-limit.ts
+++ b/apps/api/src/lib/concurrency-limit.ts
@@ -9,6 +9,7 @@ const constructQueueKey = (team_id: string) =>
 const stalledJobTimeoutMs = 2 * 60 * 1000;
 
 export function getConcurrencyLimitMax(plan: string): number {
+  if (plan === "growth") return 100;
   return getRateLimiterPoints(RateLimiterMode.Scrape, undefined, plan);
 }
 