From 056c7eac29c585832ff80e47d2b7d02568736e3b Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 25 Sep 2025 14:25:48 -0400
Subject: [PATCH 1/4] Add CLI option to respect robots.txt disallows

When enabled, the new --robots flag will result in the crawler fetching
robots.txt for each page origin, caching in Redis by URL to avoid
duplicate fetches, and checking if URLs are allowed by the policies
therein before queueing.
---
 package.json          |   1 +
 src/crawler.ts        | 107 ++++++++++++++++++++++++++++++++++++++++++
 src/util/argParser.ts |   7 +++
 src/util/logger.ts    |   1 +
 src/util/state.ts     |  14 ++++++
 5 files changed, 130 insertions(+)

diff --git a/package.json b/package.json
index 762854776..1e7a00dc4 100644
--- a/package.json
+++ b/package.json
@@ -34,6 +34,7 @@
     "pixelmatch": "^5.3.0",
     "pngjs": "^7.0.0",
     "puppeteer-core": "^24.22.0",
+    "robots-parser": "^3.0.1",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
     "tsc": "^2.0.4",
diff --git a/src/crawler.ts b/src/crawler.ts
index 39b1b2f5a..488e266cf 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -3,6 +3,8 @@ import path from "path";
 import fs, { WriteStream } from "fs";
 import os from "os";
 import fsp from "fs/promises";
+import { fetch as undiciFetch } from "undici";
+import robotsParser, { Robot } from "robots-parser";
 
 import {
   RedisCrawlState,
@@ -36,6 +38,7 @@ import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js";
 import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
 import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
 import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
+import { getProxyDispatcher } from "./util/proxy.js";
 
 import { Browser } from "./util/browser.js";
 
@@ -1249,6 +1252,98 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
+  async _fetchRobots(url: string) {
+    while (true) {
+      const resp = await undiciFetch(url, {
+        headers: this.headers,
+        dispatcher: getProxyDispatcher(url),
+      });
+
+      if (resp.ok) {
+        return resp;
+      }
+
+      const retry = resp.headers.get("retry-after");
+
+      if (retry) {
+        logger.debug(
+          "Robots.txt fetch: Retry after",
+          { url, retrySeconds: retry },
+          "robots",
+        );
+        await sleep(parseInt(retry));
+        continue;
+      }
+
+      logger.debug(
+        "Robots.txt not fetched",
+        { url, status: resp.status },
+        "robots",
+      );
+      return null;
+    }
+    return null;
+  }
+
+  async fetchAndParseRobots(
+    url: string,
+    logDetails: LogDetails,
+  ): Promise<Robot | null> {
+    // Fetch robots.txt for url's host and return parser, caching robots
+    // bodies in Redis by their URL
+    // TODO: Consider using an LRU cache/only cache so many robots responses
+    // in Redis at one time and re-fetch if no longer in cache to avoid
+    // exhausting memory on very large crawls across many hosts
+    const urlParser = new URL(url);
+    const robotsUrl = `${urlParser.origin}/robots.txt`;
+
+    const cachedRobots = await this.crawlState.getCachedRobots(robotsUrl);
+    if (cachedRobots) {
+      logger.debug(
+        "Using cached robots.txt body",
+        {
+          url: robotsUrl,
+          ...logDetails,
+        },
+        "robots",
+      );
+      return robotsParser(robotsUrl, cachedRobots);
+    }
+
+    try {
+      logger.debug(
+        "Fetching robots.txt",
+        { url: robotsUrl, ...logDetails },
+        "robots",
+      );
+      const resp = await this._fetchRobots(robotsUrl);
+      if (!resp) {
+        return null;
+      }
+      const content = await resp.text();
+
+      logger.debug(
+        "Caching robots.txt body",
+        { url: robotsUrl, ...logDetails },
+        "robots",
+      );
+      await this.crawlState.setCachedRobots(robotsUrl, content);
+
+      return robotsParser(robotsUrl, content);
+    } catch (e) {
+      // ignore
+    }
+
+    logger.warn(
+      "Failed to fetch robots.txt",
+      {
+        url: robotsUrl,
+        ...logDetails,
+      },
+      "robots",
+    );
+    return null;
+  }
+
   async awaitPageExtraDelay(opts: WorkerState) {
     if (this.params.pageExtraDelay) {
       const {
@@ -2462,6 +2557,18 @@ self.__bx_behaviors.selectMainBehavior();
       return false;
     }
 
+    if (this.params.robots) {
+      const robots = await this.fetchAndParseRobots(url, logDetails);
+      if (robots && robots.isDisallowed(url, "Browsertrix/1.0")) {
+        logger.debug(
+          "Page URL not queued, disallowed by robots.txt",
+          { url, ...logDetails },
+          "links",
+        );
+        return false;
+      }
+    }
+
     const result = await this.crawlState.addToQueue(
       { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,
diff --git a/src/util/argParser.ts b/src/util/argParser.ts
index cd64e8fd6..0ea4a898a 100644
--- a/src/util/argParser.ts
+++ b/src/util/argParser.ts
@@ -683,6 +683,13 @@
         "path to SSH known hosts file for SOCKS5 over SSH proxy connection",
         type: "string",
       },
+
+      robots: {
+        describe:
+          "If set, fetch and respect page disallows specified in per-host robots.txt",
+        type: "boolean",
+        default: false,
+      },
     });
   }
 
diff --git a/src/util/logger.ts b/src/util/logger.ts
index 7d10939ee..4842aa226 100644
--- a/src/util/logger.ts
+++ b/src/util/logger.ts
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
   "replay",
   "proxy",
   "scope",
+  "robots",
 ] as const;
 
 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
diff --git a/src/util/state.ts b/src/util/state.ts
index 9309116a5..9739b8a2f 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -200,7 +200,9 @@ export class RedisCrawlState {
   fkey: string;
   ekey: string;
   bkey: string;
+  rkey: string;
   pageskey: string;
+
   esKey: string;
   esMap: string;
 
@@ -233,6 +235,8 @@ export class RedisCrawlState {
     this.ekey = this.key + ":e";
     // crawler behavior script messages
     this.bkey = this.key + ":b";
+    // cached robots.txt bodies (per-origin)
+    this.rkey = this.key + ":r";
     // pages
     this.pageskey = this.key + ":pages";
 
@@ -1025,6 +1029,16 @@ return inx;
     return await this.redis.lpush(this.bkey, behaviorLog);
   }
 
+  async setCachedRobots(robotsUrl: string, body: string) {
+    const urlKey = `${this.rkey}:${robotsUrl}`;
+    return await this.redis.set(urlKey, body);
+  }
+
+  async getCachedRobots(robotsUrl: string) {
+    const urlKey = `${this.rkey}:${robotsUrl}`;
+    return await this.redis.get(urlKey);
+  }
+
   async writeToPagesQueue(
     data: Record,
   ) {

From bdcc828a38abd81836d860e2cf11d07762a279ab Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Mon, 29 Sep 2025 17:46:20 -0400
Subject: [PATCH 2/4] Implement LRU cache with limit of 100 robots.txt bodies

---
 src/crawler.ts        |  8 +++-----
 src/util/constants.ts |  2 ++
 src/util/state.ts     | 34 +++++++++++++++++++++++++++++-----
 3 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 488e266cf..84e1e7bae 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1289,11 +1289,9 @@ self.__bx_behaviors.selectMainBehavior();
     url: string,
     logDetails: LogDetails,
   ): Promise<Robot | null> {
-    // Fetch robots.txt for url's host and return parser, caching robots
-    // bodies in Redis by their URL
-    // TODO: Consider using an LRU cache/only cache so many robots responses
-    // in Redis at one time and re-fetch if no longer in cache to avoid
-    // exhausting memory on very large crawls across many hosts
+    // Fetch robots.txt for url's host and return parser.
+    // Results are cached by robots.txt URL in Redis using an LRU cache
+    // implementation that retains the 100 most recently used values.
     const urlParser = new URL(url);
     const robotsUrl = `${urlParser.origin}/robots.txt`;
 
diff --git a/src/util/constants.ts b/src/util/constants.ts
index 15b00bd70..ebf83c571 100644
--- a/src/util/constants.ts
+++ b/src/util/constants.ts
@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
 
+export const ROBOTS_CACHE_LIMIT = 100;
+
 export type ExtractSelector = {
   selector: string;
   extract: string;
diff --git a/src/util/state.ts b/src/util/state.ts
index 9739b8a2f..cffd32e46 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";
 
 import { logger } from "./logger.js";
 
-import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
+import {
+  MAX_DEPTH,
+  DEFAULT_MAX_RETRIES,
+  ROBOTS_CACHE_LIMIT,
+} from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename } from "./storage.js";
@@ -201,6 +205,7 @@ export class RedisCrawlState {
   ekey: string;
   bkey: string;
   rkey: string;
+  lkey: string;
   pageskey: string;
 
   esKey: string;
   esMap: string;
@@ -237,6 +242,8 @@ export class RedisCrawlState {
     this.bkey = this.key + ":b";
     // cached robots.txt bodies (per-origin)
     this.rkey = this.key + ":r";
+    // LRU cache of robots.txt keys
+    this.lkey = this.key + ":l";
     // pages
     this.pageskey = this.key + ":pages";
 
@@ -1029,14 +1036,31 @@ return inx;
     return await this.redis.lpush(this.bkey, behaviorLog);
   }
 
+  async _updateRobotsAccessTime(robotsUrl: string) {
+    const accessTime = Date.now();
+    await this.redis.zadd(this.lkey, accessTime, robotsUrl);
+  }
+
   async setCachedRobots(robotsUrl: string, body: string) {
-    const urlKey = `${this.rkey}:${robotsUrl}`;
-    return await this.redis.set(urlKey, body);
+    await this._updateRobotsAccessTime(robotsUrl);
+    await this.redis.set(`${this.rkey}:${robotsUrl}`, body);
+
+    // prune least-recently used items in zset and robots cache if over limit
+    const cacheCount = await this.redis.zcard(this.lkey);
+    if (cacheCount > ROBOTS_CACHE_LIMIT) {
+      const diff = cacheCount - ROBOTS_CACHE_LIMIT;
+      const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);
+
+      for (const keyToDelete of keysToDelete) {
+        await this.redis.del(`${this.rkey}:${keyToDelete}`);
+        await this.redis.zrem(this.lkey, keyToDelete);
+      }
+    }
   }
 
   async getCachedRobots(robotsUrl: string) {
-    const urlKey = `${this.rkey}:${robotsUrl}`;
-    return await this.redis.get(urlKey);
+    await this._updateRobotsAccessTime(robotsUrl);
+    return await this.redis.get(`${this.rkey}:${robotsUrl}`);
   }
 
   async writeToPagesQueue(

From f192e798c53e327042515516fc219879d6bec096 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 30 Sep 2025 10:42:57 -0400
Subject: [PATCH 3/4] Add debug log line for deleting cached robots

---
 src/util/state.ts | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/util/state.ts b/src/util/state.ts
index cffd32e46..43978de60 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -1052,6 +1052,11 @@ return inx;
       const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);
 
       for (const keyToDelete of keysToDelete) {
+        logger.debug(
+          "Deleting cached robots.txt, over cache limit",
+          { url: keyToDelete },
+          "robots",
+        );
         await this.redis.del(`${this.rkey}:${keyToDelete}`);
         await this.redis.zrem(this.lkey, keyToDelete);
       }

From 004c4ebd9b8a27f3ba0078247ad5ff22b48921e9 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 30 Sep 2025 11:08:00 -0400
Subject: [PATCH 4/4] Add tests for
 robots.txt being fetched and cached

Does not yet include testing that a page URL disallowed by robots is
not queued, as I haven't yet been able to find a Webrecorder-managed
site with a robots.txt with disallows to test against.
---
 tests/robots_txt.test.js | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 tests/robots_txt.test.js

diff --git a/tests/robots_txt.test.js b/tests/robots_txt.test.js
new file mode 100644
index 000000000..43ffe1975
--- /dev/null
+++ b/tests/robots_txt.test.js
@@ -0,0 +1,35 @@
+import child_process from "child_process";
+
+test("test robots.txt is fetched and cached", async () => {
+  const res = child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
+  );
+
+  const log = res.toString();
+
+  // robots.txt not found
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"robots","message":"Robots.txt not fetched","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
+    ) > 0,
+  ).toBe(true);
+
+  // robots.txt found and cached
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
+    ) > 0,
+  ).toBe(true);
+});
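
A rough sketch of the still-missing disallow test follows, for reference only; it is not part of the patches above. It assumes a placeholder fixture site (https://robots-fixture.example.com/) whose robots.txt disallows a path linked from the seed page, and it keys off the "Page URL not queued, disallowed by robots.txt" debug line added in PATCH 1/4.

// Hypothetical follow-up test (not included in the patch series).
// The seed URL is a placeholder and would need to be replaced with a
// Webrecorder-managed or locally served site whose robots.txt disallows
// a path that is linked from the seed page.
import child_process from "child_process";

test("test URL disallowed by robots.txt is not queued", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://robots-fixture.example.com/ --scopeType prefix --robots --logging debug",
  );

  const log = res.toString();

  // the disallowed link should be skipped at queueing time
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"links","message":"Page URL not queued, disallowed by robots.txt"',
    ) > 0,
  ).toBe(true);
});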