From dbb6a21b025f93791e2675bef707eb07dcb7ddb8 Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 04:39:35 -0500 Subject: [PATCH 01/14] Add VSCode workspace file in .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index c132d3b4..36664c9f 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ storage # any output from the crawler *.json pnpm-lock.yaml +# VS Code workspace files +*.code-workspace From 60ec1883756c7a838d95d347888e25dfc0d4b85d Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 04:42:09 -0500 Subject: [PATCH 02/14] Add outputs dir to .gitignore for final outputs --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 36664c9f..4daa0eb2 100644 --- a/.gitignore +++ b/.gitignore @@ -15,5 +15,9 @@ storage # any output from the crawler *.json pnpm-lock.yaml + +# Final ouputs folder +outputs + # VS Code workspace files *.code-workspace From 14eb9fab1e74d952ee8b48813b2fc00e51865a96 Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 04:44:53 -0500 Subject: [PATCH 03/14] Add Dynamic OutputFileName based on date-timestamp --- config.ts | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/config.ts b/config.ts index bc2d22e0..07a3bcf6 100644 --- a/config.ts +++ b/config.ts @@ -1,4 +1,25 @@ import { Config } from "./src/config"; +import { fileURLToPath } from 'url'; +import { dirname } from 'path'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +const starting_url = "https://www.builder.io/c/docs/developers"; +const url_prefix = "https://" +const domain = "www.builder.io"; +const url_suffix = "/c/docs"; +const base_url = url_prefix + domain; +const match_url_prefix = base_url + url_suffix; +const match_url = match_url_prefix + "/**"; + +// Now date stamp for output file name +const now = new Date(); +const date = now.toISOString().split('T')[0]; +const time = now.toTimeString().split(' ')[0]; +const outputs_dir = __dirname.split('/').slice(0, -1).join('/') + '/outputs'; + +const outputFileName = outputs_dir + "/" + domain + "-" + date + "-" + time + ".json"; export const defaultConfig: Config = { url: "https://www.builder.io/c/docs/developers", From e700f6e14db09e225bf594307dc59bcce3873c2a Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 04:57:47 -0500 Subject: [PATCH 04/14] Allow maxPagesToCrawl to be optional and infinite by setting 0 which will display the infinity symbol. 
Default is 50 --- README.md | 3 ++- src/config.ts | 3 ++- src/core.ts | 10 +++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 43bfe4c7..0111b472 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,8 @@ type Config = { match: string; /** Selector to grab the inner text from */ selector: string; - /** Don't crawl more than this many pages */ + + /** Optional - Don't crawl more than this many pages (0 = Crawl all, Default = 50)*/ maxPagesToCrawl: number; /** File name for the finished data */ outputFileName: string; diff --git a/src/config.ts b/src/config.ts index 7e5f5fbf..7ad3844e 100644 --- a/src/config.ts +++ b/src/config.ts @@ -26,10 +26,11 @@ export const configSchema = z.object({ */ selector: z.string().optional(), /** + * **Optional:** * Don't crawl more than this many pages * @default 50 */ - maxPagesToCrawl: z.number().int().positive(), + maxPagesToCrawl: z.number().int().nonnegative().or(z.undefined()).optional(), /** * File name for the finished data * @default "output.json" */ diff --git a/src/core.ts b/src/core.ts index 8e03bbe5..be010c6a 100644 --- a/src/core.ts +++ b/src/core.ts @@ -55,6 +55,12 @@ export async function crawl(config: Config) { const crawler = new PlaywrightCrawler({ // Use the requestHandler to process each of the crawled pages. async requestHandler({ request, page, enqueueLinks, log, pushData }) { + // Warn if unlimited crawling is enabled + if (config.maxPagesToCrawl == 0) { + config.maxPagesToCrawl = undefined; + log.warningOnce(`maxPagesToCrawl is set to ${config.maxPagesToCrawl} which means it will continue until it cannot find any more links defined by match: ${config.match}`); + } + if (config.cookie) { // Set the cookie for the specific URL const cookie = { @@ -66,9 +72,11 @@ export async function crawl(config: Config) { } const title = await page.title(); + // Display the pageCounter/maxPagesToCrawl number or pageCounter/∞ if maxPagesToCrawl=0 + const maxPagesToCrawlDisplay = config.maxPagesToCrawl == undefined ? "∞" : config.maxPagesToCrawl; pageCounter++; log.info( - `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, + `Crawling: Page ${pageCounter} / ${maxPagesToCrawlDisplay} - URL: ${request.loadedUrl}...` ); // Use custom handling for XPath selector From ac0ac25a3240bd57f392f8655d43af2436b93777 Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 05:06:59 -0500 Subject: [PATCH 05/14] Added maxConcurrency config to set maximum concurrent parallel requests.
--- README.md | 2 ++ config.ts | 2 +- src/config.ts | 7 +++++++ src/core.ts | 6 +----- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0111b472..8b08f997 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,8 @@ type Config = { maxFileSize?: number; /** Optional maximum number tokens to include in the output file */ maxTokens?: number; + /** Optional - Maximum concurent parellel requets at a time */ + maxConcurrency?: number; }; ``` diff --git a/config.ts b/config.ts index 07a3bcf6..dd658a6b 100644 --- a/config.ts +++ b/config.ts @@ -25,5 +25,5 @@ export const defaultConfig: Config = { url: "https://www.builder.io/c/docs/developers", match: "https://www.builder.io/c/docs/**", maxPagesToCrawl: 50, - outputFileName: "output.json", + maxConcurrency: 1, }; diff --git a/src/config.ts b/src/config.ts index 7ad3844e..848dcbeb 100644 --- a/src/config.ts +++ b/src/config.ts @@ -71,6 +71,13 @@ export const configSchema = z.object({ * @example 5000 */ maxTokens: z.number().int().positive().optional(), + /** + * **Optional:** + * maxConcurrency + * description: ( 0 = Unlimited, Doesn't stop until cancelled, undefined = max parellel requests possible ) + * @default 1 + * */ + maxConcurrency: z.number().int().nonnegative().optional(), }); export type Config = z.infer; diff --git a/src/core.ts b/src/core.ts index be010c6a..5eccb720 100644 --- a/src/core.ts +++ b/src/core.ts @@ -109,11 +109,7 @@ export async function crawl(config: Config) { globs: typeof config.match === "string" ? [config.match] : config.match, }); - }, - // Comment this option to scrape the full website. - maxRequestsPerCrawl: config.maxPagesToCrawl, - // Uncomment this option to see the browser window. - // headless: false, + maxConcurrency: config.maxConcurrency || 1 , // Set the max concurrency preNavigationHooks: [ // Abort requests for certain resource types async ({ page, log }) => { From a6b4b1f61119340b97dc0776d1cc45924e08b354 Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 05:09:41 -0500 Subject: [PATCH 06/14] Update to core.ts for maxPagesToCrawl --- src/core.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core.ts b/src/core.ts index 5eccb720..aee047df 100644 --- a/src/core.ts +++ b/src/core.ts @@ -110,6 +110,7 @@ export async function crawl(config: Config) { typeof config.match === "string" ? [config.match] : config.match, }); maxConcurrency: config.maxConcurrency || 1 , // Set the max concurrency + maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl or set to 0 to scrape the full website. preNavigationHooks: [ // Abort requests for certain resource types async ({ page, log }) => { From c6b63036a9615550b27eda09debeb746b569b7ee Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 05:15:17 -0500 Subject: [PATCH 07/14] Added headless mode as a configuration parameter --- README.md | 2 ++ config.ts | 1 + src/config.ts | 6 ++++++ src/core.ts | 1 + 4 files changed, 10 insertions(+) diff --git a/README.md b/README.md index 8b08f997..e4ff7767 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,8 @@ type Config = { maxTokens?: number; /** Optional - Maximum concurent parellel requets at a time */ maxConcurrency?: number; + /** Optional - Boolean parameter to use PlayWright with displayed browser or headless ( default headless=True ).
*/ + headless?: boolean; }; ``` diff --git a/config.ts b/config.ts index dd658a6b..53e66eda 100644 --- a/config.ts +++ b/config.ts @@ -25,5 +25,6 @@ export const defaultConfig: Config = { url: "https://www.builder.io/c/docs/developers", match: "https://www.builder.io/c/docs/**", maxPagesToCrawl: 50, + headless: true, maxConcurrency: 1, }; diff --git a/src/config.ts b/src/config.ts index 848dcbeb..25ea155a 100644 --- a/src/config.ts +++ b/src/config.ts @@ -71,6 +71,12 @@ export const configSchema = z.object({ * @example 5000 */ maxTokens: z.number().int().positive().optional(), + /** + * **Optional:** + * Headless mode + * @default true + */ + headless: z.boolean().optional(), /** * **Optional:** * maxConcurrency diff --git a/src/core.ts b/src/core.ts index aee047df..b789fee0 100644 --- a/src/core.ts +++ b/src/core.ts @@ -111,6 +111,7 @@ export async function crawl(config: Config) { }); maxConcurrency: config.maxConcurrency || 1 , // Set the max concurrency maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl or set to 0 to scrape the full website. + headless: config.headless ?? true, // Set to false to see the browser in action preNavigationHooks: [ // Abort requests for certain resource types async ({ page, log }) => { From b427b253951fb403fff7ccc3c763f6cc8db02134 Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 05:18:49 -0500 Subject: [PATCH 08/14] Added waitPerPageCrawlTimeoutRange for a random range in milliseconds between page requests to help with rate limiting --- README.md | 9 +++++++++ config.ts | 1 + src/config.ts | 9 +++++++++ src/core.ts | 22 ++++++++++++++++++++++ 4 files changed, 41 insertions(+) diff --git a/README.md b/README.md index e4ff7767..ce69a526 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,15 @@ type Config = { maxTokens?: number; /** Optional - Maximum concurent parellel requets at a time */ maxConcurrency?: number; + + /** Optional - waitPerPageCrawlTimeoutRange is a object containing a min and max each for the number of milliseconds to wait after each page crawl. + * Use waitPerPageCrawlTimeoutRange to handle rate limiting. + */ + waitPerPageCrawlTimeoutRange?: { + min: number, + max: number, + }; + /** Optional - Boolean parameter to use PlayWright with displayed browser or headless ( default headless=True ). 
*/ headless?: boolean; }; diff --git a/config.ts b/config.ts index 53e66eda..d1c7882f 100644 --- a/config.ts +++ b/config.ts @@ -25,6 +25,7 @@ export const defaultConfig: Config = { url: "https://www.builder.io/c/docs/developers", match: "https://www.builder.io/c/docs/**", maxPagesToCrawl: 50, + waitPerPageCrawlTimeoutRange: {min:1000, max:1000}, headless: true, maxConcurrency: 1, }; diff --git a/src/config.ts b/src/config.ts index 25ea155a..418c6227 100644 --- a/src/config.ts +++ b/src/config.ts @@ -71,6 +71,15 @@ export const configSchema = z.object({ * @example 5000 */ maxTokens: z.number().int().positive().optional(), + /** + * **Optional:** + * Range for random number of milliseconds between **min** and **max** to wait after each page crawl + * @default {min:1000,max:1000} + * */ + waitPerPageCrawlTimeoutRange: z.object({ + min: z.number().int().nonnegative(), + max: z.number().int().nonnegative(), + }).optional(), /** * **Optional:** * Headless mode diff --git a/src/core.ts b/src/core.ts index b789fee0..78b95bd0 100644 --- a/src/core.ts +++ b/src/core.ts @@ -47,6 +47,14 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) { } export async function crawl(config: Config) { + + // Function to delay the next crawl + function delay(time: number) { + return new Promise(function(resolve) { + setTimeout(resolve, time) + }); + } + configSchema.parse(config); if (process.env.NO_CRAWL !== "true") { @@ -109,6 +117,20 @@ export async function crawl(config: Config) { globs: typeof config.match === "string" ? [config.match] : config.match, }); + // Use waitPerPageCrawlTimeoutRange to handle rate limiting + if (config.waitPerPageCrawlTimeoutRange) { + // Create a random number between min and max + const randomTimeout = Math.floor(Math.random() * (config.waitPerPageCrawlTimeoutRange.max - config.waitPerPageCrawlTimeoutRange.min + 1) + config.waitPerPageCrawlTimeoutRange.min); + log.info( + `Waiting ${randomTimeout} milliseconds before next crawl to avoid rate limiting...` + ); + // Wait for the random amount of time before crawling the next page + await delay(randomTimeout); + }else{ + // Wait for 1 second before crawling the next page + await delay(1000); + } + }, maxConcurrency: config.maxConcurrency || 1 , // Set the max concurrency maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl or set to 0 to scrape the full website. headless: config.headless ?? true, // Set to false to see the browser in action From 35ea95bdd8f42d137a11b732a8249b4c72e722e3 Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 05:20:44 -0500 Subject: [PATCH 09/14] Added ts-ignore for docker config.ts to prevent VSCode from declaring missing file that isn't created until the Docker is. --- containerapp/data/config.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/containerapp/data/config.ts b/containerapp/data/config.ts index eb923667..45b81fd7 100644 --- a/containerapp/data/config.ts +++ b/containerapp/data/config.ts @@ -1,3 +1,4 @@ +// @ts-ignore import { Config } from "./src/config"; export const defaultConfig: Config = { From a996ab125b4af4bb332beea6f4159a728b77ce26 Mon Sep 17 00:00:00 2001 From: "Charles P. 
Cross" Date: Mon, 4 Dec 2023 05:25:00 -0500 Subject: [PATCH 10/14] Added Output Directory for all outputFileName to go into so they aren't overwritten in storage --- config.ts | 1 + containerapp/Dockerfile | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/config.ts b/config.ts index d1c7882f..c30d803a 100644 --- a/config.ts +++ b/config.ts @@ -25,6 +25,7 @@ export const defaultConfig: Config = { url: "https://www.builder.io/c/docs/developers", match: "https://www.builder.io/c/docs/**", maxPagesToCrawl: 50, + outputFileName: outputFileName, waitPerPageCrawlTimeoutRange: {min:1000, max:1000}, headless: true, maxConcurrency: 1, diff --git a/containerapp/Dockerfile b/containerapp/Dockerfile index 876a9a10..1edbf8cf 100644 --- a/containerapp/Dockerfile +++ b/containerapp/Dockerfile @@ -28,8 +28,9 @@ RUN cd /home && git clone https://github.com/builderio/gpt-crawler && cd gpt-cra npx playwright install && \ npx playwright install-deps -# Directory to mount in the docker container to get the output.json data +# Directories to mount in the docker container to get the output json data RUN cd /home && mkdir data - +# Final output directory +RUN cd /home && mkdir outputs WORKDIR /home \ No newline at end of file From 33faaf5a89223cf2e64493406b39866fc5368ad0 Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 05:27:06 -0500 Subject: [PATCH 11/14] Additions to dynamic url and match configurations in config.ts --- config.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.ts b/config.ts index c30d803a..e6df4001 100644 --- a/config.ts +++ b/config.ts @@ -22,8 +22,8 @@ const outputs_dir = __dirname.split('/').slice(0, -1).join('/') + '/outputs'; const outputFileName = outputs_dir + "/" + domain + "-" + date + "-" + time + ".json"; export const defaultConfig: Config = { - url: "https://www.builder.io/c/docs/developers", - match: "https://www.builder.io/c/docs/**", + url: starting_url, + match: match_url, maxPagesToCrawl: 50, outputFileName: outputFileName, waitPerPageCrawlTimeoutRange: {min:1000, max:1000}, From 83a5b0cd42d37df6f2e7f74e43cbf6bc6797bbec Mon Sep 17 00:00:00 2001 From: "Charles P. 
Cross" Date: Mon, 4 Dec 2023 05:36:39 -0500 Subject: [PATCH 12/14] Added waitForSelectorTimeout to README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ce69a526..241391f7 100644 --- a/README.md +++ b/README.md @@ -79,8 +79,10 @@ type Config = { maxPagesToCrawl: number; /** File name for the finished data */ outputFileName: string; - /** Optional resources to exclude - * + + /** Optional - Timeout for waiting for a selector to appear */ + waitForSelectorTimeout: number; + * @example * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab'] */ From 401fa9bf55599c5bea2c3a6e27bb113223771630 Mon Sep 17 00:00:00 2001 From: "Charles P. Cross" Date: Mon, 4 Dec 2023 06:45:57 -0500 Subject: [PATCH 13/14] Adding details to README.md and config.ts as well as extra formatting. --- README.md | 16 +++++++++++----- src/config.ts | 29 +++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 241391f7..a05d751d 100644 --- a/README.md +++ b/README.md @@ -64,25 +64,31 @@ export const defaultConfig: Config = { }; ``` -See [config.ts](src/config.ts) for all available options. Here is a sample of the common configu options: +See [config.ts](src/config.ts) for all available options. 
Here is a sample of the common config options: ```ts type Config = { - /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */ + + /** Required - URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */ url: string; - /** Pattern to match against for links on a page to subsequently crawl */ + + /** Required - Pattern to match against for links on a page to subsequently crawl */ match: string; - /** Selector to grab the inner text from */ + + /** Optional - Selector to grab the inner text from */ selector: string; /** Optional - Don't crawl more than this many pages (0 = Crawl all, Default = 50)*/ maxPagesToCrawl: number; - /** File name for the finished data */ + + /** Optional - File name for the finished data */ outputFileName: string; /** Optional - Timeout for waiting for a selector to appear */ waitForSelectorTimeout: number; + /** Optional - Resource file extensions to exclude from crawl + * * @example * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab'] */ diff --git a/src/config.ts b/src/config.ts index 418c6227..2d6bd45b 100644 --- a/src/config.ts +++ b/src/config.ts @@ -6,6 +6,7 @@ const Page: z.ZodType = z.any(); export const configSchema = z.object({ /** + * **Required:** * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap * @example "https://www.builder.io/c/docs/developers" * @example "https://www.builder.io/sitemap.xml" @@ -13,6 +14,7 @@ export const configSchema = z.object({ */ url: z.string(), /** + * **Required:** * Pattern to match against for links on a page to subsequently crawl * @example "https://www.builder.io/c/docs/**" * @default "" @@ -20,6 +22,7 @@ export const configSchema = z.object({ match: z.string().or(z.array(z.string())), /** + * **Optional:** * Selector to grab the inner text from * @example ".docs-builder-container" * @default "" @@ -32,18 +35,25 @@ export const configSchema = z.object({ */ maxPagesToCrawl: z.number().int().nonnegative().or(z.undefined()).optional(), /** + * **Optional:** * File name for the finished data * @default "output.json" */ outputFileName: z.string(), - /** Optional cookie to be set. E.g. for Cookie Consent */ + /** + * **Optional:** + * Cookie to be set. E.g. 
for Cookie Consent + * */ cookie: z .object({ name: z.string(), value: z.string(), }) .optional(), - /** Optional function to run for each page found */ + /** + * **Optional:** + * Function to run for each page found + * */ onVisitPage: z .function() .args( @@ -56,18 +66,25 @@ export const configSchema = z.object({ .optional(), /** Optional timeout for waiting for a selector to appear */ waitForSelectorTimeout: z.number().int().nonnegative().optional(), - /** Optional resources to exclude + /** + * **Optional:** + * Resources to exclude * * @example * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab'] */ resourceExclusions: z.array(z.string()).optional(), - - /** Optional maximum file size in megabytes to include in the output file + + /** + * **Optional:** + * Maximum file size in megabytes to include in the output file * @example 1 */ maxFileSize: z.number().int().positive().optional(), - /** Optional maximum number tokens to include in the output file + + /** + * **Optional:** + * The maximum number tokens to include in the output file * @example 5000 */ maxTokens: z.number().int().positive().optional(), From 62521b7294d9e06730395394a54f081295e01784 Mon Sep 17 00:00:00 2001 From: "Charles P. 
Cross" Date: Wed, 6 Dec 2023 18:47:25 -0500 Subject: [PATCH 14/14] Update prettier formatting for README.md, src/config.ts, src/core.ts, and config.ts and formatting for jsdoc/typedoc as recommened by @marcelovicentegc in pull request #102, added .prettierignore file --- .prettierignore | 30 ++++++++++++ README.md | 126 ++++++++++++++++++++++++++++++++++++++---------- config.ts | 33 +++++++------ src/config.ts | 94 ++++++++++++++++++------------------ src/core.ts | 32 +++++++----- 5 files changed, 215 insertions(+), 100 deletions(-) create mode 100644 .prettierignore diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 00000000..fe827dbd --- /dev/null +++ b/.prettierignore @@ -0,0 +1,30 @@ +# Ignore artifacts + +node_modules +.github +storage +outputs +*.code-workspace + +## This file tells which files shouldn't be added to source control + +.idea +dist +node_modules +apify_storage +crawlee_storage +storage +.DS_Store + +## any output from the crawler + +*.json +pnpm-lock.yaml + +## Final ouputs folder + +outputs + +## VS Code workspace files + +*.code-workspace diff --git a/README.md b/README.md index a05d751d..33c67488 100644 --- a/README.md +++ b/README.md @@ -66,52 +66,110 @@ export const defaultConfig: Config = { See [config.ts](src/config.ts) for all available options. Here is a sample of the common config options: -```ts +````ts type Config = { - - /** Required - URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */ + /** + * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap + * @example "https://www.builder.io/c/docs/developers" + * @example "https://www.builder.io/sitemap.xml" + * @default "" + * @required + */ url: string; - - /** Required - Pattern to match against for links on a page to subsequently crawl */ + /** + * Pattern to match against for links on a page to subsequently crawl + * @example "https://www.builder.io/c/docs/**" + * @default "" + */ match: string; - - /** Optional - Selector to grab the inner text from */ + /** + * Selector to grab the inner text from + * @example ".docs-builder-container" + * @default "" + * @required + */ selector: string; - - /** Optional - Don't crawl more than this many pages (0 = Crawl all, Default = 50)*/ + /** + * Don't crawl more than this many pages + * @default 50 + */ maxPagesToCrawl: number; - - /** Optional - File name for the finished data */ + /** + * File name for the finished data + * @example "output.json" + */ outputFileName: string; - - /** Optional - Timeout for waiting for a selector to appear */ - waitForSelectorTimeout: number; - - /** Optional - Resource file extensions to exclude from crawl - * + /** + * Cookie to be set. E.g. 
for Cookie Consent + */ + cookie?: { + name: string, + value: string, + url: string, + }; + /** + * Function to run for each page found + */ + onVisitPage?: (page: object, data: string); + /** + * Timeout to wait for a selector to appear + */ + waitForSelectorTimeout: object; + /** + * Resource file extensions to exclude from crawl * @example * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab'] */ resourceExclusions?: string[]; - /** Optional maximum file size in megabytes to include in the output file */ + /** + * Maximum file size in megabytes to include in the output file + * @example 1 + */ maxFileSize?: number; - /** Optional maximum number tokens to include in the output file */ + /** + * The maximum number tokens to include in the output file + * @example 5000 + */ maxTokens?: number; - /** Optional - Maximum concurent parellel requets at a time */ + /** + * Maximum concurent parellel requets at a time Maximum concurent parellel requets at a time + * @example + * Specific number of parellel requests + * ```ts + * maxConcurrency: 2; + * ``` + * @example + * 0 = Unlimited, Doesn't stop until cancelled + * text outside of the code block as regular text. + * ```ts + * maxConcurrency: 0; + * ``` + * @example + * undefined = max parellel requests possible + * ```ts + * maxConcurrency: undefined; + * ``` + * @default 1 + */ maxConcurrency?: number; - - /** Optional - waitPerPageCrawlTimeoutRange is a object containing a min and max each for the number of milliseconds to wait after each page crawl. - * Use waitPerPageCrawlTimeoutRange to handle rate limiting. - */ + /** + * Range for random number of milliseconds between **min** and **max** to wait after each page crawl + * @default {min:1000,max:1000} + * @example {min:1000, max:2000} + */ waitPerPageCrawlTimeoutRange?: { - min: number, + min: number, max: number, }; /** Optional - Boolean parameter to use PlayWright with displayed browser or headless ( default headless=True ). */ + /** + * Headless mode + * @default true + */ headless?: boolean; }; -``` +```` #### Run your crawler @@ -125,6 +183,22 @@ npm start To obtain the `output.json` with a containerized execution. Go into the `containerapp` directory. Modify the `config.ts` same as above, the `output.json`file should be generated in the data folder. Note : the `outputFileName` property in the `config.ts` file in containerapp folder is configured to work with the container. 
+#### [Running as a CLI](#running-as-a-cli) + +To run the `./dist/cli.ts` command line interface, follow these instructions: + +1. Open a terminal. +2. Navigate to the root directory of the project. +3. Run the following command: `./dist/cli.ts [arguments]` + Replace `[arguments]` with the appropriate command line arguments for your use case. +4. The CLI will execute the specified command and display the output in the terminal. + +> Note: Make sure you have the necessary dependencies installed and the project has been built before running the CLI. + +#### [Development](#development) + +> Instructions for Development will go here... + ### Upload your data to OpenAI The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT. diff --git a/config.ts b/config.ts index e6df4001..e289244f 100644 --- a/config.ts +++ b/config.ts @@ -1,32 +1,33 @@ import { Config } from "./src/config"; -import { fileURLToPath } from 'url'; -import { dirname } from 'path'; +import { fileURLToPath } from "url"; +import { dirname } from "path"; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); -const starting_url = "https://www.builder.io/c/docs/developers"; -const url_prefix = "https://" +const startingUrl = "https://www.builder.io/c/docs/developers"; +const urlPrefix = "https://"; const domain = "www.builder.io"; -const url_suffix = "/c/docs"; -const base_url = url_prefix + domain; -const match_url_prefix = base_url + url_suffix; -const match_url = match_url_prefix + "/**"; +const urlSuffix = "/c/docs"; +const baseUrl = urlPrefix + domain; +const matchUrl_prefix = baseUrl + urlSuffix; +const matchUrl = matchUrl_prefix + "/**"; // Now date stamp for output file name const now = new Date(); -const date = now.toISOString().split('T')[0]; -const time = now.toTimeString().split(' ')[0]; -const outputs_dir = __dirname.split('/').slice(0, -1).join('/') + '/outputs'; +const date = now.toISOString().split("T")[0]; +const time = now.toTimeString().split(" ")[0]; +const outputs_dir = __dirname.split("/").slice(0, -1).join("/") + "/outputs"; -const outputFileName = outputs_dir + "/" + domain + "-" + date + "-" + time + ".json"; +const outputFileName = + outputs_dir + "/" + domain + "-" + date + "-" + time + ".json"; export const defaultConfig: Config = { - url: starting_url, - match: match_url, + url: startingUrl, + match: matchUrl, maxPagesToCrawl: 50, - outputFileName: outputFileName, - waitPerPageCrawlTimeoutRange: {min:1000, max:1000}, + outputFileName: outputFileName, + waitPerPageCrawlTimeoutRange: { min: 1000, max: 1000 }, headless: true, maxConcurrency: 1, }; diff --git a/src/config.ts b/src/config.ts index 2d6bd45b..d5417738 100644 --- a/src/config.ts +++ b/src/config.ts @@ -6,54 +6,48 @@ const Page: z.ZodType = z.any(); export const configSchema = z.object({ /** - * **Required:** * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap * @example "https://www.builder.io/c/docs/developers" * @example "https://www.builder.io/sitemap.xml" * @default "" + * @required */ url: z.string(), /** - * **Required:** * Pattern to match against for links on a page to subsequently crawl * @example "https://www.builder.io/c/docs/**" * @default "" + * @required */ match: z.string().or(z.array(z.string())), - /** - * **Optional:** * Selector to grab the inner text from * @example ".docs-builder-container" * @default "" */ 
selector: z.string().optional(), /** - * **Optional:** * Don't crawl more than this many pages * @default 50 */ maxPagesToCrawl: z.number().int().nonnegative().or(z.undefined()).optional(), /** - * **Optional:** * File name for the finished data - * @default "output.json" + * @example "output.json" */ outputFileName: z.string(), - /** - * **Optional:** - * Cookie to be set. E.g. for Cookie Consent - * */ + /** + * Cookie to be set. E.g. for Cookie Consent + */ cookie: z .object({ name: z.string(), value: z.string(), }) .optional(), - /** - * **Optional:** - * Function to run for each page found - * */ + /** + * Function to run for each page found + */ onVisitPage: z .function() .args( z.object({ page: Page, data: z.string(), }), ) .returns(z.promise(z.void())) .optional(), - /** Optional timeout for waiting for a selector to appear */ - waitForSelectorTimeout: z.number().int().nonnegative().optional(), - /** - * **Optional:** - * Resources to exclude - * + /** + * Resources to exclude * @example * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab'] */ resourceExclusions: z.array(z.string()).optional(), - - /** - * **Optional:** + /** * Maximum file size in megabytes to include in the output file * @example 1 */ maxFileSize: z.number().int().positive().optional(), - - /** - * **Optional:** + /** * The maximum number tokens to include in the output file * @example 5000 */ maxTokens: z.number().int().positive().optional(), /** * Maximum concurrent parallel requests at a time * @example * Specific number of parallel requests * ```ts * maxConcurrency: 2; * ``` * @example * 0 = Unlimited, Doesn't stop until cancelled
+ * ```ts + * maxConcurrency: 0; + * ``` + * @example + * undefined = max parellel requests possible + * ```ts + * maxConcurrency: undefined; + * ``` + * @default 1 + */ + maxConcurrency: z.number().int().nonnegative().optional(), + /** + * Range for random number of milliseconds between **min** and **max** to wait after each page crawl * @default {min:1000,max:1000} - * */ - waitPerPageCrawlTimeoutRange: z.object({ + * @example {min:1000,max:2000} + */ + waitForSelectorTimeout: z.number().int().nonnegative().optional(), + waitPerPageCrawlTimeoutRange: z + .object({ min: z.number().int().nonnegative(), max: z.number().int().nonnegative(), - }).optional(), - /** - * **Optional:** - * Headless mode - * @default true - */ - headless: z.boolean().optional(), - /** - * **Optional:** - * maxConcurrency - * description: ( 0 = Unlimited, Doesn't stop until cancelled, undefined = max parellel requests possible ) - * @default 1 - * */ - maxConcurrency: z.number().int().nonnegative().optional(), + }) + .optional(), + /** + * Headless mode + * @default true + */ + headless: z.boolean().optional(), }); export type Config = z.infer; diff --git a/src/core.ts b/src/core.ts index 78b95bd0..7741c108 100644 --- a/src/core.ts +++ b/src/core.ts @@ -47,11 +47,10 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) { } export async function crawl(config: Config) { - // Function to delay the next crawl function delay(time: number) { - return new Promise(function(resolve) { - setTimeout(resolve, time) + return new Promise(function (resolve) { + setTimeout(resolve, time); }); } @@ -66,9 +65,11 @@ export async function crawl(config: Config) { // Warn if unlimited crawling is enabled if (config.maxPagesToCrawl == 0) { config.maxPagesToCrawl = undefined; - log.warningOnce(`maxPagesToCrawl is set to ${config.maxPagesToCrawl} which means it will contine until it cannot find anymore links defined by match: ${config.match}`); + log.warningOnce( + `maxPagesToCrawl is set to ${config.maxPagesToCrawl} which means it will contine until it cannot find anymore links defined by match: ${config.match}`, + ); } - + if (config.cookie) { // Set the cookie for the specific URL const cookie = { @@ -81,10 +82,11 @@ export async function crawl(config: Config) { const title = await page.title(); // Display the pageCounter/maxPagesToCrawl number or pageCounter/∞ if maxPagesToCrawl=0 - const maxPagesToCrawlDisplay = config.maxPagesToCrawl == undefined ? "∞" : config.maxPagesToCrawl; + const maxPagesToCrawlDisplay = + config.maxPagesToCrawl == undefined ? 
"∞" : config.maxPagesToCrawl; pageCounter++; log.info( - `Crawling: Page ${pageCounter} / ${maxPagesToCrawlDisplay} - URL: ${request.loadedUrl}...` + `Crawling: Page ${pageCounter} / ${maxPagesToCrawlDisplay} - URL: ${request.loadedUrl}...`, ); // Use custom handling for XPath selector @@ -120,19 +122,25 @@ export async function crawl(config: Config) { // Use waitPerPageCrawlTimeoutRange to handle rate limiting if (config.waitPerPageCrawlTimeoutRange) { // Create a random number between min and max - const randomTimeout = Math.floor(Math.random() * (config.waitPerPageCrawlTimeoutRange.max - config.waitPerPageCrawlTimeoutRange.min + 1) + config.waitPerPageCrawlTimeoutRange.min); + const randomTimeout = Math.floor( + Math.random() * + (config.waitPerPageCrawlTimeoutRange.max - + config.waitPerPageCrawlTimeoutRange.min + + 1) + + config.waitPerPageCrawlTimeoutRange.min, + ); log.info( - `Waiting ${randomTimeout} milliseconds before next crawl to avoid rate limiting...` + `Waiting ${randomTimeout} milliseconds before next crawl to avoid rate limiting...`, ); // Wait for the random amount of time before crawling the next page await delay(randomTimeout); - }else{ + } else { // Wait for 1 second before crawling the next page await delay(1000); } }, - maxConcurrency: config.maxConcurrency || 1 , // Set the max concurrency - maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl or set to 0 to scrape the full website. + maxConcurrency: config.maxConcurrency || 1, // Set the max concurrency + maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl or set to 0 to scrape the full website. headless: config.headless ?? true, // Set to false to see the browser in action preNavigationHooks: [ // Abort requests for certain resource types