From e42c848c0431edf7b4127b887b19581d1f21114e Mon Sep 17 00:00:00 2001 From: enitrat Date: Sun, 4 Jan 2026 10:43:30 +0000 Subject: [PATCH 1/7] feat(ingester): crawl Starknet blog 2025-2026 --- .../__tests__/StarknetBlogIngester.test.ts | 228 ++++++ ingesters/bun.lock | 44 + ingesters/config/sources.json | 4 +- ingesters/package.json | 2 + .../src/ingesters/StarknetBlogIngester.ts | 753 +++++++++++++++--- .../ingestion/web_targets.py | 8 +- python/src/scripts/ingest.py | 8 +- 7 files changed, 934 insertions(+), 113 deletions(-) create mode 100644 ingesters/__tests__/StarknetBlogIngester.test.ts diff --git a/ingesters/__tests__/StarknetBlogIngester.test.ts b/ingesters/__tests__/StarknetBlogIngester.test.ts new file mode 100644 index 00000000..606b3a40 --- /dev/null +++ b/ingesters/__tests__/StarknetBlogIngester.test.ts @@ -0,0 +1,228 @@ +import axios from 'axios'; +import { beforeEach, afterEach, describe, expect, it, vi } from 'bun:test'; +import { StarknetBlogIngester } from '../src/ingesters/StarknetBlogIngester'; +import { type BookPageDto } from '../src/utils/types'; + +const BASE_URL = 'https://www.starknet.io/blog'; +const SITEMAP_URL = 'https://www.starknet.io/sitemap.xml'; + +type MockResponse = { + status: number; + data: string; + headers: Record; +}; + +class TestStarknetBlogIngester extends StarknetBlogIngester { + public exposedDownloadAndExtractDocs(): Promise { + return this.downloadAndExtractDocs(); + } + + public exposedCreateChunks( + pages: BookPageDto[], + ): ReturnType { + return this.createChunks(pages); + } +} + +const buildHtml = (options: { + title: string; + metaDate?: string; + timeDate?: string; + bodyText?: string; +}): string => { + const { title, metaDate, timeDate, bodyText } = options; + return ` + + + ${title} + ${metaDate ? `` : ''} + + +
+

${title}

+ ${timeDate ? `` : ''} +

${bodyText ?? 'Body content goes here.'}

+

Join our newsletter

+

Subscribe for updates.

+

May also interest you

+

Other posts.

+
+ +`; +}; + +const buildSitemap = (urls: string[]): string => ` + +${urls.map((url) => ` ${url}`).join('\n')} +`; + +const mockAxiosGet = (responses: Map) => { + return vi.spyOn(axios, 'get').mockImplementation(async (url) => { + const key = typeof url === 'string' ? url : url.toString(); + const response = responses.get(key); + if (response) { + return response as any; + } + + return { + status: 404, + data: '', + headers: { 'content-type': 'text/html' }, + } as any; + }); +}; + +describe('StarknetBlogIngester (crawler)', () => { + beforeEach(() => { + vi.restoreAllMocks(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('filters to 2025/2026 posts and strips boilerplate sections', async () => { + const sitemap = buildSitemap([ + 'https://starknet.io/blog/post-2025', + 'https://www.starknet.io/blog/post-2026', + 'https://www.starknet.io/blog/post-2024', + 'https://www.starknet.io/blog', + 'https://www.starknet.io/blog/tag/something', + ]); + + const responses = new Map([ + [ + SITEMAP_URL, + { + status: 200, + data: sitemap, + headers: { 'content-type': 'application/xml' }, + }, + ], + [ + 'https://www.starknet.io/blog/post-2025', + { + status: 200, + data: buildHtml({ + title: 'Post 2025', + metaDate: '2025-05-01T00:00:00Z', + }), + headers: { 'content-type': 'text/html' }, + }, + ], + [ + 'https://www.starknet.io/blog/post-2026', + { + status: 200, + data: buildHtml({ + title: 'Post 2026', + metaDate: '2026-06-01T00:00:00Z', + }), + headers: { 'content-type': 'text/html' }, + }, + ], + [ + 'https://www.starknet.io/blog/post-2024', + { + status: 200, + data: buildHtml({ + title: 'Post 2024', + metaDate: '2024-03-01T00:00:00Z', + }), + headers: { 'content-type': 'text/html' }, + }, + ], + ]); + + mockAxiosGet(responses); + + const ingester = new TestStarknetBlogIngester(); + const pages = await ingester.exposedDownloadAndExtractDocs(); + + expect(pages.map((page) => page.name).sort()).toEqual([ + 'post-2025', + 'post-2026', + ]); + + pages.forEach((page) => { + expect(page.content).not.toContain('Join our newsletter'); + expect(page.content).not.toContain('May also interest you'); + expect(page.content.startsWith('# ')).toBe(true); + }); + }); + + it.each([ + { + label: 'meta tag', + html: buildHtml({ + title: 'Meta Date', + metaDate: '2025-02-10T00:00:00Z', + }), + }, + { + label: 'time element', + html: buildHtml({ + title: 'Time Date', + timeDate: '2026-04-12T00:00:00Z', + }), + }, + { + label: 'markdown text', + html: buildHtml({ + title: 'Text Date', + bodyText: 'Apr 3, 2025 · 3 min read', + }), + }, + ])('includes posts when year is detected via $label', async ({ html }) => { + const sitemap = buildSitemap(['https://www.starknet.io/blog/year-test']); + const responses = new Map([ + [ + SITEMAP_URL, + { + status: 200, + data: sitemap, + headers: { 'content-type': 'application/xml' }, + }, + ], + [ + 'https://www.starknet.io/blog/year-test', + { + status: 200, + data: html, + headers: { 'content-type': 'text/html' }, + }, + ], + ]); + + mockAxiosGet(responses); + + const ingester = new TestStarknetBlogIngester(); + const pages = await ingester.exposedDownloadAndExtractDocs(); + + expect(pages).toHaveLength(1); + expect(pages[0]?.name).toBe('year-test'); + }); + + it('creates chunks with page-scoped source links and stable IDs', async () => { + const ingester = new TestStarknetBlogIngester(); + const pages: BookPageDto[] = [ + { + name: 'posts/2025/hello-world', + content: '# Hello World\n\nSome content here.', + }, + ]; + + const chunks = await ingester.exposedCreateChunks(pages); + + expect(chunks.length).toBeGreaterThan(0); + chunks.forEach((chunk) => { + expect(chunk.metadata.name).toBe('posts/2025/hello-world'); + expect(chunk.metadata.sourceLink).toBe( + 'https://www.starknet.io/blog/posts/2025/hello-world', + ); + expect(chunk.metadata.uniqueId.startsWith('starknet-blog-posts-2025-hello-world-')).toBe( + true, + ); + }); + }); +}); diff --git a/ingesters/bun.lock b/ingesters/bun.lock index eb47321b..d4aa78e2 100644 --- a/ingesters/bun.lock +++ b/ingesters/bun.lock @@ -15,9 +15,11 @@ "adm-zip": "^0.5.16", "asciidoctor": "^3.0.4", "axios": "^1.7.9", + "cheerio": "^1.0.0-rc.12", "dotenv": "^16.4.7", "downdoc": "1.0.2-stable", "lunr": "^2.3.9", + "node-html-markdown": "^2.0.0", "pg": "^8.14.1", "winston": "^3.17.0", }, @@ -150,6 +152,8 @@ "binary-search": ["binary-search@1.3.6", "", {}, "sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA=="], + "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], + "brace-expansion": ["brace-expansion@2.0.2", "", { "dependencies": { "balanced-match": "^1.0.0" } }, "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ=="], "braces": ["braces@3.0.3", "", { "dependencies": { "fill-range": "^7.1.1" } }, "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA=="], @@ -166,6 +170,10 @@ "character-parser": ["character-parser@2.2.0", "", { "dependencies": { "is-regex": "^1.0.3" } }, "sha512-+UqJQjFEFaTAs3bNsF2j2kEN1baG/zghZbdqoYEDxGZtJo9LBzl1A+m0D4n3qKx8N2FNv8/Xp6yV9mQmBuptaw=="], + "cheerio": ["cheerio@1.1.2", "", { "dependencies": { "cheerio-select": "^2.1.0", "dom-serializer": "^2.0.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "encoding-sniffer": "^0.2.1", "htmlparser2": "^10.0.0", "parse5": "^7.3.0", "parse5-htmlparser2-tree-adapter": "^7.1.0", "parse5-parser-stream": "^7.1.2", "undici": "^7.12.0", "whatwg-mimetype": "^4.0.0" } }, "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg=="], + + "cheerio-select": ["cheerio-select@2.1.0", "", { "dependencies": { "boolbase": "^1.0.0", "css-select": "^5.1.0", "css-what": "^6.1.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1" } }, "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g=="], + "ci-info": ["ci-info@4.3.0", "", {}, "sha512-l+2bNRMiQgcfILUi33labAZYIWlH1kWDp+ecNo5iisRKrbm0xcRyCww71/YU0Fkw0mAFpz9bJayXPjey6vkmaQ=="], "cliui": ["cliui@7.0.4", "", { "dependencies": { "string-width": "^4.2.0", "strip-ansi": "^6.0.0", "wrap-ansi": "^7.0.0" } }, "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ=="], @@ -186,6 +194,10 @@ "constantinople": ["constantinople@4.0.1", "", { "dependencies": { "@babel/parser": "^7.6.0", "@babel/types": "^7.6.1" } }, "sha512-vCrqcSIq4//Gx74TXXCGnHpulY1dskqLTFGDmhrGxzeXL8lF8kvXv6mpNWlJj1uD4DW23D4ljAqbY4RRaaUZIw=="], + "css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="], + + "css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="], + "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="], "decamelize": ["decamelize@1.2.0", "", {}, "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA=="], @@ -214,6 +226,8 @@ "enabled": ["enabled@2.0.0", "", {}, "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ=="], + "encoding-sniffer": ["encoding-sniffer@0.2.1", "", { "dependencies": { "iconv-lite": "^0.6.3", "whatwg-encoding": "^3.1.1" } }, "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw=="], + "entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], @@ -276,10 +290,14 @@ "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], + "he": ["he@1.2.0", "", { "bin": { "he": "bin/he" } }, "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw=="], + "htmlparser2": ["htmlparser2@9.1.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.1.0", "entities": "^4.5.0" } }, "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ=="], "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="], + "iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], + "inflight": ["inflight@1.0.6", "", { "dependencies": { "once": "^1.3.0", "wrappy": "1" } }, "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA=="], "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="], @@ -364,6 +382,12 @@ "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="], + "node-html-markdown": ["node-html-markdown@2.0.0", "", { "dependencies": { "node-html-parser": "^6.1.13" } }, "sha512-DqUC3GGP7pwSYxS93SwHoP+qCw78xcMP6C6H2DuC8rPD2AweJRjBzQb5SdXpKtDlqAQ7hVotJcfhgU7hU5Gthw=="], + + "node-html-parser": ["node-html-parser@6.1.13", "", { "dependencies": { "css-select": "^5.1.0", "he": "1.2.0" } }, "sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg=="], + + "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], + "num-sort": ["num-sort@2.1.0", "", {}, "sha512-1MQz1Ed8z2yckoBeSfkQHHO9K1yDRxxtotKSJ9yvcTUUxSvfvzEq5GwBrjjHEpMlq/k5gvXdmJ1SbYxWtpNoVg=="], "nunjucks": ["nunjucks@3.2.4", "", { "dependencies": { "a-sync-waterfall": "^1.0.0", "asap": "^2.0.3", "commander": "^5.1.0" }, "peerDependencies": { "chokidar": "^3.3.0" }, "optionalPeers": ["chokidar"], "bin": { "nunjucks-precompile": "bin/precompile" } }, "sha512-26XRV6BhkgK0VOxfbU5cQI+ICFUtMLixv1noZn1tGU38kQH5A5nmmbk/O45xdyBhD1esk47nKrY0mvQpZIhRjQ=="], @@ -384,6 +408,12 @@ "p-timeout": ["p-timeout@3.2.0", "", { "dependencies": { "p-finally": "^1.0.0" } }, "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg=="], + "parse5": ["parse5@7.3.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw=="], + + "parse5-htmlparser2-tree-adapter": ["parse5-htmlparser2-tree-adapter@7.1.0", "", { "dependencies": { "domhandler": "^5.0.3", "parse5": "^7.0.0" } }, "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g=="], + + "parse5-parser-stream": ["parse5-parser-stream@7.1.2", "", { "dependencies": { "parse5": "^7.0.0" } }, "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow=="], + "path-parse": ["path-parse@1.0.7", "", {}, "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw=="], "pg": ["pg@8.16.3", "", { "dependencies": { "pg-connection-string": "^2.9.1", "pg-pool": "^3.10.1", "pg-protocol": "^1.10.3", "pg-types": "2.2.0", "pgpass": "1.0.5" }, "optionalDependencies": { "pg-cloudflare": "^1.2.7" }, "peerDependencies": { "pg-native": ">=3.0.1" }, "optionalPeers": ["pg-native"] }, "sha512-enxc1h0jA/aq5oSDMvqyW3q89ra6XIIDZgCX9vkMrnz5DFTw/Ny3Li2lFQ+pt3L6MCgm/5o2o8HW9hiJji+xvw=="], @@ -460,6 +490,8 @@ "safe-stable-stringify": ["safe-stable-stringify@2.5.0", "", {}, "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA=="], + "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="], + "semver": ["semver@7.7.2", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA=="], "simple-wcswidth": ["simple-wcswidth@1.1.2", "", {}, "sha512-j7piyCjAeTDSjzTSQ7DokZtMNwNlEAyxqSZeCS+CXH7fJ4jx3FuJ/mTW3mE+6JLs4VJBbcll0Kjn+KXI5t21Iw=="], @@ -498,6 +530,8 @@ "uglify-js": ["uglify-js@3.19.3", "", { "bin": { "uglifyjs": "bin/uglifyjs" } }, "sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ=="], + "undici": ["undici@7.16.0", "", {}, "sha512-QEg3HPMll0o3t2ourKwOeUAZ159Kn9mx5pnzHRQO8+Wixmh88YdZRiIwat0iNzNNXn0yoEtXJqFpyW7eM8BV7g=="], + "undici-types": ["undici-types@7.13.0", "", {}, "sha512-Ov2Rr9Sx+fRgagJ5AX0qvItZG/JKKoBRAVITs1zk7IqZGTJUwgUr7qoYBpWwakpWilTZFM98rG/AFRocu10iIQ=="], "unxhr": ["unxhr@1.2.0", "", {}, "sha512-6cGpm8NFXPD9QbSNx0cD2giy7teZ6xOkCUH3U89WKVkL9N9rBrWjlCwhR94Re18ZlAop4MOc3WU1M3Hv/bgpIw=="], @@ -512,6 +546,10 @@ "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="], + "whatwg-encoding": ["whatwg-encoding@3.1.1", "", { "dependencies": { "iconv-lite": "0.6.3" } }, "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ=="], + + "whatwg-mimetype": ["whatwg-mimetype@4.0.0", "", {}, "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg=="], + "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="], "winston": ["winston@3.18.3", "", { "dependencies": { "@colors/colors": "^1.6.0", "@dabh/diagnostics": "^2.0.8", "async": "^3.2.3", "is-stream": "^2.0.0", "logform": "^2.7.0", "one-time": "^1.0.0", "readable-stream": "^3.4.0", "safe-stable-stringify": "^2.3.1", "stack-trace": "0.0.x", "triple-beam": "^1.3.0", "winston-transport": "^4.9.0" } }, "sha512-NoBZauFNNWENgsnC9YpgyYwOVrl2m58PpQ8lNHjV3kosGs7KJ7Npk9pCUE+WJlawVSe8mykWDKWFSVfs3QO9ww=="], @@ -552,6 +590,8 @@ "chalk/ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], + "cheerio/htmlparser2": ["htmlparser2@10.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.2.1", "entities": "^6.0.0" } }, "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g=="], + "color/color-convert": ["color-convert@3.1.2", "", { "dependencies": { "color-name": "^2.0.0" } }, "sha512-UNqkvCDXstVck3kdowtOTWROIJQwafjOfXSmddoDrXo4cewMKmusCeF22Q24zvjR8nwWib/3S/dfyzPItPEiJg=="], "color-string/color-name": ["color-name@2.0.2", "", {}, "sha512-9vEt7gE16EW7Eu7pvZnR0abW9z6ufzhXxGXZEVU9IqPdlsUiMwJeJfRtq0zePUmnbHGT9zajca7mX8zgoayo4A=="], @@ -564,6 +604,8 @@ "openai/@types/node": ["@types/node@18.19.129", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-hrmi5jWt2w60ayox3iIXwpMEnfUvOLJCRtrOPbHtH15nTjvO7uhnelvrdAs0dO0/zl5DZ3ZbahiaXEVb54ca/A=="], + "parse5/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], + "wrap-ansi/ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], "@jest/pattern/@types/node/undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], @@ -578,6 +620,8 @@ "bun-types/@types/node/undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], + "cheerio/htmlparser2/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], + "color/color-convert/color-name": ["color-name@2.0.2", "", {}, "sha512-9vEt7gE16EW7Eu7pvZnR0abW9z6ufzhXxGXZEVU9IqPdlsUiMwJeJfRtq0zePUmnbHGT9zajca7mX8zgoayo4A=="], "jest-mock/@types/node/undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], diff --git a/ingesters/config/sources.json b/ingesters/config/sources.json index bc2aebcc..af4127a1 100644 --- a/ingesters/config/sources.json +++ b/ingesters/config/sources.json @@ -73,7 +73,7 @@ "chunkOverlap": 512, "baseUrl": "https://docs.openzeppelin.com/contracts-cairo", "urlSuffix": "", - "useUrlMapping": false + "useUrlMapping": true } }, "corelib_docs": { @@ -131,7 +131,7 @@ "fileExtensions": [".md"], "chunkSize": 4096, "chunkOverlap": 512, - "baseUrl": "https://starknet.io/blog", + "baseUrl": "https://www.starknet.io/blog", "urlSuffix": "", "useUrlMapping": false } diff --git a/ingesters/package.json b/ingesters/package.json index a7725e04..1517884b 100644 --- a/ingesters/package.json +++ b/ingesters/package.json @@ -18,9 +18,11 @@ "adm-zip": "^0.5.16", "asciidoctor": "^3.0.4", "axios": "^1.7.9", + "cheerio": "^1.0.0-rc.12", "dotenv": "^16.4.7", "downdoc": "1.0.2-stable", "lunr": "^2.3.9", + "node-html-markdown": "^2.0.0", "pg": "^8.14.1", "winston": "^3.17.0" }, diff --git a/ingesters/src/ingesters/StarknetBlogIngester.ts b/ingesters/src/ingesters/StarknetBlogIngester.ts index d5671d21..b197ef23 100644 --- a/ingesters/src/ingesters/StarknetBlogIngester.ts +++ b/ingesters/src/ingesters/StarknetBlogIngester.ts @@ -1,32 +1,103 @@ -import { type BookConfig } from '../utils/types'; -import { MarkdownIngester } from './MarkdownIngester'; -import { type BookChunk, DocumentSource } from '../types'; +import axios from 'axios'; +import { load, type Cheerio, type Element } from 'cheerio'; +import { NodeHtmlMarkdown } from 'node-html-markdown'; import { Document } from '@langchain/core/documents'; -import { VectorStore } from '../db/postgresVectorStore'; -import { type VectorStoreUpdateOptions } from '../utils/vectorStoreUtils'; +import { type BookChunk, DocumentSource } from '../types'; +import { type BookConfig, type BookPageDto } from '../utils/types'; +import { MarkdownIngester } from './MarkdownIngester'; import { logger } from '../utils/logger'; -import * as fs from 'fs/promises'; -import * as path from 'path'; import { calculateHash } from '../utils/contentUtils'; import { RecursiveMarkdownSplitter, type SplitOptions, } from '../utils/RecursiveMarkdownSplitter'; -import { getPythonPath } from '../utils/paths'; + +const USER_AGENT = 'cairo-coder-ingester'; +const CONCURRENCY = 4; +const MAX_RETRIES = 5; +const TIMEOUT_MS = 30_000; +const REQUEST_DELAY_MS = 300; +const MAX_CRAWL_PAGES = 200; +const ALLOWED_YEARS = new Set([2025, 2026]); + +const DEFAULT_EXCLUDE_PATTERNS: RegExp[] = [ + /\/admin/i, + /\/api\//i, + /\/login/i, + /\/search/i, + /\/tag\//i, + /\/category\//i, + /\/author\//i, + /\/user\//i, + /\/wp-admin/i, + /\/wp-content/i, + /\/wp-includes/i, + /\/_next\//i, + /\/static\//i, + /\/assets\//i, + /\/js\//i, + /\/css\//i, + /\/images\//i, + /\/feed/i, + /\/rss/i, + /\/atom/i, + /\/sitemap/i, + /\/robots\.txt/i, + /\bmailto:/i, + /\btel:/i, + /#/, // fragments handled separately, but keep as guard + /\.css$/i, + /\/video\/?$/i, +]; + +const MAIN_CONTENT_SELECTORS = [ + 'main', + 'article', + '[role="main"]', + '.content', + '.doc-content', + '.markdown-body', + '.docs-content', + '.documentation', + '.post-content', + '.entry-content', + '.page-content', + '#content', + '.container-fluid', + '.container', + '.wrapper', +]; + +const BOILERPLATE_KEYWORDS = [ + 'navbar', + 'sidebar', + 'nav-bar', + 'side-bar', + 'menu', + 'toc', + 'breadcrumb', + 'footer', + 'header', +]; + +const PUBLISHED_META_SELECTORS = [ + 'meta[property="article:published_time"]', + 'meta[name="article:published_time"]', + 'meta[property="article:published"]', + 'meta[name="publish_date"]', + 'meta[name="pubdate"]', + 'meta[name="date"]', + 'meta[property="og:pubdate"]', +]; + +const MONTH_REGEX = + /(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*\s+\d{1,2},\s+(\d{4})/i; /** - * Ingester for Starknet blog posts documentation - * - * This ingester processes pre-summarized Starknet blog posts from the generated - * summary file, chunks them using the RecursiveMarkdownSplitter, and stores them - * in the vector database for retrieval. + * Ingester for Starknet blog posts (2025/2026). */ export class StarknetBlogIngester extends MarkdownIngester { - /** - * Constructor for the Starknet Blog ingester - */ constructor() { - // Define the configuration for the Starknet Blog const config: BookConfig = { repoOwner: 'starknet', repoName: 'starknet-blog', @@ -35,123 +106,599 @@ export class StarknetBlogIngester extends MarkdownIngester { chunkOverlap: 512, baseUrl: 'https://www.starknet.io/blog', urlSuffix: '', - useUrlMapping: false, + useUrlMapping: true, }; super(config, DocumentSource.STARKNET_BLOG); } - /** - * Read the pre-summarized Starknet blog documentation file - */ - async readSummaryFile(): Promise { - const summaryPath = getPythonPath( - 'src', - 'cairo_coder_tools', - 'ingestion', - 'generated', - 'starknet-blog.md', + protected override async downloadAndExtractDocs(): Promise { + logger.info('Crawling Starknet blog posts for 2025-2026'); + + const baseUrl = this.config.baseUrl; + const discoveredUrls = await discoverUrls(baseUrl); + + if (discoveredUrls.length === 0) { + throw new Error('No URLs discovered for Starknet blog crawl'); + } + + const filteredUrls = filterUrls(discoveredUrls, baseUrl); + + if (filteredUrls.length === 0) { + throw new Error('No URLs remaining after Starknet blog filtering'); + } + + logger.info(`Processing ${filteredUrls.length} Starknet blog URLs`); + + const results = await mapWithConcurrency( + filteredUrls, + CONCURRENCY, + async (url) => { + const page = await fetchAndProcessPage(url, baseUrl); + if (!page) { + return null; + } + return page; + }, ); - logger.info(`Reading Starknet blog summary from ${summaryPath}`); - const text = await fs.readFile(summaryPath, 'utf-8'); - return text; - } - - /** - * Chunk the blog summary file using RecursiveMarkdownSplitter - * - * This function takes the markdown content and splits it using a recursive - * strategy that respects headers, code blocks, and maintains overlap between chunks. - * - * @param text - The markdown content to chunk - * @returns Promise[]> - Array of document chunks - */ - async chunkSummaryFile(text: string): Promise[]> { - // Configure the splitter with appropriate settings + const pages = results.filter((page): page is BookPageDto => page !== null); + + logger.info(`Collected ${pages.length} Starknet blog posts for ingestion`); + return pages; + } + + protected override async createChunks( + pages: BookPageDto[], + ): Promise[]> { + logger.info('Creating chunks from Starknet blog pages'); + + const chunks: Document[] = []; const splitOptions: SplitOptions = { maxChars: 2048, minChars: 500, overlap: 256, - headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest) + headerLevels: [1, 2, 3], preserveCodeBlocks: true, - idPrefix: 'starknet-blog', trim: true, }; - // Create the splitter and split the content - const splitter = new RecursiveMarkdownSplitter(splitOptions); - const chunks = splitter.splitMarkdownToChunks(text); + for (const page of pages) { + const pageId = sanitizePageId(page.name); + const splitter = new RecursiveMarkdownSplitter({ + ...splitOptions, + idPrefix: `starknet-blog-${pageId}`, + }); + const pageChunks = splitter.splitMarkdownToChunks(page.content); + const pageSource = this.buildSourceLink(page.name); - logger.info( - `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`, - ); + pageChunks.forEach((chunk) => { + const contentHash = calculateHash(chunk.content); + const uniqueId = chunk.meta.uniqueId; + const sourceLink = pageSource; - // Convert chunks to Document format - const localChunks: Document[] = chunks.map((chunk) => { - const contentHash = calculateHash(chunk.content); - - return new Document({ - pageContent: chunk.content, - metadata: { - name: chunk.meta.title, - title: chunk.meta.title, - chunkNumber: chunk.meta.chunkNumber, // Already 0-based - contentHash: contentHash, - uniqueId: chunk.meta.uniqueId, - sourceLink: chunk.meta.sourceLink || this.config.baseUrl, - source: this.source, - }, + chunks.push( + new Document({ + pageContent: chunk.content, + metadata: { + name: page.name, + title: chunk.meta.title || page.name, + chunkNumber: chunk.meta.chunkNumber, + contentHash, + uniqueId, + sourceLink, + source: this.source, + }, + }), + ); }); + } + + logger.info(`Created ${chunks.length} chunks from Starknet blog pages`); + return chunks; + } + + protected getExtractDir(): string { + const { getTempDir } = require('../utils/paths'); + return getTempDir('starknet-blog'); + } + + protected override async cleanupDownloadedFiles(): Promise { + logger.info('No cleanup needed - Starknet blog crawl is in-memory'); + } + + private buildSourceLink(pageName: string): string { + const baseUrl = this.config.baseUrl.replace(/\/$/, ''); + const trimmed = pageName.replace(/^\/+/, ''); + return trimmed ? `${baseUrl}/${trimmed}` : baseUrl; + } +} + +async function discoverUrls(baseUrl: string): Promise { + const sitemapUrls = await discoverUrlsFromSitemap(baseUrl); + if (sitemapUrls.length > 0) { + return sitemapUrls; + } + + return discoverUrlsByCrawling(baseUrl); +} + +async function discoverUrlsFromSitemap(baseUrl: string): Promise { + const sitemapUrl = new URL('/sitemap.xml', baseUrl).toString(); + logger.info(`Checking for sitemap at ${sitemapUrl}`); + + const urls = await parseSitemap(sitemapUrl); + const base = new URL(baseUrl); + const seen = new Set(); + const validUrls: string[] = []; + + for (const url of urls) { + if (!url) continue; + const normalized = normalizeUrl(url); + if (!isValidUrl(normalized, base)) continue; + const canonical = canonicalizeUrl(normalized, baseUrl); + if (seen.has(canonical)) continue; + seen.add(canonical); + validUrls.push(canonical); + } + + logger.info(`Found ${validUrls.length} valid URLs from sitemap`); + return validUrls; +} + +async function parseSitemap(sitemapUrl: string): Promise { + const sitemapContent = await fetchSitemap(sitemapUrl); + if (!sitemapContent) { + return []; + } + + const locMatches = sitemapContent.matchAll(/\s*([^<\s]+)\s*<\/loc>/gi); + const locs = Array.from(locMatches, (match) => decodeXml(match[1] ?? '')); + + if (/sitemapindex/i.test(sitemapContent)) { + const nestedUrls: string[] = []; + for (const loc of locs) { + if (!loc) continue; + const nested = await parseSitemap(loc); + nestedUrls.push(...nested); + } + return nestedUrls; + } + + return locs; +} + +async function fetchSitemap(url: string): Promise { + try { + const response = await axios.get(url, { + headers: { 'User-Agent': USER_AGENT }, + timeout: TIMEOUT_MS, + validateStatus: () => true, }); - return localChunks; + if (response.status >= 200 && response.status < 300) { + return response.data as string; + } + } catch (error) { + logger.debug(`Failed to fetch sitemap ${url}: ${String(error)}`); } - /** - * Starknet Blog specific processing based on the pre-summarized markdown file - * @param vectorStore - */ - public override async process( - vectorStore: VectorStore, - options?: VectorStoreUpdateOptions, - ): Promise { - try { - // 1. Read the pre-summarized documentation - const text = await this.readSummaryFile(); + return null; +} - // 2. Create chunks from the documentation - const chunks = await this.chunkSummaryFile(text); +async function discoverUrlsByCrawling(baseUrl: string): Promise { + logger.info('Falling back to crawling for URL discovery'); + const base = new URL(baseUrl); + const visited = new Set(); + const queue: string[] = [normalizeUrl(baseUrl)]; + visited.add(normalizeUrl(baseUrl)); + + while (queue.length > 0 && visited.size < MAX_CRAWL_PAGES) { + const current = queue.shift(); + if (!current) continue; + + const html = await fetchHtml(current); + if (!html) continue; + + const links = extractLinks(html, current); + for (const link of links) { + if (visited.size >= MAX_CRAWL_PAGES) break; + const canonical = canonicalizeUrl(link, baseUrl); + if (!isValidUrl(canonical, base)) continue; + if (visited.has(canonical)) continue; + visited.add(canonical); + queue.push(canonical); + } + } + + logger.info(`Discovered ${visited.size} pages by crawling`); + return Array.from(visited); +} + +function filterUrls(urls: string[], baseUrl: string): string[] { + const base = new URL(baseUrl); + const seen = new Set(); + const filtered: string[] = []; + + for (const url of urls) { + const normalized = normalizeUrl(url); + const canonical = canonicalizeUrl(normalized, baseUrl); + if (!isValidUrl(canonical, base)) continue; + if (!isBlogPostPath(canonical, baseUrl)) continue; + if (seen.has(canonical)) continue; + seen.add(canonical); + filtered.push(canonical); + } + + return filtered.sort(); +} + +async function fetchAndProcessPage( + url: string, + baseUrl: string, +): Promise { + const html = await fetchHtml(url); + if (!html) { + logger.debug(`Skipping ${url}: failed to fetch HTML`); + return null; + } + + const { markdown, title, publishedYear } = extractContent(html, url); + + if (!publishedYear || !ALLOWED_YEARS.has(publishedYear)) { + logger.debug(`Skipping ${url}: not a 2025/2026 blog post`); + return null; + } + + const cleaned = cleanBlogMarkdown(markdown); + if (!cleaned || cleaned.length < 50) { + logger.debug(`Skipping ${url}: extracted markdown too small`); + return null; + } + + const content = ensureTitleInMarkdown(title, cleaned); + const pageName = buildPageName(url, baseUrl); + + return { + name: pageName, + content, + }; +} + +async function fetchHtml(url: string): Promise { + let lastError = 'Unknown error'; + + for (let attempt = 0; attempt < MAX_RETRIES; attempt += 1) { + try { + const response = await axios.get(url, { + headers: { 'User-Agent': USER_AGENT }, + timeout: TIMEOUT_MS, + validateStatus: () => true, + }); - logger.info( - `Created ${chunks.length} chunks from Starknet blog documentation`, - ); + const contentType = response.headers['content-type'] ?? ''; + if (response.status === 200 && contentType.includes('text/html')) { + await sleep(REQUEST_DELAY_MS); + return response.data as string; + } - // 3. Update the vector store with the chunks - await this.updateVectorStore(vectorStore, chunks, options); + if (response.status === 429 || response.status >= 500) { + lastError = `Status ${response.status}`; + const retryAfter = response.headers['retry-after']; + const delayMs = retryAfter + ? Number.parseInt(String(retryAfter), 10) * 1000 + : 2 ** attempt * 1000; + await sleep(delayMs); + continue; + } - // 4. Clean up any temporary files (no temp files in this case) - await this.cleanupDownloadedFiles(); + return null; } catch (error) { - this.handleError(error); + lastError = String(error); + const delayMs = 2 ** attempt * 1000; + await sleep(delayMs); } } - /** - * Get the directory path for extracting files - * - * @returns string - Path to the extract directory - */ - protected getExtractDir(): string { - const { getTempDir } = require('../utils/paths'); - return getTempDir('starknet-blog'); + logger.debug(`Failed to fetch ${url} after ${MAX_RETRIES} attempts: ${lastError}`); + return null; +} + +function extractLinks(html: string, baseUrl: string): string[] { + const $ = load(html); + const links = new Set(); + + $('a[href], link[href]').each((_, element) => { + const href = $(element).attr('href'); + if (!href) return; + + try { + const absoluteUrl = new URL(href, baseUrl).toString(); + links.add(normalizeUrl(absoluteUrl)); + } catch { + return; + } + }); + + return Array.from(links); +} + +function extractContent( + html: string, + url: string, +): { markdown: string; title: string; publishedYear: number | null } { + const $ = load(html); + + const title = + $('meta[property="og:title"]').attr('content')?.trim() || + $('title').first().text().trim() || + $('h1').first().text().trim() || + url; + + $('script, style, noscript, nav, header, footer, aside, img, svg, iframe').remove(); + + $('*').each((_, element) => { + const node = $(element); + const id = node.attr('id')?.toLowerCase() ?? ''; + const className = (node.attr('class') ?? '').toLowerCase(); + + if (BOILERPLATE_KEYWORDS.some((keyword) => id.includes(keyword))) { + node.remove(); + return; + } + + if (BOILERPLATE_KEYWORDS.some((keyword) => className.includes(keyword))) { + node.remove(); + } + }); + + let mainContent: Cheerio | null = null; + for (const selector of MAIN_CONTENT_SELECTORS) { + const element = $(selector).first(); + if (element.length === 0) continue; + const textLength = element.text().trim().length; + if (textLength > 100) { + mainContent = element; + break; + } } - /** - * Override cleanupDownloadedFiles since we don't download anything - */ - protected override async cleanupDownloadedFiles(): Promise { - // No cleanup needed as we're reading from a local file - logger.info('No cleanup needed - using local summary file'); + if (!mainContent || mainContent.length === 0) { + let bestDiv = null; + let bestLength = 0; + $('div').each((_, element) => { + const node = $(element); + const textLength = node.text().trim().length; + if (textLength < 200) return; + const className = (node.attr('class') ?? '').toLowerCase(); + const id = (node.attr('id') ?? '').toLowerCase(); + if ( + ['nav', 'menu', 'sidebar', 'header', 'footer'].some((kw) => + className.includes(kw) || id.includes(kw), + ) + ) { + return; + } + if (textLength > bestLength) { + bestLength = textLength; + bestDiv = node; + } + }); + mainContent = bestDiv ?? null; + } + + if (!mainContent || mainContent.length === 0) { + const body = $('body').first(); + mainContent = body.length > 0 ? body : null; } + + const htmlFragment = mainContent ? $.html(mainContent) ?? '' : ''; + const markdown = NodeHtmlMarkdown.translate(htmlFragment); + const publishedYear = extractPublishedYear($, markdown); + + return { + markdown: normalizeMarkdown(markdown), + title, + publishedYear, + }; +} + +function extractPublishedYear( + $: ReturnType, + markdown: string, +): number | null { + for (const selector of PUBLISHED_META_SELECTORS) { + const content = $(selector).attr('content'); + const year = parseYear(content); + if (year) return year; + } + + const timeElement = $('time[datetime]').first(); + const timeAttr = timeElement.attr('datetime'); + const timeYear = parseYear(timeAttr); + if (timeYear) return timeYear; + + const timeText = $('time').first().text().trim(); + const timeTextYear = parseYear(timeText); + if (timeTextYear) return timeTextYear; + + const snippet = markdown.slice(0, 2000); + const match = snippet.match(MONTH_REGEX); + if (match && match[2]) { + const year = Number.parseInt(match[2], 10); + return Number.isNaN(year) ? null : year; + } + + return null; +} + +function parseYear(value?: string | null): number | null { + if (!value) return null; + const parsed = new Date(value); + if (!Number.isNaN(parsed.valueOf())) { + return parsed.getUTCFullYear(); + } + + const match = value.match(/(\d{4})/); + if (match) { + const year = Number.parseInt(match[1] ?? '', 10); + return Number.isNaN(year) ? null : year; + } + + return null; +} + +function cleanBlogMarkdown(markdown: string): string { + let cleaned = markdown; + + cleaned = cleaned.replace( + /^#{2,}\s*Join our newsletter[\s\S]*?(?=^#{2,}|\Z)/gim, + '', + ); + cleaned = cleaned.replace( + /^#{2,}\s*May also interest you[\s\S]*?(?=^#{2,}|\Z)/gim, + '', + ); + + cleaned = cleaned.replace( + /Join our newsletter[\s\S]*?(?=\n\n[A-Z]|\Z)/gi, + '', + ); + cleaned = cleaned.replace( + /May also interest you[\s\S]*?(?=\n\n[A-Z]|\Z)/gi, + '', + ); + + return normalizeMarkdown(cleaned).trim(); +} + +function normalizeMarkdown(markdown: string): string { + return markdown + .replace(/\n{3,}/g, '\n\n') + .replace(/^---+\n/gm, '') + .replace(/^\.{3,}\n/gm, '') + .trim(); +} + +function ensureTitleInMarkdown(title: string, markdown: string): string { + const trimmed = markdown.trim(); + if (trimmed.startsWith('# ')) { + return trimmed; + } + + return `# ${title}\n\n${trimmed}`.trim(); +} + +function buildPageName(url: string, baseUrl: string): string { + const base = new URL(baseUrl); + const parsed = new URL(url); + const basePath = base.pathname.replace(/\/$/, ''); + let path = parsed.pathname; + + if (basePath && path.startsWith(basePath)) { + path = path.slice(basePath.length); + } + + path = path.replace(/^\/+/, '').replace(/\/+$/, ''); + return path || 'index'; +} + +function sanitizePageId(pageName: string): string { + return pageName.replace(/[^a-zA-Z0-9-_]/g, '-'); +} + +function isValidUrl(url: string, baseUrl: URL): boolean { + let parsed: URL; + try { + parsed = new URL(url); + } catch { + return false; + } + + if (normalizeHost(parsed.host) !== normalizeHost(baseUrl.host)) return false; + + const basePath = baseUrl.pathname.replace(/\/$/, ''); + const urlPath = parsed.pathname.replace(/\/$/, ''); + if (basePath && !urlPath.startsWith(basePath)) return false; + + return !DEFAULT_EXCLUDE_PATTERNS.some((pattern) => pattern.test(parsed.pathname)); +} + +function isBlogPostPath(url: string, baseUrl: string): boolean { + const basePath = new URL(baseUrl).pathname.replace(/\/$/, ''); + const path = new URL(url).pathname.replace(/\/$/, ''); + if (path === basePath) return false; + if (!path.startsWith(basePath)) return false; + + const remainder = path.slice(basePath.length).replace(/^\/+/, ''); + return remainder.length > 0 && !remainder.startsWith('page/'); +} + +function normalizeUrl(url: string): string { + try { + const parsed = new URL(url); + parsed.hash = ''; + parsed.search = ''; + const normalized = parsed.toString(); + return normalized.endsWith('/') && parsed.pathname !== '/' + ? normalized.slice(0, -1) + : normalized; + } catch { + return url; + } +} + +function canonicalizeUrl(url: string, baseUrl: string): string { + try { + const parsed = new URL(url); + const base = new URL(baseUrl); + parsed.protocol = base.protocol; + parsed.host = base.host; + parsed.hash = ''; + parsed.search = ''; + const normalized = parsed.toString(); + return normalized.endsWith('/') && parsed.pathname !== '/' + ? normalized.slice(0, -1) + : normalized; + } catch { + return url; + } +} + +function normalizeHost(host: string): string { + return host.startsWith('www.') ? host.slice(4) : host; +} + +function decodeXml(value: string): string { + return value + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'"); +} + +async function mapWithConcurrency( + items: T[], + concurrency: number, + mapper: (item: T, index: number) => Promise, +): Promise { + const results: R[] = new Array(items.length); + let index = 0; + + const workers = Array.from({ length: concurrency }, async () => { + while (true) { + const current = index; + index += 1; + if (current >= items.length) break; + results[current] = await mapper(items[current] as T, current); + } + }); + + await Promise.all(workers); + return results; +} + +async function sleep(ms: number): Promise { + await new Promise((resolve) => setTimeout(resolve, ms)); } diff --git a/python/src/cairo_coder_tools/ingestion/web_targets.py b/python/src/cairo_coder_tools/ingestion/web_targets.py index 21e7e228..518d28ae 100644 --- a/python/src/cairo_coder_tools/ingestion/web_targets.py +++ b/python/src/cairo_coder_tools/ingestion/web_targets.py @@ -91,8 +91,8 @@ def get_default_output_path(self) -> Path: return Path(f"{os.getcwd()}/src/cairo_coder_tools/ingestion/generated/{self.name}.md") -def is_2025_blog_entry(content: str) -> bool: - """Check if content is a blog entry from 2025. +def is_2025_or_2026_blog_entry(content: str) -> bool: + """Check if content is a blog entry from 2025 or 2026. Looks for patterns like: Home / Blog @@ -118,7 +118,7 @@ def is_2025_blog_entry(content: str) -> bool: matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE) for match in matches: year = match[1] if len(match) > 1 else match[-1] - if year == '2025': + if year in {'2025', '2026'}: return True return False @@ -172,7 +172,7 @@ def clean_blog_content(content: str) -> str: name="starknet-blog", base_url="https://www.starknet.io/blog", exclude_patterns=[r'video/$'], # Exclude URLs ending with video/ - content_filter=is_2025_blog_entry, + content_filter=is_2025_or_2026_blog_entry, content_processor=clean_blog_content, ) diff --git a/python/src/scripts/ingest.py b/python/src/scripts/ingest.py index 3c4fc897..d94b0b55 100644 --- a/python/src/scripts/ingest.py +++ b/python/src/scripts/ingest.py @@ -133,8 +133,8 @@ def from_web( and compiles it into a single markdown file. Examples: - # Use predefined target for StarkNet 2025 blog posts - uv run ingest from-web starknet-blog-2025 + # Use predefined target for StarkNet 2025/2026 blog posts + uv run ingest from-web starknet-blog # Crawl StarkNet blog manually with filter uv run ingest from-web https://www.starknet.io/blog --content-filter="2025" @@ -153,8 +153,8 @@ def from_web( if target.get_exclude_url_patterns(): typer.echo(f" Exclude patterns: {target.get_exclude_url_patterns()}") # Show details for specific targets - if target.name == "starknet-blog-2025": - typer.echo(" Content filter: 2025 blog entries only") + if target.name == "starknet-blog": + typer.echo(" Content filter: 2025/2026 blog entries only") typer.echo(" Content processor: removes newsletter/interest sections") else: # Create a generic target on the fly for custom URLs From ae5917d601694e61d6a14f2f1f68d0594ce11465 Mon Sep 17 00:00:00 2001 From: enitrat Date: Sun, 4 Jan 2026 11:57:57 +0100 Subject: [PATCH 2/7] Harden Starknet blog crawl and cleanup --- .../__tests__/StarknetBlogIngester.test.ts | 33 ++- .../src/ingesters/StarknetBlogIngester.ts | 200 +++++++++++++++--- 2 files changed, 200 insertions(+), 33 deletions(-) diff --git a/ingesters/__tests__/StarknetBlogIngester.test.ts b/ingesters/__tests__/StarknetBlogIngester.test.ts index 606b3a40..f539f0b8 100644 --- a/ingesters/__tests__/StarknetBlogIngester.test.ts +++ b/ingesters/__tests__/StarknetBlogIngester.test.ts @@ -28,16 +28,21 @@ const buildHtml = (options: { title: string; metaDate?: string; timeDate?: string; + headerTimeDate?: string; + jsonLdDate?: string; bodyText?: string; }): string => { - const { title, metaDate, timeDate, bodyText } = options; + const { title, metaDate, timeDate, headerTimeDate, jsonLdDate, bodyText } = + options; return ` ${title} ${metaDate ? `` : ''} + ${jsonLdDate ? `` : ''} + ${headerTimeDate ? `
` : ''}

${title}

${timeDate ? `` : ''} @@ -51,7 +56,9 @@ const buildHtml = (options: { `; }; -const buildSitemap = (urls: string[]): string => ` +const buildSitemap = ( + urls: string[], +): string => ` ${urls.map((url) => ` ${url}`).join('\n')} `; @@ -166,6 +173,20 @@ describe('StarknetBlogIngester (crawler)', () => { timeDate: '2026-04-12T00:00:00Z', }), }, + { + label: 'json-ld', + html: buildHtml({ + title: 'JsonLd Date', + jsonLdDate: '2025-11-01T00:00:00Z', + }), + }, + { + label: 'header time element', + html: buildHtml({ + title: 'Header Time', + headerTimeDate: '2025-08-09T00:00:00Z', + }), + }, { label: 'markdown text', html: buildHtml({ @@ -220,9 +241,11 @@ describe('StarknetBlogIngester (crawler)', () => { expect(chunk.metadata.sourceLink).toBe( 'https://www.starknet.io/blog/posts/2025/hello-world', ); - expect(chunk.metadata.uniqueId.startsWith('starknet-blog-posts-2025-hello-world-')).toBe( - true, - ); + expect( + chunk.metadata.uniqueId.startsWith( + 'starknet-blog-posts-2025-hello-world-', + ), + ).toBe(true); }); }); }); diff --git a/ingesters/src/ingesters/StarknetBlogIngester.ts b/ingesters/src/ingesters/StarknetBlogIngester.ts index b197ef23..35cbb71f 100644 --- a/ingesters/src/ingesters/StarknetBlogIngester.ts +++ b/ingesters/src/ingesters/StarknetBlogIngester.ts @@ -2,6 +2,7 @@ import axios from 'axios'; import { load, type Cheerio, type Element } from 'cheerio'; import { NodeHtmlMarkdown } from 'node-html-markdown'; import { Document } from '@langchain/core/documents'; +import { gunzipSync } from 'zlib'; import { type BookChunk, DocumentSource } from '../types'; import { type BookConfig, type BookPageDto } from '../utils/types'; import { MarkdownIngester } from './MarkdownIngester'; @@ -17,8 +18,13 @@ const CONCURRENCY = 4; const MAX_RETRIES = 5; const TIMEOUT_MS = 30_000; const REQUEST_DELAY_MS = 300; +const REQUEST_JITTER_MS = 200; const MAX_CRAWL_PAGES = 200; const ALLOWED_YEARS = new Set([2025, 2026]); +const MIN_RETRY_DELAY_MS = 1_000; +const MAX_RETRY_DELAY_MS = 60_000; +const MIN_MARKDOWN_LENGTH = 30; +let globalBackoffUntil = 0; const DEFAULT_EXCLUDE_PATTERNS: RegExp[] = [ /\/admin/i, @@ -273,11 +279,28 @@ async function fetchSitemap(url: string): Promise { const response = await axios.get(url, { headers: { 'User-Agent': USER_AGENT }, timeout: TIMEOUT_MS, + responseType: 'arraybuffer', validateStatus: () => true, }); if (response.status >= 200 && response.status < 300) { - return response.data as string; + const contentType = response.headers['content-type'] ?? ''; + const isGzip = /gzip/i.test(contentType) || url.endsWith('.gz'); + const data = response.data; + const buffer = Buffer.isBuffer(data) + ? data + : Buffer.from( + typeof data === 'string' ? data : (data as ArrayBuffer), + ); + let xml = buffer.toString('utf8'); + if (isGzip) { + try { + xml = gunzipSync(buffer).toString('utf8'); + } catch (error) { + logger.debug(`Failed to gunzip sitemap ${url}: ${String(error)}`); + } + } + return xml; } } catch (error) { logger.debug(`Failed to fetch sitemap ${url}: ${String(error)}`); @@ -351,7 +374,7 @@ async function fetchAndProcessPage( } const cleaned = cleanBlogMarkdown(markdown); - if (!cleaned || cleaned.length < 50) { + if (!cleaned || cleaned.length < MIN_MARKDOWN_LENGTH) { logger.debug(`Skipping ${url}: extracted markdown too small`); return null; } @@ -370,6 +393,7 @@ async function fetchHtml(url: string): Promise { for (let attempt = 0; attempt < MAX_RETRIES; attempt += 1) { try { + await waitForGlobalBackoff(); const response = await axios.get(url, { headers: { 'User-Agent': USER_AGENT }, timeout: TIMEOUT_MS, @@ -378,16 +402,15 @@ async function fetchHtml(url: string): Promise { const contentType = response.headers['content-type'] ?? ''; if (response.status === 200 && contentType.includes('text/html')) { - await sleep(REQUEST_DELAY_MS); + await sleep(REQUEST_DELAY_MS + randomJitter(REQUEST_JITTER_MS)); return response.data as string; } if (response.status === 429 || response.status >= 500) { lastError = `Status ${response.status}`; const retryAfter = response.headers['retry-after']; - const delayMs = retryAfter - ? Number.parseInt(String(retryAfter), 10) * 1000 - : 2 ** attempt * 1000; + const delayMs = computeRetryDelay(retryAfter, attempt); + scheduleGlobalBackoff(delayMs); await sleep(delayMs); continue; } @@ -395,12 +418,15 @@ async function fetchHtml(url: string): Promise { return null; } catch (error) { lastError = String(error); - const delayMs = 2 ** attempt * 1000; + const delayMs = computeRetryDelay(undefined, attempt); + scheduleGlobalBackoff(delayMs); await sleep(delayMs); } } - logger.debug(`Failed to fetch ${url} after ${MAX_RETRIES} attempts: ${lastError}`); + logger.debug( + `Failed to fetch ${url} after ${MAX_RETRIES} attempts: ${lastError}`, + ); return null; } @@ -435,7 +461,11 @@ function extractContent( $('h1').first().text().trim() || url; - $('script, style, noscript, nav, header, footer, aside, img, svg, iframe').remove(); + const publishedYear = extractPublishedYearFromDom($); + + $( + 'script, style, noscript, nav, header, footer, aside, img, svg, iframe', + ).remove(); $('*').each((_, element) => { const node = $(element); @@ -473,8 +503,8 @@ function extractContent( const className = (node.attr('class') ?? '').toLowerCase(); const id = (node.attr('id') ?? '').toLowerCase(); if ( - ['nav', 'menu', 'sidebar', 'header', 'footer'].some((kw) => - className.includes(kw) || id.includes(kw), + ['nav', 'menu', 'sidebar', 'header', 'footer'].some( + (kw) => className.includes(kw) || id.includes(kw), ) ) { return; @@ -492,20 +522,21 @@ function extractContent( mainContent = body.length > 0 ? body : null; } - const htmlFragment = mainContent ? $.html(mainContent) ?? '' : ''; + const htmlFragment = mainContent ? ($.html(mainContent) ?? '') : ''; const markdown = NodeHtmlMarkdown.translate(htmlFragment); - const publishedYear = extractPublishedYear($, markdown); + const normalizedMarkdown = normalizeMarkdown(markdown); + const finalPublishedYear = + publishedYear ?? extractPublishedYearFromMarkdown(normalizedMarkdown); return { - markdown: normalizeMarkdown(markdown), + markdown: normalizedMarkdown, title, - publishedYear, + publishedYear: finalPublishedYear, }; } -function extractPublishedYear( +function extractPublishedYearFromDom( $: ReturnType, - markdown: string, ): number | null { for (const selector of PUBLISHED_META_SELECTORS) { const content = $(selector).attr('content'); @@ -522,6 +553,13 @@ function extractPublishedYear( const timeTextYear = parseYear(timeText); if (timeTextYear) return timeTextYear; + const jsonLdYear = extractPublishedYearFromJsonLd($); + if (jsonLdYear) return jsonLdYear; + + return null; +} + +function extractPublishedYearFromMarkdown(markdown: string): number | null { const snippet = markdown.slice(0, 2000); const match = snippet.match(MONTH_REGEX); if (match && match[2]) { @@ -551,27 +589,30 @@ function parseYear(value?: string | null): number | null { function cleanBlogMarkdown(markdown: string): string { let cleaned = markdown; - cleaned = cleaned.replace( - /^#{2,}\s*Join our newsletter[\s\S]*?(?=^#{2,}|\Z)/gim, - '', - ); - cleaned = cleaned.replace( - /^#{2,}\s*May also interest you[\s\S]*?(?=^#{2,}|\Z)/gim, - '', - ); + cleaned = stripSectionByHeading(cleaned, 'Join our newsletter'); + cleaned = stripSectionByHeading(cleaned, 'May also interest you'); cleaned = cleaned.replace( - /Join our newsletter[\s\S]*?(?=\n\n[A-Z]|\Z)/gi, + /^Join our newsletter\s*[\s\S]*?(?=\n\n|\s*(?![\s\S]))/gim, '', ); cleaned = cleaned.replace( - /May also interest you[\s\S]*?(?=\n\n[A-Z]|\Z)/gi, + /^May also interest you\s*[\s\S]*?(?=\n\n|\s*(?![\s\S]))/gim, '', ); return normalizeMarkdown(cleaned).trim(); } +function stripSectionByHeading(markdown: string, headingText: string): string { + const escaped = headingText.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const pattern = new RegExp( + `^\\s*#{2,}\\s*${escaped}\\b[^\\n]*\\n[\\s\\S]*?(?=^\\s*#{2,}\\s|\\s*(?![\\s\\S]))`, + 'gim', + ); + return markdown.replace(pattern, ''); +} + function normalizeMarkdown(markdown: string): string { return markdown .replace(/\n{3,}/g, '\n\n') @@ -621,7 +662,9 @@ function isValidUrl(url: string, baseUrl: URL): boolean { const urlPath = parsed.pathname.replace(/\/$/, ''); if (basePath && !urlPath.startsWith(basePath)) return false; - return !DEFAULT_EXCLUDE_PATTERNS.some((pattern) => pattern.test(parsed.pathname)); + return !DEFAULT_EXCLUDE_PATTERNS.some((pattern) => + pattern.test(parsed.pathname), + ); } function isBlogPostPath(url: string, baseUrl: string): boolean { @@ -702,3 +745,104 @@ async function mapWithConcurrency( async function sleep(ms: number): Promise { await new Promise((resolve) => setTimeout(resolve, ms)); } + +function computeRetryDelay( + retryAfter: string | number | undefined, + attempt: number, +): number { + if (retryAfter !== undefined) { + const raw = String(retryAfter).trim(); + const seconds = Number.parseInt(raw, 10); + if (!Number.isNaN(seconds)) { + return clampRetryDelay(seconds * 1000); + } + const parsedDate = Date.parse(raw); + if (!Number.isNaN(parsedDate)) { + const delta = parsedDate - Date.now(); + return clampRetryDelay(delta); + } + } + + return clampRetryDelay(2 ** attempt * 1000); +} + +function clampRetryDelay(delayMs: number): number { + const normalized = Math.max(delayMs, MIN_RETRY_DELAY_MS); + return Math.min(normalized, MAX_RETRY_DELAY_MS); +} + +function scheduleGlobalBackoff(delayMs: number): void { + const target = Date.now() + delayMs + randomJitter(delayMs * 0.1); + if (target > globalBackoffUntil) { + globalBackoffUntil = target; + } +} + +async function waitForGlobalBackoff(): Promise { + const now = Date.now(); + if (globalBackoffUntil > now) { + await sleep(globalBackoffUntil - now); + } +} + +function randomJitter(maxJitterMs: number): number { + if (maxJitterMs <= 0) return 0; + return Math.floor(Math.random() * maxJitterMs); +} + +function extractPublishedYearFromJsonLd( + $: ReturnType, +): number | null { + const scripts = $('script[type="application/ld+json"]'); + for (const element of scripts.toArray()) { + const text = $(element).text().trim(); + if (!text) continue; + const year = parseJsonLdForYear(text); + if (year) return year; + } + return null; +} + +function parseJsonLdForYear(payload: string): number | null { + try { + const parsed = JSON.parse(payload); + return findYearInJsonLd(parsed, 0); + } catch { + return null; + } +} + +function findYearInJsonLd(value: unknown, depth: number): number | null { + if (depth > 6) return null; + if (!value || typeof value !== 'object') return null; + + if (Array.isArray(value)) { + for (const entry of value) { + const year = findYearInJsonLd(entry, depth + 1); + if (year) return year; + } + return null; + } + + const record = value as Record; + const dateKeys = ['datePublished', 'dateCreated', 'dateModified']; + for (const key of dateKeys) { + const dateValue = record[key]; + if (typeof dateValue === 'string') { + const year = parseYear(dateValue); + if (year) return year; + } + } + + if (record['@graph']) { + const year = findYearInJsonLd(record['@graph'], depth + 1); + if (year) return year; + } + + for (const entry of Object.values(record)) { + const year = findYearInJsonLd(entry, depth + 1); + if (year) return year; + } + + return null; +} From 8e5b150b51d76ba152d8990395afc30cb8232ed3 Mon Sep 17 00:00:00 2001 From: enitrat Date: Sun, 4 Jan 2026 12:05:37 +0100 Subject: [PATCH 3/7] fix(ingester): address review feedback for Starknet blog crawler - Fix potential infinite recursion in parseSitemap by adding depth limit and visited URL tracking - Return null explicitly when gzip decompression fails in fetchSitemap - Move getTempDir import to top-level (remove require() inside method) - Add debug logging for non-retryable HTTP status codes (404, etc.) - Reset globalBackoffUntil at start of crawl for parallel safety --- .../src/ingesters/StarknetBlogIngester.ts | 38 ++++++++++++++++--- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/ingesters/src/ingesters/StarknetBlogIngester.ts b/ingesters/src/ingesters/StarknetBlogIngester.ts index 35cbb71f..f937118c 100644 --- a/ingesters/src/ingesters/StarknetBlogIngester.ts +++ b/ingesters/src/ingesters/StarknetBlogIngester.ts @@ -12,6 +12,7 @@ import { RecursiveMarkdownSplitter, type SplitOptions, } from '../utils/RecursiveMarkdownSplitter'; +import { getTempDir } from '../utils/paths'; const USER_AGENT = 'cairo-coder-ingester'; const CONCURRENCY = 4; @@ -24,6 +25,7 @@ const ALLOWED_YEARS = new Set([2025, 2026]); const MIN_RETRY_DELAY_MS = 1_000; const MAX_RETRY_DELAY_MS = 60_000; const MIN_MARKDOWN_LENGTH = 30; +const MAX_SITEMAP_DEPTH = 5; let globalBackoffUntil = 0; const DEFAULT_EXCLUDE_PATTERNS: RegExp[] = [ @@ -121,6 +123,9 @@ export class StarknetBlogIngester extends MarkdownIngester { protected override async downloadAndExtractDocs(): Promise { logger.info('Crawling Starknet blog posts for 2025-2026'); + // Reset global backoff state for fresh crawl + globalBackoffUntil = 0; + const baseUrl = this.config.baseUrl; const discoveredUrls = await discoverUrls(baseUrl); @@ -205,7 +210,6 @@ export class StarknetBlogIngester extends MarkdownIngester { } protected getExtractDir(): string { - const { getTempDir } = require('../utils/paths'); return getTempDir('starknet-blog'); } @@ -252,7 +256,23 @@ async function discoverUrlsFromSitemap(baseUrl: string): Promise { return validUrls; } -async function parseSitemap(sitemapUrl: string): Promise { +async function parseSitemap( + sitemapUrl: string, + depth = 0, + visited = new Set(), +): Promise { + if (depth > MAX_SITEMAP_DEPTH) { + logger.warn(`Sitemap recursion depth exceeded for ${sitemapUrl}`); + return []; + } + + const normalizedUrl = sitemapUrl.toLowerCase(); + if (visited.has(normalizedUrl)) { + logger.debug(`Skipping already visited sitemap: ${sitemapUrl}`); + return []; + } + visited.add(normalizedUrl); + const sitemapContent = await fetchSitemap(sitemapUrl); if (!sitemapContent) { return []; @@ -265,7 +285,7 @@ async function parseSitemap(sitemapUrl: string): Promise { const nestedUrls: string[] = []; for (const loc of locs) { if (!loc) continue; - const nested = await parseSitemap(loc); + const nested = await parseSitemap(loc, depth + 1, visited); nestedUrls.push(...nested); } return nestedUrls; @@ -292,15 +312,15 @@ async function fetchSitemap(url: string): Promise { : Buffer.from( typeof data === 'string' ? data : (data as ArrayBuffer), ); - let xml = buffer.toString('utf8'); if (isGzip) { try { - xml = gunzipSync(buffer).toString('utf8'); + return gunzipSync(buffer).toString('utf8'); } catch (error) { logger.debug(`Failed to gunzip sitemap ${url}: ${String(error)}`); + return null; } } - return xml; + return buffer.toString('utf8'); } } catch (error) { logger.debug(`Failed to fetch sitemap ${url}: ${String(error)}`); @@ -415,6 +435,12 @@ async function fetchHtml(url: string): Promise { continue; } + // Log non-retryable failures (404, 403, etc.) for debugging stale sitemap entries + if (response.status !== 200) { + logger.debug(`Non-retryable status ${response.status} for ${url}`); + } else if (!contentType.includes('text/html')) { + logger.debug(`Non-HTML content-type "${contentType}" for ${url}`); + } return null; } catch (error) { lastError = String(error); From 3328f6d5133dbc196d067d3fb58e9ed81795e99e Mon Sep 17 00:00:00 2001 From: enitrat Date: Sun, 4 Jan 2026 12:49:00 +0100 Subject: [PATCH 4/7] fix(ingester): resolve TypeScript type errors in StarknetBlogIngester - Fix axios mock implementation types in tests to properly handle url parameter - Import AnyNode and Element types from domhandler for cheerio compatibility - Refactor Buffer.from usage to handle ArrayBuffer types explicitly - Update Cheerio type to Cheerio for proper type inference All TypeScript compilation errors resolved and tests passing. --- .../__tests__/StarknetBlogIngester.test.ts | 97 ++++- .../src/ingesters/StarknetBlogIngester.ts | 340 ++++++++---------- .../cairo_coder/dspy/document_retriever.py | 5 +- .../src/cairo_coder/dspy/retrieval_judge.py | 4 +- 4 files changed, 253 insertions(+), 193 deletions(-) diff --git a/ingesters/__tests__/StarknetBlogIngester.test.ts b/ingesters/__tests__/StarknetBlogIngester.test.ts index f539f0b8..703f6d59 100644 --- a/ingesters/__tests__/StarknetBlogIngester.test.ts +++ b/ingesters/__tests__/StarknetBlogIngester.test.ts @@ -1,6 +1,6 @@ import axios from 'axios'; -import { beforeEach, afterEach, describe, expect, it, vi } from 'bun:test'; -import { StarknetBlogIngester } from '../src/ingesters/StarknetBlogIngester'; +import { afterEach, describe, expect, it, vi } from 'bun:test'; +import { StarknetBlogIngester, __testing } from '../src/ingesters/StarknetBlogIngester'; import { type BookPageDto } from '../src/utils/types'; const BASE_URL = 'https://www.starknet.io/blog'; @@ -64,8 +64,8 @@ ${urls.map((url) => ` ${url}`).join('\n')} `; const mockAxiosGet = (responses: Map) => { - return vi.spyOn(axios, 'get').mockImplementation(async (url) => { - const key = typeof url === 'string' ? url : url.toString(); + return vi.spyOn(axios, 'get').mockImplementation(async (url: string | any) => { + const key = typeof url === 'string' ? url : String(url); const response = responses.get(key); if (response) { return response as any; @@ -80,10 +80,6 @@ const mockAxiosGet = (responses: Map) => { }; describe('StarknetBlogIngester (crawler)', () => { - beforeEach(() => { - vi.restoreAllMocks(); - }); - afterEach(() => { vi.restoreAllMocks(); }); @@ -154,6 +150,9 @@ describe('StarknetBlogIngester (crawler)', () => { pages.forEach((page) => { expect(page.content).not.toContain('Join our newsletter'); expect(page.content).not.toContain('May also interest you'); + expect(page.content.toLowerCase()).not.toContain('newsletter'); + expect(page.content.toLowerCase()).not.toContain('may also interest you'); + expect(page.content).not.toMatch(/(^|\n)#+\s*Authors?\b/i); expect(page.content.startsWith('# ')).toBe(true); }); }); @@ -222,6 +221,13 @@ describe('StarknetBlogIngester (crawler)', () => { expect(pages).toHaveLength(1); expect(pages[0]?.name).toBe('year-test'); + expect(pages[0]?.content).not.toContain('Join our newsletter'); + expect(pages[0]?.content).not.toContain('May also interest you'); + expect(pages[0]?.content?.toLowerCase()).not.toContain('newsletter'); + expect(pages[0]?.content?.toLowerCase()).not.toContain( + 'may also interest you', + ); + expect(pages[0]?.content).not.toMatch(/(^|\n)#+\s*Authors?\b/i); }); it('creates chunks with page-scoped source links and stable IDs', async () => { @@ -249,3 +255,78 @@ describe('StarknetBlogIngester (crawler)', () => { }); }); }); + +describe('StarknetBlogIngester (real page integration)', () => { + const REAL_PAGE_URL = + 'https://www.starknet.io/blog/starknet-2025-year-in-review'; + const REAL_PAGE_URL_SLASH = `${REAL_PAGE_URL}/`; + const REAL_PAGE_NAME = 'starknet-2025-year-in-review'; + + it( + 'processes real page through ingester extraction logic', + async () => { + const realResponse = await axios.get(REAL_PAGE_URL_SLASH, { + headers: { 'User-Agent': 'cairo-coder-ingester-test' }, + timeout: 30000, + }); + + expect(realResponse.status).toBe(200); + const html = realResponse.data as string; + expect(html).toContain('Starknet'); + + vi.spyOn(axios, 'get').mockImplementation(async (url: string | any, config?: any) => { + const key = typeof url === 'string' ? url : String(url); + if (key === SITEMAP_URL) { + return { + status: 200, + data: buildSitemap([REAL_PAGE_URL_SLASH]), + headers: { 'content-type': 'application/xml' }, + } as any; + } + + if (key === REAL_PAGE_URL || key === REAL_PAGE_URL_SLASH) { + return { + status: 200, + data: html, + headers: { 'content-type': 'text/html' }, + } as any; + } + + return { + status: 404, + data: '', + headers: { 'content-type': 'text/html' }, + } as any; + }); + + const ingester = new TestStarknetBlogIngester(); + const pages = await ingester.exposedDownloadAndExtractDocs(); + const page = pages.find((entry) => entry.name === REAL_PAGE_NAME); + + const { markdown, title, publishedYear } = __testing.extractContent( + html, + REAL_PAGE_URL, + ); + const cleaned = __testing.cleanBlogMarkdown(markdown); + const expectedContent = __testing.ensureTitleInMarkdown(title, cleaned); + + expect(title).toContain('Starknet'); + expect(publishedYear).toBe(2025); + + expect(page).toBeDefined(); + expect(page?.content.startsWith('# ')).toBe(true); + expect(page?.content).toContain('Starknet'); + expect(page?.content).toContain('2025'); + expect(page?.content).not.toContain('Join our newsletter'); + expect(page?.content).not.toContain('May also interest you'); + expect(page?.content.toLowerCase()).not.toContain('newsletter'); + expect(page?.content.toLowerCase()).not.toContain('may also interest you'); + expect(page?.content).not.toMatch(/(^|\n)#+\s*Authors?\b/i); + expect(page?.content).toBe(expectedContent); + expect(expectedContent.toLowerCase()).not.toContain('newsletter'); + expect(expectedContent.toLowerCase()).not.toContain('may also interest you'); + expect(expectedContent).not.toMatch(/(^|\n)#+\s*Authors?\b/i); + }, + { timeout: 30000 }, + ); +}); diff --git a/ingesters/src/ingesters/StarknetBlogIngester.ts b/ingesters/src/ingesters/StarknetBlogIngester.ts index f937118c..932f9fdb 100644 --- a/ingesters/src/ingesters/StarknetBlogIngester.ts +++ b/ingesters/src/ingesters/StarknetBlogIngester.ts @@ -1,5 +1,6 @@ import axios from 'axios'; -import { load, type Cheerio, type Element } from 'cheerio'; +import { load, type Cheerio } from 'cheerio'; +import type { AnyNode, Element } from 'domhandler'; import { NodeHtmlMarkdown } from 'node-html-markdown'; import { Document } from '@langchain/core/documents'; import { gunzipSync } from 'zlib'; @@ -244,12 +245,10 @@ async function discoverUrlsFromSitemap(baseUrl: string): Promise { for (const url of urls) { if (!url) continue; - const normalized = normalizeUrl(url); - if (!isValidUrl(normalized, base)) continue; - const canonical = canonicalizeUrl(normalized, baseUrl); - if (seen.has(canonical)) continue; - seen.add(canonical); - validUrls.push(canonical); + const processed = processUrl(url, base, baseUrl); + if (!processed || seen.has(processed)) continue; + seen.add(processed); + validUrls.push(processed); } logger.info(`Found ${validUrls.length} valid URLs from sitemap`); @@ -307,11 +306,14 @@ async function fetchSitemap(url: string): Promise { const contentType = response.headers['content-type'] ?? ''; const isGzip = /gzip/i.test(contentType) || url.endsWith('.gz'); const data = response.data; - const buffer = Buffer.isBuffer(data) - ? data - : Buffer.from( - typeof data === 'string' ? data : (data as ArrayBuffer), - ); + let buffer: Buffer; + if (Buffer.isBuffer(data)) { + buffer = data; + } else if (typeof data === 'string') { + buffer = Buffer.from(data); + } else { + buffer = Buffer.from(data as ArrayBuffer); + } if (isGzip) { try { return gunzipSync(buffer).toString('utf8'); @@ -333,8 +335,11 @@ async function discoverUrlsByCrawling(baseUrl: string): Promise { logger.info('Falling back to crawling for URL discovery'); const base = new URL(baseUrl); const visited = new Set(); - const queue: string[] = [normalizeUrl(baseUrl)]; - visited.add(normalizeUrl(baseUrl)); + const startUrl = processUrl(baseUrl, base, baseUrl); + if (!startUrl) return []; + + const queue: string[] = [startUrl]; + visited.add(startUrl); while (queue.length > 0 && visited.size < MAX_CRAWL_PAGES) { const current = queue.shift(); @@ -343,15 +348,19 @@ async function discoverUrlsByCrawling(baseUrl: string): Promise { const html = await fetchHtml(current); if (!html) continue; - const links = extractLinks(html, current); - for (const link of links) { - if (visited.size >= MAX_CRAWL_PAGES) break; - const canonical = canonicalizeUrl(link, baseUrl); - if (!isValidUrl(canonical, base)) continue; - if (visited.has(canonical)) continue; - visited.add(canonical); - queue.push(canonical); - } + const $ = load(html); + $('a[href], link[href]').each((_, element) => { + if (visited.size >= MAX_CRAWL_PAGES) return; + + const href = $(element).attr('href'); + if (!href) return; + + const processed = processUrl(href, base, baseUrl, current); + if (!processed || visited.has(processed)) return; + + visited.add(processed); + queue.push(processed); + }); } logger.info(`Discovered ${visited.size} pages by crawling`); @@ -364,13 +373,11 @@ function filterUrls(urls: string[], baseUrl: string): string[] { const filtered: string[] = []; for (const url of urls) { - const normalized = normalizeUrl(url); - const canonical = canonicalizeUrl(normalized, baseUrl); - if (!isValidUrl(canonical, base)) continue; - if (!isBlogPostPath(canonical, baseUrl)) continue; - if (seen.has(canonical)) continue; - seen.add(canonical); - filtered.push(canonical); + const processed = processUrl(url, base, baseUrl); + if (!processed || !isBlogPostPath(processed, baseUrl)) continue; + if (seen.has(processed)) continue; + seen.add(processed); + filtered.push(processed); } return filtered.sort(); @@ -456,25 +463,6 @@ async function fetchHtml(url: string): Promise { return null; } -function extractLinks(html: string, baseUrl: string): string[] { - const $ = load(html); - const links = new Set(); - - $('a[href], link[href]').each((_, element) => { - const href = $(element).attr('href'); - if (!href) return; - - try { - const absoluteUrl = new URL(href, baseUrl).toString(); - links.add(normalizeUrl(absoluteUrl)); - } catch { - return; - } - }); - - return Array.from(links); -} - function extractContent( html: string, url: string, @@ -493,60 +481,56 @@ function extractContent( 'script, style, noscript, nav, header, footer, aside, img, svg, iframe', ).remove(); - $('*').each((_, element) => { - const node = $(element); - const id = node.attr('id')?.toLowerCase() ?? ''; - const className = (node.attr('class') ?? '').toLowerCase(); - - if (BOILERPLATE_KEYWORDS.some((keyword) => id.includes(keyword))) { - node.remove(); - return; - } + // Remove boilerplate elements (but never remove html, head, or body) + $('*') + .not('html, head, body') + .each((_, element) => { + const node = $(element); + const idClass = `${node.attr('id') ?? ''} ${node.attr('class') ?? ''}`.toLowerCase(); + if (BOILERPLATE_KEYWORDS.some((keyword) => idClass.includes(keyword))) { + node.remove(); + } + }); - if (BOILERPLATE_KEYWORDS.some((keyword) => className.includes(keyword))) { - node.remove(); - } - }); + let mainContent: Cheerio | null = null; - let mainContent: Cheerio | null = null; + // Try main content selectors first for (const selector of MAIN_CONTENT_SELECTORS) { const element = $(selector).first(); - if (element.length === 0) continue; - const textLength = element.text().trim().length; - if (textLength > 100) { + if (element.length && element.text().trim().length > 100) { mainContent = element; break; } } - if (!mainContent || mainContent.length === 0) { + // Fallback: find largest content div (excluding nav/sidebar/etc) + if (!mainContent) { let bestDiv = null; let bestLength = 0; $('div').each((_, element) => { const node = $(element); const textLength = node.text().trim().length; if (textLength < 200) return; - const className = (node.attr('class') ?? '').toLowerCase(); - const id = (node.attr('id') ?? '').toLowerCase(); + + const idClass = `${node.attr('id') ?? ''} ${node.attr('class') ?? ''}`.toLowerCase(); if ( - ['nav', 'menu', 'sidebar', 'header', 'footer'].some( - (kw) => className.includes(kw) || id.includes(kw), + ['nav', 'menu', 'sidebar', 'header', 'footer'].some((kw) => + idClass.includes(kw), ) ) { return; } + if (textLength > bestLength) { bestLength = textLength; bestDiv = node; } }); - mainContent = bestDiv ?? null; + mainContent = bestDiv; } - if (!mainContent || mainContent.length === 0) { - const body = $('body').first(); - mainContent = body.length > 0 ? body : null; - } + // Last resort: use body + mainContent = mainContent ?? $('body').first(); const htmlFragment = mainContent ? ($.html(mainContent) ?? '') : ''; const markdown = NodeHtmlMarkdown.translate(htmlFragment); @@ -564,25 +548,19 @@ function extractContent( function extractPublishedYearFromDom( $: ReturnType, ): number | null { + // Try meta tags for (const selector of PUBLISHED_META_SELECTORS) { - const content = $(selector).attr('content'); - const year = parseYear(content); + const year = parseYear($(selector).attr('content')); if (year) return year; } - const timeElement = $('time[datetime]').first(); - const timeAttr = timeElement.attr('datetime'); - const timeYear = parseYear(timeAttr); + // Try