diff --git a/config.js b/config.js index e2724f2..92b7e02 100644 --- a/config.js +++ b/config.js @@ -8,7 +8,6 @@ config = { logger_level: envs.LOGGER_LEVEL || 'info', no_logfiles: envs.NO_LOGFILES, is_package: envs.IS_PACKAGE, - rsshub_parser: envs.RSSHUB_PARSER || 'rssaid', reply_received: envs.REPLY_RECEIVED || "false", valid_username: envs.VALID_USERNAME, unsubscribe_db_path: envs.UNSUB_DB_PATH || 'db/unsubscribe.json', diff --git a/package-lock.json b/package-lock.json index 01e062e..ed7b44b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "rss-telegram-bot", - "version": "1.6.7", + "version": "1.7", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "rss-telegram-bot", - "version": "1.6.7", + "version": "1.7", "license": "MIT", "dependencies": { "const": "^1.0.0", @@ -17,6 +17,7 @@ "node-telegram-bot-api": "^0.56.0", "psl": "^1.8.0", "route-recognizer": "^0.3.4", + "tldts": "^6.1.11", "winston": "^3.6.0" } }, @@ -1969,6 +1970,22 @@ "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz", "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==" }, + "node_modules/tldts": { + "version": "6.1.16", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-6.1.16.tgz", + "integrity": "sha512-X6VrQzW4RymhI1kBRvrWzYlRLXTftZpi7/s/9ZlDILA04yM2lNX7mBvkzDib9L4uSymHt8mBbeaielZMdsAkfQ==", + "dependencies": { + "tldts-core": "^6.1.16" + }, + "bin": { + "tldts": "bin/cli.js" + } + }, + "node_modules/tldts-core": { + "version": "6.1.16", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.16.tgz", + "integrity": "sha512-rxnuCux+zn3hMF57nBzr1m1qGZH7Od2ErbDZjVm04fk76cEynTg3zqvHjx5BsBl8lvRTjpzIhsEGMHDH/Hr2Vw==" + }, "node_modules/tough-cookie": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.0.0.tgz", @@ -3773,6 +3790,19 @@ "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz", "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==" }, + "tldts": { + "version": "6.1.16", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-6.1.16.tgz", + "integrity": "sha512-X6VrQzW4RymhI1kBRvrWzYlRLXTftZpi7/s/9ZlDILA04yM2lNX7mBvkzDib9L4uSymHt8mBbeaielZMdsAkfQ==", + "requires": { + "tldts-core": "^6.1.16" + } + }, + "tldts-core": { + "version": "6.1.16", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.16.tgz", + "integrity": "sha512-rxnuCux+zn3hMF57nBzr1m1qGZH7Od2ErbDZjVm04fk76cEynTg3zqvHjx5BsBl8lvRTjpzIhsEGMHDH/Hr2Vw==" + }, "tough-cookie": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.0.0.tgz", diff --git a/package.json b/package.json index f99adc4..a173b1d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "rss-telegram-bot", - "version": "1.6.8", + "version": "1.7", "description": "RSSBot 是一个能帮你订阅 RSSHub 的 Telegram Bot", "main": "index.js", "scripts": { @@ -29,6 +29,7 @@ "node-telegram-bot-api": "^0.56.0", "psl": "^1.8.0", "route-recognizer": "^0.3.4", + "tldts": "^6.1.11", "winston": "^3.6.0" } } diff --git a/rsshub/index.js b/rsshub/index.js index 89de631..b7f2c19 100644 --- a/rsshub/index.js +++ b/rsshub/index.js @@ -1,27 +1,9 @@ const got = require('../utils/got'); const logger = require('../utils/logger'); const config = require('../config') -const { getPageRSSHub: rssaidGet } = require('./rule-driver/rssaid'); -const { getPageRSSHub: radarGet } = require('./rule-driver/radar'); +const { getPageRSSHub } = require('./rsshub'); const { getRules } = require("./rules"); -async function getPageRSSHub(data) { - if (config.rsshub_parser === 'radar') { - return await radarGet({ - url: data.url, - html: data.html, - rules: data.rules - }); - } - return JSON.parse(await rssaidGet({ - url: data.url, - host: data.host, - path: data.pathname, - html: data.html, - rules: data.rules - })); -} - async function getRSSHubLink(url) { const { host, pathname } = new URL(url); const rules = await getRules(); @@ -32,7 +14,7 @@ async function getRSSHubLink(url) { } catch (e) { logger.warn(`Cannot get html from ${url}`); } - const feeds = await getPageRSSHub( + const feeds = getPageRSSHub( { url, host, pathname, html, rules } ); for (let feed of feeds) { diff --git a/rsshub/rsshub.js b/rsshub/rsshub.js new file mode 100644 index 0000000..6987279 --- /dev/null +++ b/rsshub/rsshub.js @@ -0,0 +1,222 @@ +// Generate: npx tsc src/lib/rsshub.ts --target es5 +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.getWebsiteRSSHub = exports.getPageRSSHub = void 0; +var route_recognizer_1 = require("route-recognizer"); +var tldts_1 = require("tldts"); +var rules_1 = require("./rules"); +function ruleHandler(rule, params, url, html, success, fail) { + var run = function () { + var _a; + var resultWithParams; + if (typeof rule.target === "function") { + try { + resultWithParams = rule.target(params, url); + } + catch (error) { + resultWithParams = ""; + } + } + else if (typeof rule.target === "string") { + resultWithParams = rule.target; + } + if (resultWithParams) { + // if no :param in resultWithParams, requiredParams will be null + // in that case, just skip the following steps and return resultWithParams + var requiredParams = (_a = resultWithParams + .match(/\/:\w+\??(?=\/|$)/g)) === null || _a === void 0 ? void 0 : _a.map(function (param) { return ({ + name: param.slice(2).replace(/\?$/, ""), + optional: param.endsWith("?"), + }); }); + if (!requiredParams) { + return resultWithParams; + } + for (var _i = 0, requiredParams_1 = requiredParams; _i < requiredParams_1.length; _i++) { + var param = requiredParams_1[_i]; + if (params[param.name]) { + // successfully matched + var regex = new RegExp("/:".concat(param.name, "\\??(?=/|$)")); + resultWithParams = resultWithParams.replace(regex, "/".concat(params[param.name])); + } + else if (param.optional) { + // missing optional parameter, drop all following parameters, otherwise the route will be invalid + var regex = new RegExp("/:".concat(param.name, "\\?(/.*)?$")); + resultWithParams = resultWithParams.replace(regex, ""); + break; + } + else { + // missing necessary parameter, fail + resultWithParams = ""; + break; + } + } + // bypassing double-check since `:` maybe a part of parameter value + // if (resultWithParams && resultWithParams.includes(':')) { + // // double-check + // resultWithParams = ''; + // } + } + return resultWithParams; + }; + var resultWithParams = run(); + if (resultWithParams) { + success(resultWithParams); + } + else { + fail(); + } +} +function formatBlank(str1, str2) { + if (str1 && str2) { + return (str1 + + (str1[str1.length - 1].match(/[a-zA-Z0-9]/) || + str2[0].match(/[a-zA-Z0-9]/) + ? " " + : "") + + str2); + } + else { + return (str1 || "") + (str2 || ""); + } +} +function getPageRSSHub(data) { + var url = data.url, html = data.html; + var rules = (0, rules_1.parseRules)(data.rules); + var parsedDomain; + try { + parsedDomain = (0, tldts_1.parse)(new URL(url).hostname); + } + catch (error) { + return []; + } + if (parsedDomain && parsedDomain.domain) { + var subdomain = parsedDomain.subdomain; + var domain_1 = parsedDomain.domain; + if (rules[domain_1]) { + var rule_1 = rules[domain_1][subdomain || "."]; + if (!rule_1) { + if (subdomain === "www") { + rule_1 = rules[domain_1]["."]; + } + else if (!subdomain) { + rule_1 = rules[domain_1].www; + } + } + if (rule_1) { + var recognized_1 = []; + rule_1.forEach(function (ru, index) { + var oriSources = Object.prototype.toString.call(ru.source) === "[object Array]" + ? ru.source + : typeof ru.source === "string" + ? [ru.source] + : []; + var sources = []; + // route-recognizer do not support optional segments or partial matching + // thus, we need to manually handle it + // allowing partial matching is necessary, since many rule authors did not mark optional segments + oriSources.forEach(function (source) { + // trimming `?` is necessary, since route-recognizer considers it as a part of segment + source = source.replace(/(\/:\w+)\?(?=\/|$)/g, "$1"); + sources.push(source); + var tailMatch; + do { + tailMatch = source.match(/\/:\w+$/); + if (tailMatch) { + var tail = tailMatch[0]; + source = source.slice(0, source.length - tail.length); + sources.push(source); + } + } while (tailMatch); + }); + // deduplicate (some rule authors may already have done similar job) + sources = sources.filter(function (item, index) { return sources.indexOf(item) === index; }); + // match! + sources.forEach(function (source) { + var router = new route_recognizer_1(); + router.add([ + { + path: source, + handler: index, + }, + ]); + var result = router.recognize(new URL(url).pathname.replace(/\/$/, "")); + if (result && result[0]) { + recognized_1.push(result[0]); + } + }); + }); + var result_1 = []; + Promise.all(recognized_1.map(function (recog) { + return new Promise(function (resolve) { + ruleHandler(rule_1[recog.handler], recog.params, url, html, function (parsed) { + if (parsed) { + result_1.push({ + title: formatBlank(rules[domain_1]._name ? "Current" : "", rule_1[recog.handler].title), + url: "{rsshubDomain}" + parsed, + path: parsed, + }); + } + else { + result_1.push({ + title: formatBlank(rules[domain_1]._name ? "Current" : "", rule_1[recog.handler].title), + url: rule_1[recog.handler].docs, + isDocs: true, + }); + } + resolve(); + }, function () { + resolve(); + }); + }); + })); + return result_1; + } + else { + return []; + } + } + else { + return []; + } + } + else { + return []; + } +} +exports.getPageRSSHub = getPageRSSHub; +function getWebsiteRSSHub(data) { + var url = data.url; + var rules = (0, rules_1.parseRules)(data.rules); + var parsedDomain; + try { + parsedDomain = (0, tldts_1.parse)(new URL(url).hostname); + } + catch (error) { + return []; + } + if (parsedDomain && parsedDomain.domain) { + var domain_2 = parsedDomain.domain; + if (rules[domain_2]) { + var domainRules = []; + for (var subdomainRules in rules[domain_2]) { + if (subdomainRules[0] !== "_") { + domainRules.push.apply(domainRules, rules[domain_2][subdomainRules]); + } + } + return domainRules.map(function (rule) { + return ({ + title: formatBlank(rules[domain_2]._name, rule.title), + url: rule.docs, + isDocs: true, + }); + }); + } + else { + return []; + } + } + else { + return []; + } +} +exports.getWebsiteRSSHub = getWebsiteRSSHub; diff --git a/rsshub/rule-driver/radar.js b/rsshub/rule-driver/radar.js deleted file mode 100644 index 4cd45c3..0000000 --- a/rsshub/rule-driver/radar.js +++ /dev/null @@ -1,189 +0,0 @@ -//import psl from 'psl'; -//import RouteRecognizer from 'route-recognizer'; -const psl = require('psl'); -const RouteRecognizer = require('route-recognizer'); -const jsdom = require('jsdom'); - -function ruleHandler(rule, params, url, html, success, fail) { - const run = () => { - let reaultWithParams; - if (typeof rule.target === 'function') { - const parser = new jsdom.JSDOM(html); - const document = parser.window.document; - try { - reaultWithParams = rule.target(params, url, document); - } catch (error) { - console.warn(error); - reaultWithParams = ''; - } - } else if (typeof rule.target === 'string') { - reaultWithParams = rule.target; - } - - if (reaultWithParams) { - for (const param in params) { - reaultWithParams = reaultWithParams.replace(`/:${param}`, `/${params[param]}`); - } - } - - return reaultWithParams; - }; - const reaultWithParams = run(); - if (reaultWithParams && (!rule.verification || rule.verification(params))) { - success(reaultWithParams); - } else { - fail(); - } -} - -function formatBlank(str1, str2) { - if (str1 && str2) { - return str1 + (str1[str1.length - 1].match(/[a-zA-Z0-9]/) || str2[0].match(/[a-zA-Z0-9]/) ? ' ' : '') + str2; - } else { - return (str1 || '') + (str2 || ''); - } -} - -function parseRules(rules) { - return typeof rules === 'string' ? window['lave'.split('').reverse().join('')](rules) : rules; -} - -function getPageRSSHub(data) { - const { url, html } = data; - const rules = parseRules(data.rules); - - const parsedDomain = psl.parse(new URL(url).hostname); - if (parsedDomain && parsedDomain.domain) { - const subdomain = parsedDomain.subdomain; - const domain = parsedDomain.domain; - if (rules[domain]) { - let rule = rules[domain][subdomain || '.']; - if (!rule) { - if (subdomain === 'www') { - rule = rules[domain]['.']; - } else if (!subdomain) { - rule = rules[domain].www; - } - } - if (rule) { - const recognized = []; - rule.forEach((ru, index) => { - if (ru.source !== undefined) { - if (Object.prototype.toString.call(ru.source) === '[object Array]') { - ru.source.forEach((source) => { - const router = new RouteRecognizer(); - router.add([ - { - path: source, - handler: index, - }, - ]); - const result = router.recognize(new URL(url).pathname.replace(/\/$/, '')); - if (result && result[0]) { - recognized.push(result[0]); - } - }); - } else if (typeof ru.source === 'string') { - const router = new RouteRecognizer(); - router.add([ - { - path: ru.source, - handler: index, - }, - ]); - const result = router.recognize(new URL(url).pathname.replace(/\/$/, '')); - if (result && result[0]) { - recognized.push(result[0]); - } - } - } - }); - const result = []; - Promise.all( - recognized.map( - (recog) => - new Promise((resolve) => { - ruleHandler( - rule[recog.handler], - recog.params, - url, - html, - (parsed) => { - if (parsed) { - result.push({ - title: formatBlank(rules[domain]._name ? '当前' : '', rule[recog.handler].title), - url: '{rsshubDomain}' + parsed, - path: parsed, - }); - } else { - result.push({ - title: formatBlank(rules[domain]._name ? '当前' : '', rule[recog.handler].title), - url: rule[recog.handler].docs, - isDocs: true, - }); - } - resolve(); - }, - () => { - resolve(); - } - ); - }) - ) - ); - return result; - } else { - return []; - } - } else { - return []; - } - } else { - return []; - } -} - -function getWebsiteRSSHub(data) { - const { url } = data; - const rules = parseRules(data.rules); - const parsedDomain = psl.parse(new URL(url).hostname); - if (parsedDomain && parsedDomain.domain) { - const domain = parsedDomain.domain; - if (rules[domain]) { - const domainRules = []; - for (const subdomainRules in rules[domain]) { - if (subdomainRules[0] !== '_') { - domainRules.push(...rules[domain][subdomainRules]); - } - } - return domainRules.map((rule) => ({ - title: formatBlank(rules[domain]._name, rule.title), - url: rule.docs, - isDocs: true, - })); - } else { - return []; - } - } else { - return []; - } -} - -function getList(data) { - const rules = parseRules(data.rules); - for (const rule in rules) { - for (const subrule in rules[rule]) { - if (subrule[0] !== '_') { - rules[rule][subrule].forEach((item) => { - delete item.source; - delete item.target; - delete item.script; - delete item.verification; - }); - } - } - } - return rules; -} - -module.exports = { getPageRSSHub } \ No newline at end of file diff --git a/rsshub/rule-driver/rssaid.js b/rsshub/rule-driver/rssaid.js deleted file mode 100644 index 883ac5a..0000000 --- a/rsshub/rule-driver/rssaid.js +++ /dev/null @@ -1,229 +0,0 @@ -//import psl from 'psl'; -//import RouteRecognizer from 'route-recognizer'; -const psl = require('psl'); -const RouteRecognizer = require('route-recognizer'); -const jsdom = require('jsdom'); - -function ruleHandler(rule, params, url, html) { - // console.log("rule: "+rule+" params:"+ params +" url:"+url +" html:"+html); - const run = () => { - let reaultWithParams; - if (typeof rule.target === 'function') { - const parser = new jsdom.JSDOM(html); - const document = parser.window.document; - try { - reaultWithParams = rule.target(params, url, document); - } catch (error) { - console.log(error) - reaultWithParams = ''; - } - } else if (typeof rule.target === 'string') { - reaultWithParams = rule.target; - } - - if (reaultWithParams) { - for (const param in params) { - reaultWithParams = reaultWithParams.replace(`/:${param}`, `/${params[param]}`); - } - } - return reaultWithParams; - }; - const reaultWithParams = run(); - if (reaultWithParams && (!rule.verification || rule.verification(params))) { - return reaultWithParams; - } else { - return undefined; - } -} - -function formatBlank(str1, str2) { - if (str1 && str2) { - return str1 + (str1[str1.length - 1].match(/[a-zA-Z0-9]/) || str2[0].match(/[a-zA-Z0-9]/) ? ' ' : '') + str2; - } else { - return (str1 || '') + (str2 || ''); - } -} - -function parseRules(rules) { - return typeof rules === 'string' ? window['lave'.split('').reverse().join('')](rules) : rules; -} - -function getPageRSSHub(data) { - const { url, host, path, html } = data; - //console.log("url: "+url+" host:"+ host +" path:"+path +" html:"+html+" rules:"+data.rules); - const rules = parseRules(data.rules); - const parsedDomain = psl.parse(host); - if (parsedDomain && parsedDomain.domain) { - const subdomain = parsedDomain.subdomain; - const domain = parsedDomain.domain; - if (rules[domain]) { - let rule = rules[domain][subdomain || '.']; - if (!rule) { - if (subdomain === 'www' || subdomain === 'mobile' || subdomain === 'm') { - rule = rules[domain]['.']; - } else if (!subdomain) { - rule = rules[domain].www; - } - } - if (rule) { - const recognized = []; - rule.forEach((ru, index) => { - if (ru.source !== undefined) { - if (Object.prototype.toString.call(ru.source) === '[object Array]') { - ru.source.forEach((source) => { - const router = new RouteRecognizer(); - router.add([{ - path: source, - handler: index, - }, ]); - const result = router.recognize(path.replace(/\/$/, '')); - if (result && result[0]) { - recognized.push(result[0]); - } - }); - } else if (typeof ru.source === 'string') { - const router = new RouteRecognizer(); - router.add([{ - path: ru.source, - handler: index, - }, ]); - const result = router.recognize(path.replace(/\/$/, '')); - if (result && result[0]) { - recognized.push(result[0]); - } - } - } - }); - const result = []; - for (var i = 0; i < recognized.length; i++) { - var recog = recognized[i]; - var parsed = ruleHandler(rule[recog.handler], - recog.params, - url, - html); - if (parsed !== undefined) { - if (parsed) { - //console.log({ - // title: formatBlank(rules[domain]._name ? '当前' : '', rule[recog.handler].title), - // url: '{rsshubDomain}' + parsed, - // path: parsed, - //}) - result.push({ - title: formatBlank(rules[domain]._name ? '当前' : '', rule[recog.handler].title), - url: '{rsshubDomain}' + parsed, - path: parsed, - }); - } else { - result.push({ - title: formatBlank(rules[domain]._name ? '当前' : '', rule[recog.handler].title), - url: rule[recog.handler].docs, - isDocs: true, - }); - } - } - } - // Promise.all( - // recognized.map( - // (recog) => - // new Promise((resolve) => { - // ruleHandler( - // rule[recog.handler], - // recog.params, - // url, - // html, - // (parsed) => { - // console.log("parsed: "+parsed); - // if (parsed) { - // console.log({ - // title: formatBlank(rules[domain]._name ? '当前' : '', rule[recog.handler].title), - // url: '{rsshubDomain}' + parsed, - // path: parsed, - // }) - // result.push({ - // title: formatBlank(rules[domain]._name ? '当前' : '', rule[recog.handler].title), - // url: '{rsshubDomain}' + parsed, - // path: parsed, - // }); - // } else { - // result.push({ - // title: formatBlank(rules[domain]._name ? '当前' : '', rule[recog.handler].title), - // url: rule[recog.handler].docs, - // isDocs: true, - // }); - // } - // resolve(); - // }, - // () => { - // resolve(); - // } - // ); - // }) - // ) - // ); - return JSON.stringify(result); - } else { - return JSON.stringify([]); - } - } else { - return JSON.stringify([]); - } - } else { - return JSON.stringify([]); - - } -} - -function getWebsiteRSSHub(data) { - const { url } = data; - const rules = parseRules(data.rules); - const parsedDomain = psl.parse(new URL(url).hostname); - if (parsedDomain && parsedDomain.domain) { - const domain = parsedDomain.domain; - if (rules[domain]) { - const domainRules = []; - for (const subdomainRules in rules[domain]) { - if (subdomainRules[0] !== '_') { - domainRules.push(...rules[domain][subdomainRules]); - } - } - return domainRules.map((rule) => ({ - title: formatBlank(rules[domain]._name, rule.title), - url: rule.docs, - isDocs: true, - })); - } else { - return []; - } - } else { - return []; - } -} - -function getList(data) { - const rules = parseRules(data.rules); - for (const rule in rules) { - for (const subrule in rules[rule]) { - if (subrule[0] !== '_') { - rules[rule][subrule].forEach((item) => { - delete item.source; - delete item.target; - delete item.script; - delete item.verification; - }); - } - } - } - return rules; -} - -function decodeUtf8(bytes) { - var encoded = ""; - for (var i = 0; i < bytes.length; i++) { - encoded += '%' + bytes[i].toString(16); - } - var html = decodeURIComponent(encoded); - console.log(html); - return html; -} - -module.exports = { getPageRSSHub } \ No newline at end of file diff --git a/rsshub/rules.js b/rsshub/rules.js index e7f149c..0d4736e 100644 --- a/rsshub/rules.js +++ b/rsshub/rules.js @@ -4,6 +4,13 @@ const config = require('../config'); let rules; let last_update; +function parseRules(rules, forceJSON) { + if (typeof rules === "string") { + return JSON.parse(rules); + } + return rules; +} + async function getRules() { if (rules && last_update && (Date.now() - last_update) < config.radar_update_interval) return rules; @@ -13,9 +20,9 @@ async function getRules() { url: config.radar_url } ) - rules = eval(response.body); + rules = JSON.parse(response.body); last_update = Date.now(); return rules; } -module.exports = { getRules } \ No newline at end of file +module.exports = { getRules, parseRules } \ No newline at end of file