diff --git a/Cargo.lock b/Cargo.lock index 681344b..6dcaf15 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,15 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "adobe-cmap-parser" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3" +dependencies = [ + "pom", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -32,6 +41,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -41,12 +56,86 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = 
"anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + [[package]] name = "anyhow" version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "arc-swap" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" +dependencies = [ + "rustversion", +] + +[[package]] +name = "assert_cmd" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a686bbee5efb88a82df0621b236e74d925f470e5445d3220a5648b892ec99c9" +dependencies = [ + "anstyle", + "bstr", + "libc", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", +] + [[package]] name = "async-compression" version = "0.4.41" @@ -168,6 +257,24 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" +[[package]] +name = "bitpacking" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019" +dependencies = [ + "crunchy", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "brotli" version = "8.0.2" @@ -189,12 +296,29 @@ dependencies = [ "alloc-stdlib", ] +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.11.1" @@ -219,6 +343,12 @@ dependencies = [ "shlex", ] +[[package]] +name = "census" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" + [[package]] name = "cesu8" version = "1.1.0" @@ -251,6 +381,46 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + [[package]] name = "cmake" version = "0.1.58" @@ -260,6 +430,12 @@ dependencies = [ "cc", ] 
+[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + [[package]] name = "combine" version = "4.6.7" @@ -323,6 +499,56 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "cssparser" version = "0.36.0" @@ -380,6 +606,16 @@ dependencies = [ "syn", ] +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + [[package]] name = "derive_more" version = "2.1.1" @@ -401,6 +637,22 @@ dependencies = [ "syn", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -412,6 +664,12 @@ dependencies = [ "syn", ] +[[package]] +name = "downcast-rs" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" + [[package]] name = "dtoa" version = "1.0.11" @@ -445,6 +703,12 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b04dc5a38e4f151a79d9f2451ae6037fb6eaf5cba34771f44781f80e508498e3" +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -470,6 +734,21 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "euclid" +version = "0.20.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" +dependencies = [ + "num-traits", +] + +[[package]] +name = "fastdivide" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" + [[package]] name = "fastrand" version = "2.3.0" @@ 
-481,16 +760,22 @@ name = "ferris-search" version = "0.1.0" dependencies = [ "anyhow", + "assert_cmd", "axum", "bytes", "cargo-husky", + "clap", "lazy_static", + "pdf-extract", + "predicates", "regex", "reqwest", "rmcp", "scraper", "serde", "serde_json", + "tantivy", + "tempfile", "tokio", "tower-http", "tracing", @@ -498,6 +783,7 @@ dependencies = [ "url", "urlencoding", "uuid", + "walkdir", ] [[package]] @@ -516,6 +802,15 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + [[package]] name = "fnv" version = "1.0.7" @@ -537,6 +832,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs4" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" +dependencies = [ + "rustix 0.38.44", + "windows-sys 0.52.0", +] + [[package]] name = "fs_extra" version = "1.3.0" @@ -631,6 +936,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getopts" version = "0.2.24" @@ -705,6 +1020,8 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ + "allocator-api2", + "equivalent", "foldhash", ] @@ -720,6 +1037,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "html5ever" version = "0.39.0" @@ -730,6 +1053,12 @@ dependencies = [ "markup5ever", ] +[[package]] +name = "htmlescape" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" + [[package]] name = "http" version = "1.4.0" @@ -989,6 +1318,18 @@ dependencies = [ "serde_core", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -1005,6 +1346,21 @@ dependencies = [ "serde", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -1089,12 +1445,36 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "levenshtein_automata" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" + [[package]] name = "libc" version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +[[package]] 
+name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "litemap" version = "0.8.1" @@ -1116,12 +1496,45 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lopdf" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +dependencies = [ + "encoding_rs", + "flate2", + "indexmap", + "itoa", + "log", + "md-5", + "nom", + "rangemap", + "time", + "weezl", +] + +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.5", +] + [[package]] name = "lru-slab" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" + [[package]] name = "markup5ever" version = "0.39.0" @@ -1148,18 +1561,53 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "measure_time" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc" +dependencies = [ + "instant", + "log", +] + [[package]] name = "memchr" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + [[package]] name = "mime" version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1181,12 +1629,34 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "murmurhash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" + [[package]] name = "new_debug_unreachable" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -1196,6 +1666,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-conv" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" + [[package]] name = "num-traits" version = "0.2.19" @@ -1203,6 +1679,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", ] [[package]] @@ -1211,12 +1698,33 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "oneshot" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" + [[package]] name = "openssl-probe" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "ownedbytes" 
+version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "parking_lot" version = "0.12.5" @@ -1246,6 +1754,21 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" +[[package]] +name = "pdf-extract" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575" +dependencies = [ + "adobe-cmap-parser", + "encoding_rs", + "euclid", + "lopdf", + "postscript", + "type1-encoding-parser", + "unicode-normalization", +] + [[package]] name = "percent-encoding" version = "2.3.2" @@ -1317,6 +1840,24 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "pom" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" + +[[package]] +name = "postscript" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" + [[package]] name = "potential_utf" version = "0.1.4" @@ -1326,6 +1867,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.21" @@ 
-1341,6 +1888,36 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "predicates" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ada8f2932f28a27ee7b70dd6c1c39ea0675c55a36879ab92f3a715eaa1e63cfe" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cad38746f3166b4031b1a0d39ad9f954dd291e7854fcc0eed52ee41a0b50d144" + +[[package]] +name = "predicates-tree" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0de1b847b39c8131db0467e9df1ff60e6d0562ab8e9a16e568ad0fdb372e2f2" +dependencies = [ + "predicates-core", + "termtree", +] + [[package]] name = "prettyplease" version = "0.2.37" @@ -1371,7 +1948,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.1.2", "rustls", "socket2", "thiserror 2.0.18", @@ -1390,9 +1967,9 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand", + "rand 0.9.2", "ring", - "rustc-hash", + "rustc-hash 2.1.2", "rustls", "rustls-pki-types", "slab", @@ -1437,14 +2014,35 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ - "rand_chacha", - "rand_core", + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", ] [[package]] @@ -1454,7 +2052,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", ] [[package]] @@ -1466,6 +2073,42 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "rangemap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ 
-1614,6 +2257,22 @@ dependencies = [ "syn", ] +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.2" @@ -1629,6 +2288,32 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.12.1", + "windows-sys 0.61.2", +] + [[package]] name = "rustls" version = "0.23.37" @@ -1818,7 +2503,7 @@ dependencies = [ "phf", "phf_codegen", "precomputed-hash", - "rustc-hash", + "rustc-hash 2.1.2", "servo_arc", "smallvec", ] @@ -1952,6 +2637,15 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "sketches-ddsketch" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.12" @@ -2068,6 +2762,160 @@ dependencies = [ "libc", ] +[[package]] +name = "tantivy" +version = "0.22.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "96599ea6fccd844fc833fed21d2eecac2e6a7c1afd9e044057391d78b1feb141" +dependencies = [ + "aho-corasick", + "arc-swap", + "base64", + "bitpacking", + "byteorder", + "census", + "crc32fast", + "crossbeam-channel", + "downcast-rs", + "fastdivide", + "fnv", + "fs4", + "htmlescape", + "itertools", + "levenshtein_automata", + "log", + "lru", + "lz4_flex", + "measure_time", + "memmap2", + "num_cpus", + "once_cell", + "oneshot", + "rayon", + "regex", + "rust-stemmers", + "rustc-hash 1.1.0", + "serde", + "serde_json", + "sketches-ddsketch", + "smallvec", + "tantivy-bitpacker", + "tantivy-columnar", + "tantivy-common", + "tantivy-fst", + "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", + "tempfile", + "thiserror 1.0.69", + "time", + "uuid", + "winapi", +] + +[[package]] +name = "tantivy-bitpacker" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e" +dependencies = [ + "downcast-rs", + "fastdivide", + "itertools", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] + +[[package]] +name = "tantivy-common" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4" +dependencies = [ + "async-trait", + "byteorder", + "ownedbytes", + "serde", + "time", +] + +[[package]] +name = "tantivy-fst" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" +dependencies = [ + "byteorder", + 
"regex-syntax", + "utf8-ranges", +] + +[[package]] +name = "tantivy-query-grammar" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82" +dependencies = [ + "nom", +] + +[[package]] +name = "tantivy-sstable" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e" +dependencies = [ + "tantivy-bitpacker", + "tantivy-common", + "tantivy-fst", + "zstd", +] + +[[package]] +name = "tantivy-stacker" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8" +dependencies = [ + "murmurhash32", + "rand_distr", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04" +dependencies = [ + "serde", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix 1.1.4", + "windows-sys 0.61.2", +] + [[package]] name = "tendril" version = "0.5.0" @@ -2078,6 +2926,12 @@ dependencies = [ "utf-8", ] +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + [[package]] name = "thiserror" version = "1.0.69" @@ -2127,6 +2981,37 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -2322,12 +3207,36 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "type1-encoding-parser" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749" +dependencies = [ + "pom", +] + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-width" version = "0.2.2" @@ -2370,12 +3279,24 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" 
+[[package]] +name = "utf8-ranges" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" + [[package]] name = "utf8_iter" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.23.0" @@ -2384,6 +3305,7 @@ checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" dependencies = [ "getrandom 0.4.2", "js-sys", + "serde_core", "wasm-bindgen", ] @@ -2393,6 +3315,21 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" @@ -2566,6 +3503,28 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + 
"winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" @@ -2575,6 +3534,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" @@ -2995,3 +3960,31 @@ name = "zmij" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index 738a713..4813da3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,16 @@ tokio = { version = "1.50", features = ["full"] } rmcp = { version = "1.3", features = ["server", "transport-io", "macros"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" -reqwest = { version = "0.13", features = ["json", "gzip", "deflate", "brotli", "socks", "form", "query", "http2"] } +reqwest 
= { version = "0.13", features = [ + "json", + "gzip", + "deflate", + "brotli", + "socks", + "form", + "query", + "http2", +] } scraper = "0.26" axum = { version = "0.8", features = ["http2"] } tower-http = { version = "0.6", features = ["cors"] } @@ -23,6 +32,10 @@ uuid = { version = "1", features = ["v4"] } lazy_static = "1" urlencoding = "2.1" regex = "1" +clap = { version = "4", features = ["derive"] } +tantivy = "0.22" +walkdir = "2" +pdf-extract = "0.7" [profile.release] opt-level = 3 @@ -30,8 +43,17 @@ lto = true codegen-units = 1 strip = true +[dev-dependencies] +assert_cmd = "2" +predicates = "3" +tempfile = "3" + [target.'cfg(target_os = "linux")'.dev-dependencies] -cargo-husky = { version = "1", default-features = false, features = ["user-hooks"] } +cargo-husky = { version = "1", default-features = false, features = [ + "user-hooks", +] } [target.'cfg(target_os = "macos")'.dev-dependencies] -cargo-husky = { version = "1", default-features = false, features = ["user-hooks"] } +cargo-husky = { version = "1", default-features = false, features = [ + "user-hooks", +] } diff --git a/README.md b/README.md index 591a7a2..0421975 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # ferris-search πŸ¦€ -> A blazing-fast MCP server for multi-engine web search, written in Rust. +> A blazing-fast multi-engine web search CLI & MCP server, written in Rust. ## Why ferris-search? @@ -9,6 +9,7 @@ Claude Code's built-in web search works great in ideal network conditions β€” bu While looking for a workaround, I came across [open-webSearch](https://github.com/Aas-ee/open-webSearch), a Node.js MCP server that routes search queries through multiple engines. It solved the problem well. But I have a thing for Rust β€” and spinning up a Node.js runtime just to proxy a few HTTP requests felt heavier than it needed to be. 
So I rewrote the same idea in Rust: + - **No Node.js runtime** β€” single self-contained binary, ~8 MB - **Lower latency** β€” Rust async I/O, concurrent fan-out across engines - **Smaller footprint** β€” negligible memory usage @@ -21,6 +22,7 @@ If Claude Code's search isn't working in your environment, this is for you. ferris-search is also a good foundation for **enterprise internal search** scenarios. Since it's a standard MCP server written in Rust, you can fork it and add custom search engines that connect to your internal knowledge bases β€” Confluence, Notion, internal wikis, code repositories, or proprietary document stores. Some ideas: + - Add an engine that searches your internal Elasticsearch or OpenSearch cluster - Integrate with your company's Confluence or GitLab search API - Connect to a private RAG (Retrieval-Augmented Generation) service @@ -30,12 +32,15 @@ With Claude Code as the AI layer and ferris-search as the search backbone, your ## Features +- **CLI + MCP dual mode** β€” use directly in the terminal or as an MCP server for Claude Desktop / Cursor / Claude Code - **Multi-engine fan-out** β€” search across multiple engines simultaneously with a single call - **14 search engines** β€” Bing, DuckDuckGo, Brave, Baidu, CSDN, Juejin, Exa, Firecrawl, Zhihu, LinuxDo, Jina, Tavily, GitHub (repo search), GitHub Code (code search) - **7 MCP tools** β€” `web_search` + 6 article/content fetchers - **No API keys required** for most engines (Brave, Exa, Firecrawl, Jina, and Tavily require API keys) - **Single binary** β€” ~8 MB, no runtime dependencies - **Proxy support** β€” HTTP/SOCKS5 proxy via env var +- **Text & JSON output** β€” `--format text` (default) or `--format json` for scripting +- **Local document indexing** β€” full-text search over local Markdown, Text, HTML, and PDF files via Tantivy ## Quick Install @@ -52,6 +57,7 @@ bash install.sh ``` The script will: + 1. Build and install the binary via `cargo install` 2. 
Register the MCP server with Claude Code (`claude mcp add -s user`) if the CLI is found 3. Install Claude Code skills for ferris-search @@ -90,6 +96,73 @@ which ferris-search echo "$(cargo home 2>/dev/null || echo $HOME/.cargo)/bin/ferris-search" ``` +### CLI Usage + +Once installed, you can use ferris-search directly from the terminal: + +**Search the web:** + +```bash +# Single engine (uses default engine from env, or bing) +ferris-search search "rust async runtime" + +# Specify engine(s) +ferris-search search "rust async runtime" --engine bing +ferris-search search "rust async runtime" --engine bing,duckduckgo +ferris-search search "rust async runtime" -e bing -e duckduckgo + +# Limit results and output JSON +ferris-search search "rust async runtime" -e bing --limit 3 --format json +``` + +**Fetch web content:** + +```bash +# Auto-detects the best fetcher based on URL +ferris-search fetch https://github.com/nickel-org/nickel.rs +ferris-search fetch https://example.com --max-chars 5000 +ferris-search fetch https://example.com --format json +``` + +Supported domains with specialized extraction: `github.com` (README), `csdn.net`, `juejin.cn`, `zhihu.com`, `linux.do`. 
+ +**List engines & show config:** + +```bash +ferris-search list-engines +ferris-search list-engines --format json + +ferris-search show-config +ferris-search show-config --format json +``` + +**Start MCP server explicitly:** + +```bash +ferris-search mcp +``` + +**Index and search local documents:** + +```bash +# Build a full-text index from a directory (supports md, txt, html, pdf) +ferris-search index-local --path ./docs +ferris-search index-local --path ./docs --path ./notes + +# Search the local index +ferris-search search-local "async runtime" +ferris-search search-local "async runtime" --limit 5 --format json + +# Use a custom index directory +ferris-search index-local --path ./docs --index-path ./my-index +ferris-search search-local "query" --index-path ./my-index +``` + +> **PDF note:** PDF indexing extracts text using the PDF's text layer. Image-only or scanned PDFs without embedded text are not OCRed and will usually produce poor or empty results. +> **Note:** When stdin is not a TTY (e.g., piped by Claude Desktop), ferris-search automatically enters MCP mode β€” no configuration change needed. +> +> **Clarification:** MCP is not deprecated in ferris-search. The project now supports both CLI and MCP. What changed is that the old transport-selection env vars (`MODE`, `ENABLE_HTTP_SERVER`) were removed in favor of stdio MCP mode plus automatic TTY detection. + ### Claude Desktop / Cursor configuration ```json @@ -107,6 +180,8 @@ echo "$(cargo home 2>/dev/null || echo $HOME/.cargo)/bin/ferris-search" Replace the path with the output of `which ferris-search`. +> The binary auto-detects piped stdin and enters MCP mode, so existing configurations continue to work without adding the `mcp` subcommand. + ### Claude Code (claude mcp add) Add for the current project only: @@ -154,11 +229,11 @@ Search the web using one or more engines simultaneously. 
} ``` -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `query` | string | required | Search query | -| `engines` | string[] | server default | Engines to search (fan-out if multiple) | -| `limit` | number | 10 | Max results per engine (1–50) | +| Parameter | Type | Default | Description | +| --------- | -------- | -------------- | ---------------------------------------- | +| `query` | string | required | Search query | +| `engines` | string[] | server default | Engines to search (fan-out if multiple) | +| `limit` | number | 10 | Max results per engine (1–50) | Supported engines: `bing`, `duckduckgo`, `brave`, `baidu`, `csdn`, `juejin`, `exa`, `firecrawl`, `zhihu`, `linuxdo`, `jina`, `tavily`, `github`, `github_code` @@ -166,53 +241,54 @@ Supported engines: `bing`, `duckduckgo`, `brave`, `baidu`, `csdn`, `juejin`, `ex Fetch and extract text content from any public URL. -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `url` | string | required | Public HTTP/HTTPS URL | -| `max_chars` | number | 30000 | Max characters to return | +| Parameter | Type | Default | Description | +| ----------- | ------ | -------- | ------------------------ | +| `url` | string | required | Public HTTP/HTTPS URL | +| `max_chars` | number | 30000 | Max characters to return | ### `fetch_github_readme` Fetch the README from a GitHub repository. -| Parameter | Type | Description | -|-----------|------|-------------| -| `url` | string | GitHub repository URL | +| Parameter | Type | Description | +| --------- | ------ | --------------------- | +| `url` | string | GitHub repository URL | ### `fetch_csdn_article` / `fetch_juejin_article` / `fetch_zhihu_article` / `fetch_linuxdo_article` Domain-specific fetchers with better content extraction than the generic `fetch_web_content`. 
-| Tool | URL Constraint | -|------|----------------| -| `fetch_csdn_article` | must contain `csdn.net` | -| `fetch_juejin_article` | must contain `juejin.cn` and `/post/` | -| `fetch_zhihu_article` | must contain `zhihu.com` | +| Tool | URL Constraint | +| ----------------------- | ------------------------------------- | +| `fetch_csdn_article` | must contain `csdn.net` | +| `fetch_juejin_article` | must contain `juejin.cn` and `/post/` | +| `fetch_zhihu_article` | must contain `zhihu.com` | | `fetch_linuxdo_article` | must contain `linux.do` and `/topic/` | ## Configuration All configuration is done via environment variables. -| Env Var | Default | Description | -|---------|---------|-------------| -| `DEFAULT_SEARCH_ENGINE` | `bing` | Engine used when `engines` param is omitted | -| `ALLOWED_SEARCH_ENGINES` | all engines | Comma-separated allow-list | -| `BRAVE_API_KEY` | β€” | Required for `brave` engine | -| `EXA_API_KEY` | β€” | Required for `exa` engine | -| `FIRECRAWL_API_KEY` | β€” | Required for `firecrawl` engine | -| `JINA_API_KEY` | β€” | Required for `jina` engine | -| `TAVILY_API_KEY` | β€” | Required for `tavily` engine | -| `GITHUB_TOKEN` | β€” | Optional for `github` / `github_code` engines (raises rate limit from 60 to 5000 req/hr) | -| `USE_PROXY` | `false` | Enable HTTP/SOCKS5 proxy | -| `PROXY_URL` | `http://127.0.0.1:7890` | Proxy address | -| `ENABLE_HTTP_SERVER` | `false` | Enable HTTP/SSE transport alongside stdio | -| `MODE` | `stdio` | Transport mode: `stdio`, `http`, or `both` | -| `RUST_LOG` | `info` | Log level: `debug`, `info`, `warn`, `error` | +| Env Var | Default | Description | +| ------------------------ | ------------------------------ | ----------------------------------------------------------------------------------------- | +| `DEFAULT_SEARCH_ENGINE` | `bing` | Engine used when `engines` param is omitted | +| `ALLOWED_SEARCH_ENGINES` | all engines | Comma-separated allow-list | +| `BRAVE_API_KEY` | β€” | Required for 
`brave` engine | +| `EXA_API_KEY` | β€” | Required for `exa` engine | +| `FIRECRAWL_API_KEY` | β€” | Required for `firecrawl` engine | +| `JINA_API_KEY` | β€” | Required for `jina` engine | +| `TAVILY_API_KEY` | β€” | Required for `tavily` engine | +| `GITHUB_TOKEN` | β€” | Optional for `github` / `github_code` engines (raises rate limit from 60 to 5000 req/hr) | +| `USE_PROXY` | `false` | Enable HTTP/SOCKS5 proxy | +| `PROXY_URL` | `http://127.0.0.1:7890` | Proxy address | +| `RUST_LOG` | `info` | Log level: `debug`, `info`, `warn`, `error` | +| `LOCAL_DOCS_INDEX_PATH` | `.ferris-index` | Directory for the local document index | +| `LOCAL_DOCS_EXTENSIONS` | `md,markdown,txt,html,htm,pdf` | Comma-separated file extensions to index | ### Common configurations **Privacy-focused:** + ```bash claude mcp add -s user ferris-search $(which ferris-search) \ -e DEFAULT_SEARCH_ENGINE=duckduckgo \ @@ -220,6 +296,7 @@ claude mcp add -s user ferris-search $(which ferris-search) \ ``` **Chinese developer workflow:** + ```bash claude mcp add -s user ferris-search $(which ferris-search) \ -e DEFAULT_SEARCH_ENGINE=bing \ @@ -227,6 +304,7 @@ claude mcp add -s user ferris-search $(which ferris-search) \ ``` **With Exa AI search:** + ```bash claude mcp add -s user ferris-search $(which ferris-search) \ -e DEFAULT_SEARCH_ENGINE=exa \ diff --git a/skills/ferris-search-cli/SKILL.md b/skills/ferris-search-cli/SKILL.md new file mode 100644 index 0000000..b1624e6 --- /dev/null +++ b/skills/ferris-search-cli/SKILL.md @@ -0,0 +1,131 @@ +--- +name: ferris-search-cli +description: | + CRITICAL: Use for ferris-search CLI usage. 
Triggers on: + ferris-search search, ferris-search fetch, ferris-search list-engines, + ferris-search show-config, ferris-search mcp, ferris-search CLI, + ferris-search index-local, ferris-search search-local, local document index, + local full-text search, ζœ¬εœ°η΄’εΌ•, ζœ¬εœ°ζ–‡ζ‘£ζœη΄’, ε…¨ζ–‡η΄’εΌ•, ζœ¬εœ°ζ£€η΄’, + terminal search, command line search, shell search, CLI搜紒, + ε‘½δ»€θ‘Œζœη΄’, 终端搜紒, ferris-searchε‘½δ»€θ‘Œ, ferris-search终端 +--- + +# ferris-search CLI Skill + +> **Version:** ferris-search 0.1.0 | **Last Updated:** 2026-03-31 + +Help users run `ferris-search` from the terminal. For complete parameter details, read `./references/cli-reference.md`. + +## Subcommands + +### search β€” Web search + +```bash +# Default engine +ferris-search search "rust async runtime" + +# Specify engines (comma-separated or repeated -e) +ferris-search search "rust async" -e bing,duckduckgo +ferris-search search "rust async" -e bing -e brave + +# Limit + JSON +ferris-search search "tokio tutorial" -e bing --limit 3 --format json +``` + +### fetch β€” Extract web content + +```bash +# Auto-detects fetcher by URL domain +ferris-search fetch https://github.com/tokio-rs/tokio +ferris-search fetch https://example.com --max-chars 5000 +ferris-search fetch https://example.com --format json +``` + +Specialized domains: `github.com` (repo root β†’ README), `csdn.net`, `juejin.cn/post/`, `zhihu.com`, `linux.do/topic/`. All others use generic web extraction. + +### list-engines β€” Show available engines + +```bash +ferris-search list-engines +ferris-search list-engines --format json +``` + +### show-config β€” Show effective configuration + +```bash +ferris-search show-config +ferris-search show-config --format json +``` + +API keys are masked in output (first 4 + last 4 chars shown). 
+ +### index-local β€” Build local full-text index + +```bash +# Index a directory of documents +ferris-search index-local --path ./docs + +# Index multiple paths +ferris-search index-local --path ./docs --path ./notes --path ./papers + +# Custom index location +ferris-search index-local --path ./docs --index-path ./my-index + +# JSON output +ferris-search index-local --path ./docs --format json +``` + +Supported file types (configurable via `LOCAL_DOCS_EXTENSIONS`): `.md`, `.markdown`, `.txt`, `.html`, `.htm`, `.pdf` + +PDF support is text-layer only. Image-only or scanned PDFs without embedded text are not OCRed. + +### search-local β€” Search the local document index + +```bash +# Search indexed documents +ferris-search search-local "async runtime" + +# With limit and custom index path +ferris-search search-local "error handling" --limit 5 --index-path ./my-index + +# JSON output +ferris-search search-local "design patterns" --format json +``` + +Results include title, file path, file type, relevance score, and a content snippet. + +### mcp β€” Start MCP server + +```bash +ferris-search mcp +``` + +> When stdin is piped (e.g. by Claude Desktop), `ferris-search` auto-enters MCP mode without needing `mcp`. +> +> MCP is still supported. The CLI was added alongside MCP, not as a replacement for it. 
+ +## Key Behaviors + +- **Local indexing**: Powered by [Tantivy](https://github.com/quickwit-oss/tantivy), supports full-text search across local Markdown, TXT, HTML, and text-based PDF files +- **PDF limitation**: No OCR is performed; scanned or image-only PDFs may index as empty or near-empty content +- **Output format**: `--format text` (default, human-readable) or `--format json` (machine-parseable) +- **Exit codes**: 0 = success, 1 = search/fetch failure, 2 = parameter error +- **Errors to stderr**: Warnings and failures go to stderr; results go to stdout +- **Engine resolution**: CLI `--engine` overrides env `DEFAULT_SEARCH_ENGINE`; engines are filtered by `ALLOWED_SEARCH_ENGINES` +- **TTY detection**: No subcommand + interactive terminal β†’ prints help; piped stdin β†’ MCP mode +- **MCP status**: MCP is active and supported; only the old `MODE` / `ENABLE_HTTP_SERVER` transport toggles were removed + +## Scripting Patterns + +### Pipe JSON to jq + +```bash +ferris-search search "rust" -e bing --format json | jq '.[].url' +``` + +### Multi-engine with error checking + +```bash +ferris-search search "query" -e bing,brave --format json 2>errors.log +if [ $? -ne 0 ]; then cat errors.log; fi +``` diff --git a/skills/ferris-search-cli/references/cli-reference.md b/skills/ferris-search-cli/references/cli-reference.md new file mode 100644 index 0000000..064e845 --- /dev/null +++ b/skills/ferris-search-cli/references/cli-reference.md @@ -0,0 +1,353 @@ +# CLI Reference + +## Table of Contents + +1. [search](#search) +2. [fetch](#fetch) +3. [list-engines](#list-engines) +4. [show-config](#show-config) +5. [index-local](#index-local) +6. [search-local](#search-local) +7. [mcp](#mcp) +8. [Global Behavior](#global-behavior) +9. [Supported Engines](#supported-engines) + +--- + +## search + +Search the web using one or more engines. 
+ +```text +ferris-search search <QUERY> [OPTIONS] +``` + +| Parameter | Short | Type | Default | Description | +| ---------- | ----- | ------------- | ------------------------------------- | -------------------------------------------- | +| `<QUERY>` | — | string | *required* | Search query (positional) | +| `--engine` | `-e` | string | env `DEFAULT_SEARCH_ENGINE` or `bing` | Engine name(s). Comma-separated or repeated. | +| `--limit` | `-l` | u32 | `10` | Max results per engine (1–50) | +| `--format` | `-f` | text / json | `text` | Output format | + +**Engine specification examples:** + +- `-e bing` — single engine +- `-e bing,duckduckgo` — comma-separated +- `-e bing -e brave` — repeated flag +- omitted — uses `DEFAULT_SEARCH_ENGINE` env var (default: `bing`) + +**Single-engine output (text):** + +```text +Engine: bing +Total: 3 + +1. **Title** +URL: https://... +Source: bing +Description: ... +``` + +**Multi-engine output (text):** + +```text +Total results: 8 + +## Results from bing + +1. **Title** ... + +## Results from brave + +1. **Title** ... +``` + +**JSON output:** Array of `SearchResult` objects: + +```json +[ + { + "title": "...", + "url": "...", + "description": "...", + "source": "...", + "engine": "bing" + } +] +``` + +**Error handling:** + +- Single engine failure → exit 1, error to stderr +- Multi-engine partial failure → warnings to stderr, successful results to stdout +- Multi-engine all fail → "All engines failed." to stderr, exit 1 + +--- + +## fetch + +Fetch and extract content from a URL. Auto-detects the best extraction method by domain.
+ +```text +ferris-search fetch <URL> [OPTIONS] +``` + +| Parameter | Short | Type | Default | Description | +| ------------- | ----- | ----------- | ---------- | ----------------------------------------- | +| `<URL>` | — | string | *required* | URL to fetch (positional) | +| `--max-chars` | `-m` | u32 | `30000` | Max content characters (generic web only) | +| `--format` | `-f` | text / json | `text` | Output format | + +**Domain routing:** + +| URL Pattern | Fetcher | Notes | +| ----------------------------------------------------- | ----------------------- | ----------------------------------- | +| `github.com/<owner>/<repo>` (exactly 2 path segments) | `fetch_github_readme` | Returns README.md content | +| `csdn.net` | `fetch_csdn_article` | Full article extraction | +| `juejin.cn` + `/post/` in path | `fetch_juejin_article` | Full article extraction | +| `zhihu.com` | `fetch_zhihu_article` | Full article extraction | +| `linux.do` + `/topic/` in path | `fetch_linuxdo_article` | Full topic extraction | +| Everything else | `fetch_web_content` | Generic with `max_chars` truncation | + +**Important:** GitHub URLs with deeper paths (e.g. `/issues/`, `/blob/`, `/pull/`) fall through to generic web fetch, not README fetch. Query strings and fragments (`?tab=...`, `#readme`) are stripped before routing. + +**Text output (generic web):** + +```text +Title: Page Title +URL: https://... + +Page content here... + +[Content truncated] +``` + +**JSON output (generic web):** + +```json +{ + "title": "...", + "url": "...", + "content": "...", + "truncated": true +} +``` + +**JSON output (specialized fetchers):** + +```json +{ + "url": "...", + "content": "..." +} +``` + +--- + +## list-engines + +List all 14 supported search engines with their status.
+ +```text +ferris-search list-engines [OPTIONS] +``` + +| Parameter | Short | Type | Default | Description | +| ---------- | ----- | ----------- | ------- | ------------- | +| `--format` | `-f` | text / json | `text` | Output format | + +**Text output:** + +```text +Supported engines: + + baidu (allowed) + bing (default) + brave (allowed) + ... + zhihu (disabled) +``` + +**JSON output:** + +```json +{ + "default_engine": "bing", + "allowed_engines": ["bing", "duckduckgo", "brave"], + "all_engines": ["baidu", "bing", "brave", "csdn", "duckduckgo", "exa", "firecrawl", "github", "github_code", "jina", "juejin", "linuxdo", "tavily", "zhihu"] +} +``` + +--- + +## show-config + +Show the current effective configuration. API keys are masked. + +```text +ferris-search show-config [OPTIONS] +``` + +| Parameter | Short | Type | Default | Description | +| ---------- | ----- | ----------- | ------- | ------------- | +| `--format` | `-f` | text / json | `text` | Output format | + +**Key masking:** Keys with >8 chars show `abcd...wxyz` (first 4 + last 4). Keys ≀8 chars show `***`. Unset keys show `(not set)`. + +--- + +## index-local + +Build a full-text index from local documents. Powered by [Tantivy](https://github.com/quickwit-oss/tantivy). 
+ +```text +ferris-search index-local --path <PATH> [OPTIONS] +``` + +| Parameter | Short | Type | Default | Description | +| -------------- | ----- | ----------- | ------------------------------------------------ | --------------------------------------------------- | +| `--path` | `-p` | string | *required* | Directory or file to index (can be repeated) | +| `--index-path` | — | string | env `LOCAL_DOCS_INDEX_PATH` or `.ferris-index` | Directory to store the Tantivy index | +| `--format` | `-f` | text / json | `text` | Output format | + +**Supported file types** (configurable via `LOCAL_DOCS_EXTENSIONS`): + +| Extension | Extraction Method | +| ------------------ | ------------------------------------- | +| `.md`, `.markdown` | Raw text | +| `.txt` | Raw text | +| `.html`, `.htm` | HTML → text stripping | +| `.pdf` | pdf-extract (text layer only, no OCR) | + +**Behavior:** + +- Rebuilds the index from scratch each time (`clear = true`) +- Recursively walks directories, following symlinks +- Per-file errors are reported as warnings to stderr but do not abort the run +- Scanned or image-only PDFs without embedded text are not OCRed and may yield empty or low-quality indexed content +- If no documents are found or all fail to index, exits with code 1 + +**Text output:** + +```text +Indexed 42 documents into .ferris-index +Warnings: 2 +``` + +**JSON output:** + +```json +{ + "indexed": 42, + "errors": 2, + "index_path": ".ferris-index" +} +``` + +--- + +## search-local + +Search the local document index built by `index-local`.
+ +```text +ferris-search search-local <QUERY> [OPTIONS] +``` + +| Parameter | Short | Type | Default | Description | +| -------------- | ----- | ----------- | ------------------------------------------------ | ---------------------------------------- | +| `<QUERY>` | — | string | *required* | Search query (positional) | +| `--index-path` | — | string | env `LOCAL_DOCS_INDEX_PATH` or `.ferris-index` | Directory of the Tantivy index | +| `--limit` | `-l` | u32 | `10` | Max results (1–50) | +| `--format` | `-f` | text / json | `text` | Output format | + +**Query syntax:** Supports Tantivy query syntax — terms are OR-ed by default. Use `+term` for required, `-term` for exclusion, `"exact phrase"` for phrase match. + +**Fields searched:** `title` and `body` (full-text indexed). + +**Text output:** + +```text +Found 3 results: + +1. **Getting Started with Rust** +Path: /docs/getting-started.md +Type: md | Score: 12.3456 +Snippet: Rust is a systems programming language... + +2. **Error Handling** +Path: /docs/errors.md +Type: md | Score: 8.7654 +Snippet: In Rust, errors are values... +``` + +**JSON output:** Array of `LocalSearchResult` objects: + +```json +[ + { + "title": "Getting Started with Rust", + "path": "/docs/getting-started.md", + "snippet": "Rust is a systems programming language...", + "file_type": "md", + "score": 12.3456 + } +] +``` + +--- + +## mcp + +Start the MCP server using stdio transport. + +```text +ferris-search mcp +``` + +No options. Logging level defaults to `info` (configurable via `RUST_LOG`).
+ +--- + +## Global Behavior + +### TTY Auto-detection + +| Condition | Behavior | +| ------------------------------------- | --------------------------------- | +| No subcommand + piped stdin | Auto-enters MCP mode | +| No subcommand + interactive terminal | Prints help and exits (code 0) | +| Explicit subcommand | Runs that subcommand | + +### Exit Codes + +- `0`: Success +- `1`: Search/fetch failure +- `2`: Parameter error, such as invalid engine or bad URL + +### Logging + +- CLI subcommands (`search`, `fetch`): default log level `warn` (to stderr) +- MCP mode (`mcp`): default log level `info` (to stderr) +- Override with `RUST_LOG` env var + +--- + +## Supported Engines + +| Engine | Alias(es) | API Key Required | +| ------------- | -------------------------- | ------------------------------ | +| `baidu` | η™ΎεΊ¦ | No | +| `bing` | microsoft bing | No | +| `brave` | brave search | Yes (`BRAVE_API_KEY`) | +| `csdn` | β€” | No | +| `duckduckgo` | ddg, duck duck go | No | +| `exa` | β€” | Yes (`EXA_API_KEY`) | +| `firecrawl` | β€” | Yes (`FIRECRAWL_API_KEY`) | +| `github` | github repos, github repo | No (optional `GITHUB_TOKEN`) | +| `github_code` | github code | No (optional `GITHUB_TOKEN`) | +| `jina` | jina.ai | Yes (`JINA_API_KEY`) | +| `juejin` | ζŽ˜ι‡‘ | No | +| `linuxdo` | linux.do | No | +| `tavily` | β€” | Yes (`TAVILY_API_KEY`) | +| `zhihu` | ηŸ₯乎 | No | diff --git a/skills/ferris-search-setup/SKILL.md b/skills/ferris-search-setup/SKILL.md index 2f9332a..fa4f753 100644 --- a/skills/ferris-search-setup/SKILL.md +++ b/skills/ferris-search-setup/SKILL.md @@ -12,13 +12,24 @@ description: | > **Version:** ferris-search 0.1.0 | **Last Updated:** 2026-03-30 -You are an expert at installing and configuring the `ferris-search` MCP server. Help users by: +You are an expert at installing and configuring the `ferris-search` CLI & MCP binary. 
Help users by: + - **Setup**: Guide through build, install, and MCP registration - **Configuration**: Explain env vars and their effects +## Important Clarification + +MCP is not deprecated in `ferris-search`. + +- The binary supports both **CLI mode** and **MCP stdio mode** +- Current MCP usage is still the recommended path for Claude Desktop / Cursor / Claude Code integration +- Only the old transport-selection env vars were removed: `MODE` and `ENABLE_HTTP_SERVER` +- Current behavior is: explicit `mcp` subcommand, or automatic MCP mode when stdin is piped + ## Documentation Refer to the local files for detailed documentation: + - `./references/configuration.md` - All environment variables and their effects ## IMPORTANT: Documentation Completeness Check @@ -31,12 +42,14 @@ Refer to the local files for detailed documentation: ## Key Patterns ### Build & register with Claude Code + ```bash cargo build --release claude mcp add ferris-search ./target/release/ferris-search ``` ### With environment variables + ```bash claude mcp add ferris-search ./target/release/ferris-search \ -e DEFAULT_SEARCH_ENGINE=bing \ @@ -44,6 +57,7 @@ claude mcp add ferris-search ./target/release/ferris-search \ ``` ### Claude Desktop / Cursor (mcp-config.json) + ```json { "mcpServers": { @@ -60,6 +74,7 @@ claude mcp add ferris-search ./target/release/ferris-search \ ``` ### With proxy + ```bash claude mcp add ferris-search ./target/release/ferris-search \ -e USE_PROXY=true \ @@ -67,6 +82,7 @@ claude mcp add ferris-search ./target/release/ferris-search \ ``` ### Docker + ```bash docker build -t ferris-search . 
docker run -e DEFAULT_SEARCH_ENGINE=bing ferris-search @@ -74,21 +90,21 @@ docker run -e DEFAULT_SEARCH_ENGINE=bing ferris-search ## Configuration Reference -| Env Var | Default | Description | -|---------|---------|-------------| -| `DEFAULT_SEARCH_ENGINE` | `bing` | Engine used when `engines` param is omitted | -| `ALLOWED_SEARCH_ENGINES` | all 14 engines | Comma-separated allow-list | -| `BRAVE_API_KEY` | — | Required only for `brave` engine | -| `EXA_API_KEY` | — | Required only for `exa` engine | -| `FIRECRAWL_API_KEY` | — | Required only for `firecrawl` engine | -| `JINA_API_KEY` | — | Required only for `jina` engine | -| `TAVILY_API_KEY` | — | Required only for `tavily` engine | -| `GITHUB_TOKEN` | — | Optional for `github`/`github_code` engines (60→5000 req/hr) | -| `USE_PROXY` | `false` | Enable HTTP/SOCKS5 proxy | -| `PROXY_URL` | `http://127.0.0.1:7890` | Proxy address | -| `ENABLE_HTTP_SERVER` | `false` | Enable HTTP/SSE transport alongside stdio | -| `MODE` | `stdio` | Transport mode: `stdio`, `http`, or `both` | -| `RUST_LOG` | `info` | Log level: `debug`, `info`, `warn`, `error` | +| Env Var | Default | Description | +| ------------------------ | ------------------------------ | ------------------------------------------------------------ | +| `DEFAULT_SEARCH_ENGINE` | `bing` | Engine used when `engines` param is omitted | +| `ALLOWED_SEARCH_ENGINES` | all 14 engines | Comma-separated allow-list | +| `BRAVE_API_KEY` | — | Required only for `brave` engine | +| `EXA_API_KEY` | — | Required only for `exa` engine | +| `FIRECRAWL_API_KEY` | — | Required only for `firecrawl` engine | +| `JINA_API_KEY` | — | Required only for `jina` engine | +| `TAVILY_API_KEY` | — | Required only for `tavily` engine | +| `GITHUB_TOKEN` | — | Optional for `github`/`github_code` engines (60→5000 req/hr) | +| `USE_PROXY` | `false` | Enable HTTP/SOCKS5 proxy | +| `PROXY_URL` | `http://127.0.0.1:7890` | Proxy address | +| `LOCAL_DOCS_INDEX_PATH` |
`.ferris-index` | Directory to store the local full-text index | +| `LOCAL_DOCS_EXTENSIONS` | `md,markdown,txt,html,htm,pdf` | Comma-separated file extensions for local indexing | +| `RUST_LOG` | `info` | Log level: `debug`, `info`, `warn`, `error` | ## When Writing Code @@ -103,3 +119,4 @@ docker run -e DEFAULT_SEARCH_ENGINE=bing ferris-search 2. JSON config is needed for Claude Desktop / Cursor 3. Proxy support works for all engines including those behind GFW 4. `ALLOWED_SEARCH_ENGINES` acts as an allow-list — engines not listed are silently filtered out +5. If a user asks whether MCP was deprecated, answer: MCP is still supported; only the old transport env vars were removed diff --git a/skills/ferris-search-setup/references/configuration.md b/skills/ferris-search-setup/references/configuration.md index 7dc7fe7..a340a85 100644 --- a/skills/ferris-search-setup/references/configuration.md +++ b/skills/ferris-search-setup/references/configuration.md @@ -36,7 +36,7 @@ ALLOWED_SEARCH_ENGINES=baidu,csdn,juejin,zhihu,bing - **Default:** unset - **Effect:** Required to use the `brave` engine. Without it, `brave` calls will fail. -- **Get a key:** https://brave.com/search/api/ +- **Get a key:** <https://brave.com/search/api/> ```bash BRAVE_API_KEY=your-brave-api-key @@ -48,7 +48,7 @@ BRAVE_API_KEY=your-brave-api-key - **Default:** unset - **Effect:** Required to use the `exa` engine. Without it, `exa` calls will fail. -- **Get a key:** https://exa.ai +- **Get a key:** <https://exa.ai> ```bash EXA_API_KEY=exa-xxxxxxxxxxxxxxxx @@ -60,7 +60,7 @@ EXA_API_KEY=exa-xxxxxxxxxxxxxxxx - **Default:** unset - **Effect:** Required to use the `firecrawl` engine. Without it, `firecrawl` calls will fail. -- **Get a key:** https://firecrawl.dev +- **Get a key:** <https://firecrawl.dev> ```bash FIRECRAWL_API_KEY=fc-xxxxxxxxxxxxxxxx @@ -72,7 +72,7 @@ FIRECRAWL_API_KEY=fc-xxxxxxxxxxxxxxxx - **Default:** unset - **Effect:** Required to use the `jina` engine. Without it, `jina` calls will fail.
-- **Get a key:** https://jina.ai +- **Get a key:** <https://jina.ai> ```bash JINA_API_KEY=jina_xxxxxxxxxxxxxxxx @@ -84,7 +84,7 @@ JINA_API_KEY=jina_xxxxxxxxxxxxxxxx - **Default:** unset - **Effect:** Required to use the `tavily` engine. Without it, `tavily` calls will fail. -- **Get a key:** https://tavily.com +- **Get a key:** <https://tavily.com> ```bash TAVILY_API_KEY=tvly-xxxxxxxxxxxxxxxx @@ -124,20 +124,30 @@ PROXY_URL=socks5://127.0.0.1:1080 --- -### ENABLE_HTTP_SERVER / MODE +### LOCAL_DOCS_INDEX_PATH -- **ENABLE_HTTP_SERVER default:** `false` -- **MODE default:** `stdio` -- **MODE values:** `stdio`, `http`, `both` -- **Effect:** Enables HTTP/SSE transport in addition to (or instead of) stdio +- **Default:** `.ferris-index` +- **Effect:** Directory where `index-local` writes and `search-local` reads the Tantivy full-text index +- **Used by:** CLI `index-local` and `search-local` subcommands ```bash -# HTTP only -ENABLE_HTTP_SERVER=true -MODE=http +LOCAL_DOCS_INDEX_PATH=./my-docs-index +``` + +--- + +### LOCAL_DOCS_EXTENSIONS + +- **Default:** `md,markdown,txt,html,htm,pdf` (when unset, uses built-in defaults) +- **Format:** comma-separated list of file extensions (without dots) +- **Effect:** Controls which file types are collected during `index-local` + +```bash +# Only index Markdown and text files +LOCAL_DOCS_EXTENSIONS=md,txt -# Both transports -MODE=both +# Include PDF +LOCAL_DOCS_EXTENSIONS=md,markdown,txt,html,htm,pdf ``` --- @@ -157,11 +167,13 @@ RUST_LOG=debug # verbose logging for troubleshooting ## Complete Example Configurations ### Minimal (stdio, default bing) + ```bash claude mcp add ferris-search ./target/release/ferris-search ``` ### Privacy-focused + ```bash claude mcp add ferris-search ./target/release/ferris-search \ -e DEFAULT_SEARCH_ENGINE=duckduckgo \ @@ -169,6 +181,7 @@ claude mcp add ferris-search ./target/release/ferris-search \ ``` ### Chinese developer workflow + ```bash claude mcp add ferris-search ./target/release/ferris-search \ -e DEFAULT_SEARCH_ENGINE=bing \ @@
-176,6 +189,7 @@ claude mcp add ferris-search ./target/release/ferris-search \ ``` ### With Exa AI search + ```bash claude mcp add ferris-search ./target/release/ferris-search \ -e DEFAULT_SEARCH_ENGINE=exa \ @@ -184,6 +198,7 @@ claude mcp add ferris-search ./target/release/ferris-search \ ``` ### Behind GFW with proxy + ```bash claude mcp add ferris-search ./target/release/ferris-search \ -e USE_PROXY=true \ diff --git a/skills/ferris-search-tools/SKILL.md b/skills/ferris-search-tools/SKILL.md index c94e59b..54a87e2 100644 --- a/skills/ferris-search-tools/SKILL.md +++ b/skills/ferris-search-tools/SKILL.md @@ -14,12 +14,22 @@ description: | > **Version:** ferris-search 0.1.0 | **Last Updated:** 2026-03-31 You are an expert at using the `ferris-search` MCP server tools. Help users by: + - **Writing MCP calls**: Generate correct tool invocations with proper parameters - **Answering questions**: Explain which tool to use and why, troubleshoot issues +## Important Clarification + +MCP is not deprecated in `ferris-search`. 
+ +- The project now supports both CLI commands and MCP tools +- MCP tooling remains a first-class integration path +- What changed is that old transport env vars such as `MODE` and `ENABLE_HTTP_SERVER` were removed + ## Documentation Refer to the local files for detailed documentation: + - `./references/tools-api.md` - Complete tool parameter reference - `./references/engines.md` - Search engine details and aliases @@ -36,6 +46,7 @@ Refer to the local files for detailed documentation: ## Key Patterns ### Single-engine search + ```json { "tool": "web_search", @@ -45,6 +56,7 @@ Refer to the local files for detailed documentation: ``` ### Multi-engine fan-out + ```json { "tool": "web_search", @@ -55,6 +67,7 @@ Refer to the local files for detailed documentation: ``` ### GitHub repository search + ```json { "tool": "web_search", @@ -65,6 +78,7 @@ Refer to the local files for detailed documentation: ``` ### GitHub code search + ```json { "tool": "web_search", @@ -75,6 +89,7 @@ Refer to the local files for detailed documentation: ``` ### Fetch any web page + ```json { "tool": "fetch_web_content", @@ -84,6 +99,7 @@ Refer to the local files for detailed documentation: ``` ### Fetch GitHub README + ```json { "tool": "fetch_github_readme", @@ -92,6 +108,7 @@ Refer to the local files for detailed documentation: ``` ### Fetch domain-specific article + ```json // CSDN { "tool": "fetch_csdn_article", "url": "https://blog.csdn.net/..." 
} @@ -105,22 +122,24 @@ Refer to the local files for detailed documentation: ## API Reference Table -| Tool | Required Params | Optional Params | URL Constraint | -|------|----------------|-----------------|----------------| -| `web_search` | `query` | `engines`, `limit` (1–50) | — | -| `fetch_web_content` | `url` | `max_chars` (default 30000) | public HTTP/HTTPS | -| `fetch_github_readme` | `url` | — | github.com | -| `fetch_csdn_article` | `url` | — | csdn.net | -| `fetch_juejin_article` | `url` | — | juejin.cn + /post/ | -| `fetch_zhihu_article` | `url` | — | zhihu.com | -| `fetch_linuxdo_article` | `url` | — | linux.do + /topic/ | +> All fetch tools enforce SSRF protection: URLs must be public HTTP/HTTPS (private IPs and localhost are rejected). Domain-specific fetchers validate the URL **host** (not just string containment). + +| Tool | Required Params | Optional Params | URL Constraint (host-based) | +| ----------------------- | --------------- | --------------------------- | --------------------------- | +| `web_search` | `query` | `engines`, `limit` (1–50) | — | +| `fetch_web_content` | `url` | `max_chars` (default 30000) | public HTTP/HTTPS | +| `fetch_github_readme` | `url` | — | host: github.com | +| `fetch_csdn_article` | `url` | — | host: csdn.net | +| `fetch_juejin_article` | `url` | — | host: juejin.cn + /post/ | +| `fetch_zhihu_article` | `url` | — | host: zhihu.com | +| `fetch_linuxdo_article` | `url` | — | host: linux.do + /topic/ | ## Deprecated Patterns (Don't Use) -| Deprecated | Correct | Notes | -|------------|---------|-------| -| Passing engine as string `"engines": "bing"` | `"engines": ["bing"]` | Must be an array | -| `limit > 50` | `limit: 50` | Clamped to max 50 | +| Deprecated | Correct | Notes | +| ----------------------------------------------- | --------------------------- | ------------------------- | +| Passing engine as string `"engines": "bing"` | `"engines": ["bing"]` | Must be an array | +| `limit >
50` | `limit: 50` | Clamped to max 50 | | Using `fetch_web_content` for CSDN/Juejin/Zhihu | Use domain-specific fetcher | Better extraction quality | ## When Writing Code diff --git a/skills/ferris-search-tools/references/tools-api.md b/skills/ferris-search-tools/references/tools-api.md index 2d6ad4b..6f6c7c6 100644 --- a/skills/ferris-search-tools/references/tools-api.md +++ b/skills/ferris-search-tools/references/tools-api.md @@ -4,37 +4,36 @@ Search the web using one or more engines. Supports concurrent fan-out. -### Parameters +### web_search Parameters -| Parameter | Type | Required | Default | Constraints | -|-----------|------|----------|---------|-------------| -| `query` | string | yes | — | Any search query | -| `engines` | string[] | no | `DEFAULT_SEARCH_ENGINE` env var | Must be array, see engine list below | -| `limit` | number | no | 10 | 1–50 (clamped) | +| Parameter | Type | Required | Default | Constraints | +| --------- | -------- | -------- | ------------------------------- | ------------------------------------- | +| `query` | string | yes | — | Any search query | +| `engines` | string[] | no | `DEFAULT_SEARCH_ENGINE` env var | Must be array, see engine list below | +| `limit` | number | no | 10 | 1–50 (clamped) | ### Engine Names & Aliases -| Canonical | Aliases | -|-----------|---------| -| `bing` | `microsoft bing` | -| `duckduckgo` | `ddg`, `duck duck go` | -| `brave` | `brave search` | (requires `BRAVE_API_KEY`) | -| `baidu` | `百度` | -| `csdn` | — | -| `juejin` | `掘金` | -| `exa` | — | (requires `EXA_API_KEY`) | -| `firecrawl` | — | (requires `FIRECRAWL_API_KEY`) | -| `zhihu` | `知乎` | -| `linuxdo` | `linux.do` | -| `jina` | `jina.ai` | (requires `JINA_API_KEY`) | -| `tavily` | — | (requires `TAVILY_API_KEY`) | -| `github` | `github repos`, `github repo` | (optional `GITHUB_TOKEN`; searches repositories) | -| `github_code` | `github code` | (optional `GITHUB_TOKEN`; searches code files) | - -### Output Format +- `bing`:
aliases `microsoft bing` +- `duckduckgo`: aliases `ddg`, `duck duck go` +- `brave`: aliases `brave search`; requires `BRAVE_API_KEY` +- `baidu`: aliases `百度` +- `csdn`: no aliases +- `juejin`: aliases `掘金` +- `exa`: no aliases; requires `EXA_API_KEY` +- `firecrawl`: no aliases; requires `FIRECRAWL_API_KEY` +- `zhihu`: aliases `知乎` +- `linuxdo`: aliases `linux.do` +- `jina`: aliases `jina.ai`; requires `JINA_API_KEY` +- `tavily`: no aliases; requires `TAVILY_API_KEY` +- `github`: aliases `github repos`, `github repo`; optional `GITHUB_TOKEN`; searches repositories +- `github_code`: aliases `github code`; optional `GITHUB_TOKEN`; searches code files + +### web_search Output Format Single engine: -``` + +```text Engine: bing Total: 10 @@ -45,7 +44,8 @@ Description: ... ``` Multi-engine: -``` + +```text Total results: 25 ## Results from bing @@ -63,16 +63,16 @@ Total results: 25 Fetch and extract text from any public URL using HTML scraping. -### Parameters +### fetch_web_content Parameters -| Parameter | Type | Required | Default | Constraints | -|-----------|------|----------|---------|-------------| -| `url` | string | yes | — | Must be public HTTP/HTTPS | -| `max_chars` | number | no | 30000 | max 200000 | +| Parameter | Type | Required | Default | Constraints | +| ----------- | ------ | -------- | ------- | ------------------------- | +| `url` | string | yes | — | Must be public HTTP/HTTPS | +| `max_chars` | number | no | 30000 | max 200000 | ### Output Format -``` +```text Title: Page Title URL: https://... @@ -83,6 +83,8 @@ URL: https://... ### URL Safety Rules +All fetch tools enforce the following SSRF protection: + - Must start with `http://` or `https://` - Must not be a private/internal IP (10.x, 192.168.x, 127.x, etc.) - Must not be `localhost` @@ -93,15 +95,15 @@ URL: https://... Fetch README from a GitHub repository via the GitHub raw content API.
-### Parameters +### fetch_github_readme Parameters -| Parameter | Type | Required | Constraints | -|-----------|------|----------|-------------| -| `url` | string | yes | Must be a `github.com` URL | +| Parameter | Type | Required | Constraints | +| --------- | ------ | -------- | ------------------------------ | +| `url` | string | yes | URL host must be `github.com` | ### Supported URL Formats -``` +```text https://github.com/owner/repo https://github.com/owner/repo/tree/branch ``` @@ -116,11 +118,11 @@ Raw README content (markdown text). Fetch full article from CSDN blog. -### Parameters +### fetch_csdn_article Parameters -| Parameter | Type | Required | Constraints | -|-----------|------|----------|-------------| -| `url` | string | yes | Must contain `csdn.net` | +| Parameter | Type | Required | Constraints | +| --------- | ------ | -------- | -------------------------------------------- | +| `url` | string | yes | URL host must be `csdn.net` or its subdomain | --- @@ -128,11 +130,9 @@ Fetch full article from CSDN blog. Fetch full article from Juejin. -### Parameters +### fetch_juejin_article Parameters -| Parameter | Type | Required | Constraints | -|-----------|------|----------|-------------| -| `url` | string | yes | Must contain `juejin.cn` AND `/post/` | +- `url` (`string`, required): URL host must be `juejin.cn` or its subdomain, and the path must contain `/post/` --- @@ -140,11 +140,11 @@ Fetch full article from Juejin. Fetch full article from Zhihu. -### Parameters +### fetch_zhihu_article Parameters -| Parameter | Type | Required | Constraints | -|-----------|------|----------|-------------| -| `url` | string | yes | Must contain `zhihu.com` | +| Parameter | Type | Required | Constraints | +| --------- | ------ | -------- | --------------------------------------------- | +| `url` | string | yes | URL host must be `zhihu.com` or its subdomain | --- @@ -152,8 +152,6 @@ Fetch full article from Zhihu. Fetch full topic from linux.do forum. 
-### Parameters +### fetch_linuxdo_article Parameters -| Parameter | Type | Required | Constraints | -|-----------|------|----------|-------------| -| `url` | string | yes | Must contain `linux.do` AND `/topic/` | +- `url` (`string`, required): URL host must be `linux.do` or its subdomain, and the path must contain `/topic/` diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..3096c69 --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,103 @@ +use clap::{Parser, Subcommand, ValueEnum}; + +#[derive(Parser)] +#[command( + name = "ferris-search", + version, + about = "A blazing-fast multi-engine web search tool & MCP server with local document indexing, written in Rust." +)] +pub struct Cli { + #[command(subcommand)] + pub command: Option, +} + +#[derive(Subcommand)] +pub enum Command { + /// Search the web using one or more engines + Search { + /// Search query + query: String, + + /// Search engines to use (can be specified multiple times, or comma-separated) + #[arg(short, long = "engine", value_delimiter = ',')] + engine: Vec, + + /// Maximum results per engine (1-50) + #[arg(short, long, default_value_t = 10)] + limit: u32, + + /// Output format + #[arg(short, long, default_value = "text")] + format: OutputFormat, + }, + + /// Fetch and extract content from a URL + Fetch { + /// URL to fetch + url: String, + + /// Maximum characters of content to return + #[arg(short, long, default_value_t = 30000)] + max_chars: u32, + + /// Output format + #[arg(short, long, default_value = "text")] + format: OutputFormat, + }, + + /// List all supported and allowed search engines + ListEngines { + /// Output format + #[arg(short, long, default_value = "text")] + format: OutputFormat, + }, + + /// Show the current effective configuration + ShowConfig { + /// Output format + #[arg(short, long, default_value = "text")] + format: OutputFormat, + }, + + /// Start the MCP server (stdio transport) + Mcp, + + /// Build a full-text index from local documents + IndexLocal { + 
/// Directories or files to index (can be specified multiple times) + #[arg(short, long = "path", required = true)] + path: Vec, + + /// Directory to store the index (default: from LOCAL_DOCS_INDEX_PATH or ./.ferris-index) + #[arg(long)] + index_path: Option, + + /// Output format + #[arg(short, long, default_value = "text")] + format: OutputFormat, + }, + + /// Search the local document index + SearchLocal { + /// Search query + query: String, + + /// Directory of the index (default: from LOCAL_DOCS_INDEX_PATH or ./.ferris-index) + #[arg(long)] + index_path: Option, + + /// Maximum results to return (1-50) + #[arg(short, long, default_value_t = 10)] + limit: u32, + + /// Output format + #[arg(short, long, default_value = "text")] + format: OutputFormat, + }, +} + +#[derive(Clone, ValueEnum)] +pub enum OutputFormat { + Text, + Json, +} diff --git a/src/config.rs b/src/config.rs index ee741ae..d031ffa 100644 --- a/src/config.rs +++ b/src/config.rs @@ -12,6 +12,8 @@ pub struct AppConfig { pub jina_api_key: Option, pub tavily_api_key: Option, pub github_token: Option, + pub local_docs_index_path: String, + pub local_docs_extensions: Vec, } impl AppConfig { @@ -60,6 +62,19 @@ impl AppConfig { let tavily_api_key = env::var("TAVILY_API_KEY").ok(); let github_token = env::var("GITHUB_TOKEN").ok(); + let local_docs_index_path = env::var("LOCAL_DOCS_INDEX_PATH") + .unwrap_or_else(|_| ".ferris-index".to_string()); + + let local_docs_ext_str = env::var("LOCAL_DOCS_EXTENSIONS").unwrap_or_default(); + let local_docs_extensions: Vec = if local_docs_ext_str.is_empty() { + Vec::new() // empty means use default from collector + } else { + local_docs_ext_str + .split(',') + .map(|s| s.trim().to_lowercase()) + .collect() + }; + Self { default_search_engine, allowed_search_engines, @@ -71,6 +86,8 @@ impl AppConfig { jina_api_key, tavily_api_key, github_token, + local_docs_index_path, + local_docs_extensions, } } diff --git a/src/fetchers/csdn.rs b/src/fetchers/csdn.rs index 
e5bac05..aa853a7 100644 --- a/src/fetchers/csdn.rs +++ b/src/fetchers/csdn.rs @@ -1,4 +1,7 @@ -use crate::utils::http_client::{build_client, chrome_headers}; +use crate::utils::{ + http_client::{build_client, chrome_headers}, + url_safety::assert_public_http_url, +}; use scraper::{Html, Selector}; fn normalize_text(s: &str) -> String { @@ -23,6 +26,7 @@ fn extract_content(html: &str) -> String { } pub async fn fetch_csdn_article(url: &str) -> anyhow::Result { + assert_public_http_url(url)?; let client = build_client()?; let resp = client .get(url) diff --git a/src/fetchers/github.rs b/src/fetchers/github.rs index ef3fc15..d077c58 100644 --- a/src/fetchers/github.rs +++ b/src/fetchers/github.rs @@ -26,7 +26,8 @@ fn parse_github_url(url: &str) -> Option { } else { return None; }; - // strip .git suffix + // strip query string, fragment, and .git suffix + let path = path.split(['?', '#']).next().unwrap_or(path); let path = path.trim_end_matches(".git"); // take only first two path segments let parts: Vec<&str> = path.splitn(3, '/').collect(); @@ -67,3 +68,68 @@ pub async fn fetch_github_readme(url: &str) -> anyhow::Result { .await .ok_or_else(|| anyhow::anyhow!("Could not fetch README for {}/{}", info.owner, info.repo)) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_https_url() { + let info = parse_github_url("https://github.com/tokio-rs/tokio").unwrap(); + assert_eq!(info.owner, "tokio-rs"); + assert_eq!(info.repo, "tokio"); + } + + #[test] + fn parse_http_url() { + let info = parse_github_url("http://github.com/owner/repo").unwrap(); + assert_eq!(info.owner, "owner"); + assert_eq!(info.repo, "repo"); + } + + #[test] + fn parse_git_ssh_url() { + let info = parse_github_url("git@github.com:owner/repo.git").unwrap(); + assert_eq!(info.owner, "owner"); + assert_eq!(info.repo, "repo"); + } + + #[test] + fn parse_strips_dot_git() { + let info = parse_github_url("https://github.com/owner/repo.git").unwrap(); + assert_eq!(info.repo, "repo"); + } + + 
#[test] + fn parse_strips_query_and_fragment() { + let info = parse_github_url("https://github.com/owner/repo?tab=readme#section").unwrap(); + assert_eq!(info.owner, "owner"); + assert_eq!(info.repo, "repo"); + } + + #[test] + fn parse_trailing_slash() { + let info = parse_github_url("https://github.com/owner/repo/").unwrap(); + assert_eq!(info.owner, "owner"); + assert_eq!(info.repo, "repo"); + } + + #[test] + fn parse_deeper_path_takes_owner_repo() { + let info = parse_github_url("https://github.com/owner/repo/issues/42").unwrap(); + assert_eq!(info.owner, "owner"); + assert_eq!(info.repo, "repo"); + } + + #[test] + fn parse_rejects_non_github() { + assert!(parse_github_url("https://gitlab.com/owner/repo").is_none()); + } + + #[test] + fn parse_rejects_incomplete() { + assert!(parse_github_url("https://github.com/").is_none()); + assert!(parse_github_url("https://github.com/owner").is_none()); + assert!(parse_github_url("https://github.com/owner/").is_none()); + } +} diff --git a/src/fetchers/juejin.rs b/src/fetchers/juejin.rs index b4c2d19..99313d6 100644 --- a/src/fetchers/juejin.rs +++ b/src/fetchers/juejin.rs @@ -1,4 +1,7 @@ -use crate::utils::http_client::{build_client, chrome_headers}; +use crate::utils::{ + http_client::{build_client, chrome_headers}, + url_safety::assert_public_http_url, +}; use scraper::{Html, Selector}; fn normalize_text(s: &str) -> String { @@ -36,9 +39,9 @@ fn extract_content(html: &str) -> String { } pub async fn fetch_juejin_article(url: &str) -> anyhow::Result { - // Validate it's a juejin post URL - if !url.contains("juejin.cn") || !url.contains("/post/") { - anyhow::bail!("URL must be from juejin.cn and contain /post/ path"); + assert_public_http_url(url)?; + if !url.contains("/post/") { + anyhow::bail!("URL must contain /post/ path"); } let client = build_client()?; let resp = client.get(url).headers(chrome_headers()).send().await?; diff --git a/src/fetchers/web.rs b/src/fetchers/web.rs index 2e359ac..8e1ad71 100644 --- 
a/src/fetchers/web.rs +++ b/src/fetchers/web.rs @@ -109,8 +109,9 @@ pub async fn fetch_web_content(url: &str, max_chars: Option) -> anyhow::R let truncated = text.len() > max; if truncated { - // Truncate at a newline boundary if possible - let cut = text[..max].rfind('\n').unwrap_or(max); + // Truncate at a newline boundary if possible (safe for multi-byte UTF-8) + let safe_max = text.floor_char_boundary(max); + let cut = text[..safe_max].rfind('\n').unwrap_or(safe_max); text.truncate(cut); } diff --git a/src/fetchers/zhihu.rs b/src/fetchers/zhihu.rs index 839d95c..86f9050 100644 --- a/src/fetchers/zhihu.rs +++ b/src/fetchers/zhihu.rs @@ -1,4 +1,7 @@ -use crate::utils::http_client::{build_client, chrome_headers}; +use crate::utils::{ + http_client::{build_client, chrome_headers}, + url_safety::assert_public_http_url, +}; use scraper::{Html, Selector}; fn normalize_text(s: &str) -> String { @@ -29,9 +32,7 @@ fn extract_content(html: &str) -> String { } pub async fn fetch_zhihu_article(url: &str) -> anyhow::Result { - if !url.contains("zhihu.com") { - anyhow::bail!("URL must be from zhihu.com"); - } + assert_public_http_url(url)?; let client = build_client()?; let mut headers = chrome_headers(); headers.insert("referer", "https://www.zhihu.com/".parse().unwrap()); diff --git a/src/index/collector.rs b/src/index/collector.rs new file mode 100644 index 0000000..ede5000 --- /dev/null +++ b/src/index/collector.rs @@ -0,0 +1,392 @@ +use crate::index::indexer::LocalDocument; +use anyhow::{Context, Result}; +use scraper::{Html, Selector}; +use std::path::Path; +use walkdir::WalkDir; + +/// Default allowed extensions for local document indexing. +pub const DEFAULT_EXTENSIONS: &[&str] = &["md", "markdown", "txt", "html", "htm", "pdf"]; + +/// Collected file ready for indexing. +struct CollectedFile { + path: String, + file_type: String, + content: String, + /// Raw source for title extraction (e.g. original HTML before text extraction). 
+ raw_title_source: Option, +} + +/// Recursively scan directories and collect documents. +/// Returns (documents, errors) β€” errors are per-file warnings, not fatal. +pub fn collect_documents( + paths: &[String], + extensions: &[String], +) -> (Vec, Vec) { + let mut docs = Vec::new(); + let mut errors = Vec::new(); + + let ext_set: Vec = if extensions.is_empty() { + DEFAULT_EXTENSIONS.iter().map(|s| s.to_string()).collect() + } else { + extensions.to_vec() + }; + + for root in paths { + let root_path = Path::new(root); + if !root_path.exists() { + errors.push(format!("Path does not exist: {}", root)); + continue; + } + + if root_path.is_file() { + match process_file(root_path, &ext_set) { + Ok(Some(f)) => docs.push(file_to_document(f)), + Ok(None) => {} // skipped (extension not allowed) + Err(e) => errors.push(format!("{}: {}", root, e)), + } + continue; + } + + for entry in WalkDir::new(root).follow_links(true).into_iter() { + let entry = match entry { + Ok(e) => e, + Err(e) => { + errors.push(format!("Walk error: {}", e)); + continue; + } + }; + + if !entry.file_type().is_file() { + continue; + } + + match process_file(entry.path(), &ext_set) { + Ok(Some(f)) => docs.push(file_to_document(f)), + Ok(None) => {} + Err(e) => errors.push(format!("{}: {}", entry.path().display(), e)), + } + } + } + + (docs, errors) +} + +fn process_file(path: &Path, allowed_ext: &[String]) -> Result> { + let ext = path + .extension() + .and_then(|e| e.to_str()) + .unwrap_or("") + .to_lowercase(); + + if !allowed_ext.iter().any(|a| a == &ext) { + return Ok(None); + } + + let abs_path = std::fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf()); + let abs_path = abs_path.to_string_lossy(); + // Strip Windows verbatim path prefix for cleaner output. 
+ // `\\?\UNC\server\share` β†’ `\\server\share` (UNC path) + // `\\?\C:\dir` β†’ `C:\dir` (regular path) + let abs_path = if let Some(unc) = abs_path.strip_prefix(r"\\?\UNC\") { + format!(r"\\{}", unc) + } else { + abs_path + .strip_prefix(r"\\?\") + .unwrap_or(&abs_path) + .to_string() + }; + + let content = match ext.as_str() { + "md" | "markdown" | "txt" => { + std::fs::read_to_string(path).context("Failed to read file")? + } + "html" | "htm" => { + let raw = std::fs::read_to_string(path).context("Failed to read HTML file")?; + let text = extract_html_text(&raw); + return Ok(Some(CollectedFile { + path: abs_path, + file_type: ext, + content: text, + raw_title_source: Some(raw), + })); + } + "pdf" => extract_pdf_text(path)?, + _ => return Ok(None), + }; + + Ok(Some(CollectedFile { + path: abs_path, + file_type: ext, + content, + raw_title_source: None, + })) +} + +fn file_to_document(f: CollectedFile) -> LocalDocument { + let title_source = f.raw_title_source.as_deref().unwrap_or(&f.content); + let title = extract_title(title_source, &f.file_type, &f.path); + LocalDocument { + title, + body: f.content, + path: f.path, + file_type: f.file_type, + } +} + +/// Extract title based on file type. +/// Priority: MD first heading > HTML title/h1 > filename. 
+fn extract_title(content: &str, file_type: &str, path: &str) -> String { + match file_type { + "md" | "markdown" => { + // Find first ATX heading (# Title) + for line in content.lines() { + let trimmed = line.trim(); + if let Some(heading) = trimmed.strip_prefix('#') { + let heading = heading.trim_start_matches('#').trim(); + if !heading.is_empty() { + return heading.to_string(); + } + } + } + } + "html" | "htm" => { + let doc = Html::parse_document(content); + // Try + if let Ok(sel) = Selector::parse("title") { + if let Some(el) = doc.select(&sel).next() { + let t = el.text().collect::<String>().trim().to_string(); + if !t.is_empty() { + return t; + } + } + } + // Try <h1> + if let Ok(sel) = Selector::parse("h1") { + if let Some(el) = doc.select(&sel).next() { + let t = el.text().collect::<String>().trim().to_string(); + if !t.is_empty() { + return t; + } + } + } + } + _ => {} + } + + // Fallback: filename without extension + Path::new(path) + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("Untitled") + .to_string() +} + +/// Extract text from HTML, reusing the same approach as fetchers/web.rs. 
+fn extract_html_text(html: &str) -> String { + let doc = Html::parse_document(html); + + let containers = [ + "article", + "main", + "#main", + ".main", + ".content", + "#content", + ".post", + ".article", + ".entry-content", + ".post-content", + ".article-content", + ]; + + for sel_str in &containers { + if let Ok(sel) = Selector::parse(sel_str) { + if let Some(el) = doc.select(&sel).next() { + let text = el.text().collect::<String>(); + let normalized = normalize_text(&text); + if normalized.len() > 200 { + return normalized; + } + } + } + } + + // Fallback: body text + if let Ok(sel) = Selector::parse("body") { + if let Some(el) = doc.select(&sel).next() { + return normalize_text(&el.text().collect::<String>()); + } + } + + normalize_text(&doc.root_element().text().collect::<String>()) +} + +fn normalize_text(s: &str) -> String { + s.replace("\r\n", "\n") + .replace('\u{00a0}', " ") + .split('\n') + .map(|l| l.trim_end()) + .collect::<Vec<_>>() + .join("\n") + .trim() + .split("\n\n\n") + .collect::<Vec<_>>() + .join("\n\n") +} + +/// Extract text from a PDF file. 
+fn extract_pdf_text(path: &Path) -> Result<String> { + let bytes = std::fs::read(path).context("Failed to read PDF file")?; + let text = + pdf_extract::extract_text_from_mem(&bytes).context("Failed to extract text from PDF")?; + Ok(normalize_text(&text)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use tempfile::TempDir; + + #[test] + fn test_extract_title_markdown_heading() { + let content = "# My Document\n\nSome body text."; + assert_eq!(extract_title(content, "md", "doc.md"), "My Document"); + } + + #[test] + fn test_extract_title_markdown_nested_heading() { + let content = "## Second Level\n\nBody."; + assert_eq!(extract_title(content, "md", "doc.md"), "Second Level"); + } + + #[test] + fn test_extract_title_html_title_tag() { + let html = "<html><head><title>Page TitleText"; + assert_eq!(extract_title(html, "html", "page.html"), "Page Title"); + } + + #[test] + fn test_extract_title_html_h1_fallback() { + let html = "

Heading One

Text

"; + assert_eq!(extract_title(html, "html", "page.html"), "Heading One"); + } + + #[test] + fn test_extract_title_fallback_to_filename() { + let content = "No headings here, just plain text."; + assert_eq!(extract_title(content, "txt", "/path/to/notes.txt"), "notes"); + } + + #[test] + fn test_extract_title_empty_heading_falls_back() { + let content = "# \n\nBody text."; + assert_eq!(extract_title(content, "md", "readme.md"), "readme"); + } + + #[test] + fn test_normalize_text_removes_excess_blank_lines() { + let input = "Line 1\n\n\nLine 2"; + let result = normalize_text(input); + assert_eq!(result, "Line 1\n\nLine 2"); + } + + #[test] + fn test_normalize_text_trims_trailing_spaces() { + let input = "Hello \nWorld "; + let result = normalize_text(input); + assert_eq!(result, "Hello\nWorld"); + } + + #[test] + fn test_normalize_text_replaces_nbsp() { + let input = "Hello\u{00a0}World"; + let result = normalize_text(input); + assert_eq!(result, "Hello World"); + } + + #[test] + fn test_extract_html_text_from_article() { + let html = format!( + "
{}
", + "Important content. ".repeat(20) + ); + let text = extract_html_text(&html); + assert!(text.contains("Important content.")); + } + + #[test] + fn test_extract_html_text_body_fallback() { + let html = "

Simple paragraph content here.

"; + let text = extract_html_text(html); + assert!(text.contains("Simple paragraph content here.")); + } + + #[test] + fn test_default_extensions() { + assert!(DEFAULT_EXTENSIONS.contains(&"md")); + assert!(DEFAULT_EXTENSIONS.contains(&"txt")); + assert!(DEFAULT_EXTENSIONS.contains(&"html")); + assert!(DEFAULT_EXTENSIONS.contains(&"pdf")); + assert!(!DEFAULT_EXTENSIONS.contains(&"rs")); + } + + #[test] + fn test_collect_documents_with_temp_files() { + let dir = TempDir::new().unwrap(); + + fs::write(dir.path().join("readme.md"), "# Hello\n\nWorld").unwrap(); + fs::write(dir.path().join("notes.txt"), "Some notes here").unwrap(); + fs::write(dir.path().join("skip.rs"), "fn main() {}").unwrap(); + + let (docs, errors) = collect_documents(&[dir.path().to_string_lossy().to_string()], &[]); + + assert!(errors.is_empty(), "Unexpected errors: {:?}", errors); + assert_eq!(docs.len(), 2); + + let titles: Vec<&str> = docs.iter().map(|d| d.title.as_str()).collect(); + assert!(titles.contains(&"Hello")); + assert!(titles.contains(&"notes")); + } + + #[test] + fn test_collect_documents_nonexistent_path() { + let (docs, errors) = collect_documents(&["/nonexistent/path/abc123".to_string()], &[]); + + assert!(docs.is_empty()); + assert_eq!(errors.len(), 1); + assert!(errors[0].contains("does not exist")); + } + + #[test] + fn test_collect_documents_extension_filter() { + let dir = TempDir::new().unwrap(); + + fs::write(dir.path().join("a.md"), "# A").unwrap(); + fs::write(dir.path().join("b.txt"), "B").unwrap(); + fs::write(dir.path().join("c.html"), "

C

").unwrap(); + + // Only allow txt + let (docs, errors) = collect_documents( + &[dir.path().to_string_lossy().to_string()], + &["txt".to_string()], + ); + + assert!(errors.is_empty()); + assert_eq!(docs.len(), 1); + assert_eq!(docs[0].file_type, "txt"); + } + + #[test] + fn test_collect_single_file() { + let dir = TempDir::new().unwrap(); + let file_path = dir.path().join("doc.md"); + fs::write(&file_path, "# Single\n\nContent").unwrap(); + + let (docs, errors) = collect_documents(&[file_path.to_string_lossy().to_string()], &[]); + + assert!(errors.is_empty()); + assert_eq!(docs.len(), 1); + assert_eq!(docs[0].title, "Single"); + } +} diff --git a/src/index/indexer.rs b/src/index/indexer.rs new file mode 100644 index 0000000..e4d60b8 --- /dev/null +++ b/src/index/indexer.rs @@ -0,0 +1,161 @@ +use crate::index::schema::{create_schema, fields}; +use anyhow::{Context, Result}; +use std::path::Path; +use tantivy::{Index, IndexWriter, TantivyDocument}; + +/// A document to be indexed. +pub struct LocalDocument { + pub title: String, + pub body: String, + pub path: String, + pub file_type: String, +} + +pub struct Indexer { + index: Index, + writer: IndexWriter, +} + +impl Indexer { + /// Create or open a persistent index at the given path. + /// If `clear` is true, delete all existing documents first (rebuild). + pub fn new>(index_path: P, clear: bool) -> Result { + let schema = create_schema(); + let index_path = index_path.as_ref(); + + let index = if index_path.join("meta.json").exists() { + Index::open_in_dir(index_path).context("Failed to open existing index")? + } else { + std::fs::create_dir_all(index_path).context("Failed to create index directory")?; + Index::create_in_dir(index_path, schema).context("Failed to create index")? 
+ }; + + let writer = index + .writer(50 * 1024 * 1024) + .context("Failed to create index writer")?; + + let mut indexer = Self { index, writer }; + + if clear { + indexer.writer.delete_all_documents()?; + indexer.writer.commit()?; + } + + Ok(indexer) + } + + pub fn add_document(&mut self, doc: &LocalDocument) -> Result<()> { + let schema = self.index.schema(); + + let title_field = schema.get_field(fields::TITLE)?; + let body_field = schema.get_field(fields::BODY)?; + let path_field = schema.get_field(fields::PATH)?; + let file_type_field = schema.get_field(fields::FILE_TYPE)?; + let indexed_at_field = schema.get_field(fields::INDEXED_AT)?; + + let now = tantivy::DateTime::from_timestamp_secs( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() as i64, + ); + + let mut tantivy_doc = TantivyDocument::new(); + tantivy_doc.add_text(title_field, &doc.title); + tantivy_doc.add_text(body_field, &doc.body); + tantivy_doc.add_text(path_field, &doc.path); + tantivy_doc.add_text(file_type_field, &doc.file_type); + tantivy_doc.add_date(indexed_at_field, now); + + self.writer.add_document(tantivy_doc)?; + Ok(()) + } + + pub fn commit(&mut self) -> Result<()> { + self.writer.commit()?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + fn sample_doc(title: &str, body: &str) -> LocalDocument { + LocalDocument { + title: title.to_string(), + body: body.to_string(), + path: format!("/tmp/{}.md", title), + file_type: "md".to_string(), + } + } + + #[test] + fn test_indexer_create_new() { + let dir = TempDir::new().unwrap(); + let indexer = Indexer::new(dir.path(), false); + assert!(indexer.is_ok()); + } + + #[test] + fn test_indexer_add_and_commit() { + let dir = TempDir::new().unwrap(); + let mut indexer = Indexer::new(dir.path(), false).unwrap(); + + let doc = sample_doc("test", "Hello world content"); + assert!(indexer.add_document(&doc).is_ok()); + assert!(indexer.commit().is_ok()); 
+ } + + #[test] + fn test_indexer_reopen_existing() { + let dir = TempDir::new().unwrap(); + + // Create and commit + { + let mut indexer = Indexer::new(dir.path(), false).unwrap(); + indexer + .add_document(&sample_doc("first", "First document")) + .unwrap(); + indexer.commit().unwrap(); + } + + // Reopen + let indexer = Indexer::new(dir.path(), false); + assert!(indexer.is_ok()); + } + + #[test] + fn test_indexer_clear_rebuilds() { + let dir = TempDir::new().unwrap(); + + // Create index with a document + { + let mut indexer = Indexer::new(dir.path(), false).unwrap(); + indexer + .add_document(&sample_doc("old", "Old document")) + .unwrap(); + indexer.commit().unwrap(); + } + + // Reopen with clear=true, add new doc + { + let mut indexer = Indexer::new(dir.path(), true).unwrap(); + indexer + .add_document(&sample_doc("new", "New document")) + .unwrap(); + indexer.commit().unwrap(); + } + + // Verify via searcher: only "new" should exist + use crate::index::searcher::Searcher; + let searcher = Searcher::new(dir.path()).unwrap(); + let results = searcher.search("old", 10).unwrap(); + assert!(results.is_empty(), "Old document should have been cleared"); + + let results = searcher.search("new", 10).unwrap(); + assert_eq!(results.len(), 1); + assert_eq!(results[0].title, "new"); + } +} diff --git a/src/index/mod.rs b/src/index/mod.rs new file mode 100644 index 0000000..ebc04b0 --- /dev/null +++ b/src/index/mod.rs @@ -0,0 +1,4 @@ +pub mod collector; +pub mod indexer; +pub mod schema; +pub mod searcher; diff --git a/src/index/schema.rs b/src/index/schema.rs new file mode 100644 index 0000000..4b882d6 --- /dev/null +++ b/src/index/schema.rs @@ -0,0 +1,51 @@ +use tantivy::schema::{FAST, INDEXED, STORED, STRING, Schema, TEXT}; + +/// Create the Tantivy schema for local document indexing. 
+/// +/// Fields: +/// - `title`: document title (full-text indexed + stored) +/// - `body`: document body (full-text indexed + stored) +/// - `path`: file path (string + stored) +/// - `file_type`: extension (md/txt/html/pdf) (string + stored + FAST) +/// - `indexed_at`: index timestamp (date + indexed + stored + FAST) +pub fn create_schema() -> Schema { + let mut builder = Schema::builder(); + + builder.add_text_field("title", TEXT | STORED); + builder.add_text_field("body", TEXT | STORED); + builder.add_text_field("path", STRING | STORED); + builder.add_text_field("file_type", STRING | STORED | FAST); + builder.add_date_field("indexed_at", INDEXED | STORED | FAST); + + builder.build() +} + +pub mod fields { + pub const TITLE: &str = "title"; + pub const BODY: &str = "body"; + pub const PATH: &str = "path"; + pub const FILE_TYPE: &str = "file_type"; + pub const INDEXED_AT: &str = "indexed_at"; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_schema_has_all_fields() { + let schema = create_schema(); + assert!(schema.get_field(fields::TITLE).is_ok()); + assert!(schema.get_field(fields::BODY).is_ok()); + assert!(schema.get_field(fields::PATH).is_ok()); + assert!(schema.get_field(fields::FILE_TYPE).is_ok()); + assert!(schema.get_field(fields::INDEXED_AT).is_ok()); + } + + #[test] + fn test_schema_field_count() { + let schema = create_schema(); + // Schema should have exactly 5 fields + assert_eq!(schema.fields().count(), 5); + } +} diff --git a/src/index/searcher.rs b/src/index/searcher.rs new file mode 100644 index 0000000..00cf446 --- /dev/null +++ b/src/index/searcher.rs @@ -0,0 +1,200 @@ +use crate::index::schema::fields; +use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use std::path::Path; +use tantivy::{ + Index, IndexReader, ReloadPolicy, collector::TopDocs, query::QueryParser, schema::Value, +}; + +/// A local search result. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LocalSearchResult { + pub title: String, + pub path: String, + pub snippet: String, + pub file_type: String, + pub score: f32, +} + +pub struct Searcher { + index: Index, + reader: IndexReader, +} + +impl Searcher { + pub fn new>(index_path: P) -> Result { + let index = Index::open_in_dir(index_path).context("Failed to open index")?; + + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::OnCommitWithDelay) + .try_into() + .context("Failed to create index reader")?; + + Ok(Self { index, reader }) + } + + pub fn search(&self, query_str: &str, limit: usize) -> Result> { + let schema = self.index.schema(); + let searcher = self.reader.searcher(); + + let title_field = schema.get_field(fields::TITLE)?; + let body_field = schema.get_field(fields::BODY)?; + let path_field = schema.get_field(fields::PATH)?; + let file_type_field = schema.get_field(fields::FILE_TYPE)?; + + let query_parser = QueryParser::for_index(&self.index, vec![title_field, body_field]); + let query = query_parser + .parse_query(query_str) + .context("Failed to parse query")?; + + let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?; + + let mut results = Vec::new(); + for (score, doc_address) in top_docs { + let doc: tantivy::TantivyDocument = searcher.doc(doc_address)?; + + let title = doc + .get_first(title_field) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let body = doc + .get_first(body_field) + .and_then(|v| v.as_str()) + .unwrap_or(""); + + // Generate snippet: first 200 chars of body + let snippet = if body.len() > 200 { + format!("{}...", &body[..body.floor_char_boundary(200)]) + } else { + body.to_string() + }; + + let path = doc + .get_first(path_field) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let file_type = doc + .get_first(file_type_field) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + results.push(LocalSearchResult { + title, + 
path, + snippet, + file_type, + score, + }); + } + + Ok(results) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::index::indexer::{Indexer, LocalDocument}; + use tempfile::TempDir; + + fn build_index(dir: &Path, docs: Vec<(&str, &str)>) { + let mut indexer = Indexer::new(dir, false).unwrap(); + for (title, body) in docs { + indexer + .add_document(&LocalDocument { + title: title.to_string(), + body: body.to_string(), + path: format!("/docs/{}.md", title), + file_type: "md".to_string(), + }) + .unwrap(); + } + indexer.commit().unwrap(); + } + + #[test] + fn test_search_returns_matching_results() { + let dir = TempDir::new().unwrap(); + build_index( + dir.path(), + vec![ + ("rust guide", "Rust is a systems programming language"), + ("python guide", "Python is great for scripting"), + ], + ); + + let searcher = Searcher::new(dir.path()).unwrap(); + let results = searcher.search("rust systems", 10).unwrap(); + + assert!(!results.is_empty()); + assert_eq!(results[0].title, "rust guide"); + assert_eq!(results[0].file_type, "md"); + assert!(results[0].score > 0.0); + } + + #[test] + fn test_search_respects_limit() { + let dir = TempDir::new().unwrap(); + build_index( + dir.path(), + vec![ + ("doc1", "common keyword found here"), + ("doc2", "common keyword also here"), + ("doc3", "common keyword again"), + ], + ); + + let searcher = Searcher::new(dir.path()).unwrap(); + let results = searcher.search("common keyword", 2).unwrap(); + assert!(results.len() <= 2); + } + + #[test] + fn test_search_empty_index_returns_nothing() { + let dir = TempDir::new().unwrap(); + // Create empty index + let mut indexer = Indexer::new(dir.path(), false).unwrap(); + indexer.commit().unwrap(); + + let searcher = Searcher::new(dir.path()).unwrap(); + let results = searcher.search("anything", 10).unwrap(); + assert!(results.is_empty()); + } + + #[test] + fn test_search_no_match() { + let dir = TempDir::new().unwrap(); + build_index(dir.path(), vec![("hello", "world is great")]); 
+ + let searcher = Searcher::new(dir.path()).unwrap(); + let results = searcher.search("zzzznonexistent", 10).unwrap(); + assert!(results.is_empty()); + } + + #[test] + fn test_search_result_has_snippet() { + let dir = TempDir::new().unwrap(); + build_index( + dir.path(), + vec![("doc", "Short body text for testing snippets")], + ); + + let searcher = Searcher::new(dir.path()).unwrap(); + let results = searcher.search("testing snippets", 10).unwrap(); + + assert!(!results.is_empty()); + assert!(!results[0].snippet.is_empty()); + } + + #[test] + fn test_searcher_open_nonexistent_fails() { + let result = Searcher::new("/nonexistent/index/path"); + assert!(result.is_err()); + } +} diff --git a/src/main.rs b/src/main.rs index 7baf5ae..34102e3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,42 +1,560 @@ +mod cli; mod config; mod engines; mod fetchers; +mod index; mod tools; mod types; mod utils; +use std::io::IsTerminal; +use std::process; + use anyhow::Result; +use clap::Parser; use rmcp::{ServiceExt, transport::io::stdio}; use tracing_subscriber::{EnvFilter, fmt}; +use cli::{Cli, Command, OutputFormat}; use config::CONFIG; use tools::WebSearchHandler; +use tools::helpers::{ + ALL_ENGINES, do_multi_search, do_search, format_results, resolve_engines, results_to_text, +}; #[tokio::main] async fn main() -> Result<()> { + let cli = Cli::parse(); + + // If no subcommand and stdin is not a TTY β†’ auto-enter MCP mode + // (backward-compatible with Claude Desktop / Cursor which pipe stdin) + let command = cli.command.unwrap_or_else(|| { + if !std::io::stdin().is_terminal() { + Command::Mcp + } else { + // Show help and exit when invoked interactively without subcommand + let _ = ::command().print_help(); + println!(); + process::exit(0); + } + }); + + match command { + Command::Mcp => run_mcp().await, + Command::Search { + query, + engine, + limit, + format, + } => { + init_cli_logging(); + run_search(&query, &engine, limit, &format).await + } + Command::Fetch { + url, + 
max_chars, + format, + } => { + init_cli_logging(); + run_fetch(&url, max_chars, &format).await + } + Command::ListEngines { format } => { + run_list_engines(&format); + Ok(()) + } + Command::ShowConfig { format } => { + run_show_config(&format); + Ok(()) + } + Command::IndexLocal { + path, + index_path, + format, + } => { + init_cli_logging(); + run_index_local(&path, index_path.as_deref(), &format) + } + Command::SearchLocal { + query, + index_path, + limit, + format, + } => { + init_cli_logging(); + run_search_local(&query, index_path.as_deref(), limit, &format) + } + } +} + +fn init_cli_logging() { fmt() .with_writer(std::io::stderr) .with_env_filter( - EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")), + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")), ) .init(); +} + +// ─── mcp ──────────────────────────────────────────────────────────────────── - let mode = std::env::var("MODE") - .unwrap_or_else(|_| "stdio".into()) - .to_lowercase(); +async fn run_mcp() -> Result<()> { + fmt() + .with_writer(std::io::stderr) + .with_env_filter( + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")), + ) + .init(); - tracing::info!("ferris-search starting..."); + tracing::info!("ferris-search MCP server starting..."); tracing::info!("Default engine: {}", CONFIG.default_search_engine); - tracing::info!("Mode: {}", mode); - if mode == "stdio" || mode == "both" { - tracing::info!("Starting STDIO transport..."); - let service = WebSearchHandler::new() - .serve(stdio()) - .await - .map_err(|e| anyhow::anyhow!("Failed to start stdio transport: {}", e))?; - service.waiting().await?; + let service = WebSearchHandler::new() + .serve(stdio()) + .await + .map_err(|e| anyhow::anyhow!("Failed to start stdio transport: {}", e))?; + service.waiting().await?; + + Ok(()) +} + +// ─── search ───────────────────────────────────────────────────────────────── + +async fn run_search( + query: &str, + raw_engines: 
&[String], + limit: u32, + format: &OutputFormat, +) -> Result<()> { + let limit = limit.clamp(1, 50) as usize; + let engines = resolve_engines(raw_engines); + + if engines.is_empty() { + eprintln!("Error: no allowed engines specified."); + process::exit(2); + } + + if engines.len() == 1 { + match do_search(&engines[0], query, limit).await { + Ok(results) if results.is_empty() => { + println!("No results found."); + } + Ok(results) => match format { + OutputFormat::Text => println!("{}", format_results(&engines[0], &results)), + OutputFormat::Json => { + println!("{}", serde_json::to_string_pretty(&results)?); + } + }, + Err(e) => { + eprintln!("Search failed: {}", e); + process::exit(1); + } + } + return Ok(()); + } + + let multi = do_multi_search(&engines, query, limit).await; + + for (engine, err) in &multi.errors { + eprintln!("Warning: engine '{}' failed: {}", engine, err); + } + + if multi.results.is_empty() { + if multi.errors.is_empty() { + println!("No results found."); + } else { + eprintln!("All engines failed."); + process::exit(1); + } + return Ok(()); + } + + match format { + OutputFormat::Text => { + let mut total = 0usize; + let mut output = String::new(); + for (engine, results) in &multi.results { + total += results.len(); + output.push_str(&format!("## Results from {}\n\n", engine)); + output.push_str(&results_to_text(results)); + output.push('\n'); + } + println!("Total results: {}\n\n{}", total, output); + } + OutputFormat::Json => { + let all: Vec<_> = multi.results.into_iter().flat_map(|(_, r)| r).collect(); + println!("{}", serde_json::to_string_pretty(&all)?); + } + } + + Ok(()) +} + +// ─── fetch ────────────────────────────────────────────────────────────────── + +/// Check if a GitHub URL points to a repo root (owner/repo), not deeper paths. 
+fn is_github_repo_url(url: &str) -> bool { + let path = if let Some(p) = url.strip_prefix("https://github.com/") { + p + } else if let Some(p) = url.strip_prefix("http://github.com/") { + p + } else { + return false; + }; + // Strip query string and fragment before counting segments + let path = path.split(['?', '#']).next().unwrap_or(path); + let path = path.trim_end_matches('/'); + let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); + segments.len() == 2 +} + +async fn run_fetch(url: &str, max_chars: u32, format: &OutputFormat) -> Result<()> { + use fetchers::{ + csdn::fetch_csdn_article, github::fetch_github_readme, juejin::fetch_juejin_article, + linuxdo::fetch_linuxdo_article, web::fetch_web_content, zhihu::fetch_zhihu_article, + }; + use utils::url_safety::{is_public_http_url, is_url_from_host}; + + if !is_public_http_url(url) { + eprintln!("Error: URL must be a public HTTP/HTTPS URL."); + process::exit(2); + } + + // Auto-detect fetcher based on URL + let content: String = if is_github_repo_url(url) { + fetch_github_readme(url).await? + } else if is_url_from_host(url, "csdn.net") { + fetch_csdn_article(url).await? + } else if is_url_from_host(url, "juejin.cn") && url.contains("/post/") { + fetch_juejin_article(url).await? + } else if is_url_from_host(url, "zhihu.com") { + fetch_zhihu_article(url).await? + } else if is_url_from_host(url, "linux.do") && url.contains("/topic/") { + fetch_linuxdo_article(url).await? 
+ } else { + let result = fetch_web_content(url, Some(max_chars as usize)).await?; + match format { + OutputFormat::Text => { + let truncated_note = if result.truncated { + "\n\n[Content truncated]" + } else { + "" + }; + println!( + "Title: {}\nURL: {}\n\n{}{}", + result.title, result.url, result.content, truncated_note + ); + return Ok(()); + } + OutputFormat::Json => { + let obj = serde_json::json!({ + "title": result.title, + "url": result.url, + "content": result.content, + "truncated": result.truncated, + }); + println!("{}", serde_json::to_string_pretty(&obj)?); + return Ok(()); + } + } + }; + + match format { + OutputFormat::Text => println!("{}", content), + OutputFormat::Json => { + let obj = serde_json::json!({ + "url": url, + "content": content, + }); + println!("{}", serde_json::to_string_pretty(&obj)?); + } + } + + Ok(()) +} + +// ─── list-engines ─────────────────────────────────────────────────────────── + +fn run_list_engines(format: &OutputFormat) { + let allowed = &CONFIG.allowed_search_engines; + let default = &CONFIG.default_search_engine; + + match format { + OutputFormat::Text => { + println!("Supported engines:\n"); + for &engine in ALL_ENGINES { + let is_allowed = allowed.iter().any(|e| e == engine); + let is_default = engine == default; + let markers = match (is_default, is_allowed) { + (true, true) => " (default)", + (true, false) => " (default, disabled)", + (false, true) => " (allowed)", + (false, false) => " (disabled)", + }; + println!(" {}{}", engine, markers); + } + } + OutputFormat::Json => { + let obj = serde_json::json!({ + "default_engine": default, + "allowed_engines": allowed, + "all_engines": ALL_ENGINES, + }); + println!("{}", serde_json::to_string_pretty(&obj).unwrap()); + } + } +} + +// ─── show-config ──────────────────────────────────────────────────────────── + +fn mask_key(key: &Option) -> String { + match key { + None => "(not set)".into(), + Some(k) if k.chars().count() <= 8 => "***".into(), + Some(k) => { + let 
first4: String = k.chars().take(4).collect(); + let last4: String = k + .chars() + .rev() + .take(4) + .collect::>() + .into_iter() + .rev() + .collect(); + format!("{}...{}", first4, last4) + } + } +} + +fn run_show_config(format: &OutputFormat) { + let c = &*CONFIG; + + match format { + OutputFormat::Text => { + println!("ferris-search configuration\n"); + println!(" Default engine: {}", c.default_search_engine); + println!( + " Allowed engines: {}", + c.allowed_search_engines.join(", ") + ); + println!(" Proxy enabled: {}", c.use_proxy); + println!(" Proxy URL: {}", c.proxy_url); + println!(" BRAVE_API_KEY: {}", mask_key(&c.brave_api_key)); + println!(" EXA_API_KEY: {}", mask_key(&c.exa_api_key)); + println!(" FIRECRAWL_API_KEY: {}", mask_key(&c.firecrawl_api_key)); + println!(" JINA_API_KEY: {}", mask_key(&c.jina_api_key)); + println!(" TAVILY_API_KEY: {}", mask_key(&c.tavily_api_key)); + println!(" GITHUB_TOKEN: {}", mask_key(&c.github_token)); + println!(" Local index path: {}", c.local_docs_index_path); + let effective_ext: Vec = if c.local_docs_extensions.is_empty() { + index::collector::DEFAULT_EXTENSIONS + .iter() + .map(|s| s.to_string()) + .collect() + } else { + c.local_docs_extensions.clone() + }; + println!(" Local extensions: {}", effective_ext.join(", ")); + } + OutputFormat::Json => { + let obj = serde_json::json!({ + "default_search_engine": c.default_search_engine, + "allowed_search_engines": c.allowed_search_engines, + "use_proxy": c.use_proxy, + "proxy_url": c.proxy_url, + "brave_api_key": mask_key(&c.brave_api_key), + "exa_api_key": mask_key(&c.exa_api_key), + "firecrawl_api_key": mask_key(&c.firecrawl_api_key), + "jina_api_key": mask_key(&c.jina_api_key), + "tavily_api_key": mask_key(&c.tavily_api_key), + "github_token": mask_key(&c.github_token), + "local_docs_index_path": c.local_docs_index_path, + "local_docs_extensions": if c.local_docs_extensions.is_empty() { + serde_json::json!(index::collector::DEFAULT_EXTENSIONS) + } else { + 
serde_json::json!(c.local_docs_extensions) + }, + }); + println!("{}", serde_json::to_string_pretty(&obj).unwrap()); + } + } +} + +// ─── index-local ──────────────────────────────────────────────────────────── + +fn run_index_local( + paths: &[String], + index_path: Option<&str>, + format: &OutputFormat, +) -> Result<()> { + let idx_dir = index_path.unwrap_or(&CONFIG.local_docs_index_path); + let extensions = CONFIG.local_docs_extensions.clone(); + + let (docs, errors) = index::collector::collect_documents(paths, &extensions); + + for err in &errors { + eprintln!("Warning: {}", err); + } + + if docs.is_empty() { + eprintln!("No documents found to index."); + process::exit(1); + } + + let mut indexer = index::indexer::Indexer::new(idx_dir, true)?; + let mut indexed = 0usize; + let mut index_errors = 0usize; + for doc in &docs { + match indexer.add_document(doc) { + Ok(()) => indexed += 1, + Err(e) => { + eprintln!("Warning: failed to index {}: {}", doc.path, e); + index_errors += 1; + } + } + } + indexer.commit()?; + + let total_errors = errors.len() + index_errors; + let summary = serde_json::json!({ + "indexed": indexed, + "errors": total_errors, + "index_path": idx_dir, + }); + + match format { + OutputFormat::Text => { + println!( + "Indexed {} documents into {}\nWarnings: {}", + indexed, idx_dir, total_errors + ); + } + OutputFormat::Json => { + println!("{}", serde_json::to_string_pretty(&summary)?); + } + } + + if indexed == 0 { + eprintln!("Error: all documents failed to index."); + process::exit(1); + } + + Ok(()) +} + +// ─── search-local ─────────────────────────────────────────────────────────── + +fn run_search_local( + query: &str, + index_path: Option<&str>, + limit: u32, + format: &OutputFormat, +) -> Result<()> { + let idx_dir = index_path.unwrap_or(&CONFIG.local_docs_index_path); + let limit = limit.clamp(1, 50) as usize; + + let searcher = index::searcher::Searcher::new(idx_dir)?; + let results = searcher.search(query, limit)?; + + if 
results.is_empty() { + println!("No results found."); + return Ok(()); + } + + match format { + OutputFormat::Text => { + println!("Found {} results:\n", results.len()); + for (i, r) in results.iter().enumerate() { + println!( + "{}. **{}**\nPath: {}\nType: {} | Score: {:.4}\nSnippet: {}\n", + i + 1, + r.title, + r.path, + r.file_type, + r.score, + r.snippet + ); + } + } + OutputFormat::Json => { + println!("{}", serde_json::to_string_pretty(&results)?); + } } Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + // ─── is_github_repo_url ───────────────────────────────────────────── + + #[test] + fn github_repo_url_basic() { + assert!(is_github_repo_url("https://github.com/tokio-rs/tokio")); + assert!(is_github_repo_url("http://github.com/tokio-rs/tokio")); + } + + #[test] + fn github_repo_url_trailing_slash() { + assert!(is_github_repo_url("https://github.com/tokio-rs/tokio/")); + } + + #[test] + fn github_repo_url_query_and_fragment() { + assert!(is_github_repo_url( + "https://github.com/tokio-rs/tokio?tab=readme" + )); + assert!(is_github_repo_url( + "https://github.com/tokio-rs/tokio#readme" + )); + assert!(is_github_repo_url( + "https://github.com/tokio-rs/tokio?tab=readme#section" + )); + } + + #[test] + fn github_repo_url_deeper_paths_rejected() { + assert!(!is_github_repo_url( + "https://github.com/tokio-rs/tokio/issues" + )); + assert!(!is_github_repo_url( + "https://github.com/tokio-rs/tokio/blob/main/README.md" + )); + assert!(!is_github_repo_url( + "https://github.com/tokio-rs/tokio/pull/123" + )); + } + + #[test] + fn github_repo_url_non_github() { + assert!(!is_github_repo_url("https://gitlab.com/owner/repo")); + assert!(!is_github_repo_url("https://example.com/path")); + assert!(!is_github_repo_url("not a url")); + } + + #[test] + fn github_repo_url_owner_only_rejected() { + assert!(!is_github_repo_url("https://github.com/tokio-rs")); + assert!(!is_github_repo_url("https://github.com/tokio-rs/")); + } + + // ─── mask_key 
─────────────────────────────────────────────────────── + + #[test] + fn mask_key_none() { + assert_eq!(mask_key(&None), "(not set)"); + } + + #[test] + fn mask_key_short() { + assert_eq!(mask_key(&Some("abc".into())), "***"); + assert_eq!(mask_key(&Some("12345678".into())), "***"); + } + + #[test] + fn mask_key_long() { + assert_eq!(mask_key(&Some("abcdefghij".into())), "abcd...ghij"); + assert_eq!(mask_key(&Some("exa-1234567890".into())), "exa-...7890"); + } +} diff --git a/src/tools/handlers.rs b/src/tools/handlers.rs index 17a9f76..258dffd 100644 --- a/src/tools/handlers.rs +++ b/src/tools/handlers.rs @@ -1,12 +1,13 @@ -use super::helpers::{do_search, format_results, normalize_engine, results_to_text}; +use super::helpers::{ + do_multi_search, do_search, format_results, resolve_engines, results_to_text, +}; use super::params::{ArticleUrlParams, FetchUrlParams, GithubReadmeParams, WebSearchParams}; use crate::{ - config::CONFIG, fetchers::{ csdn::fetch_csdn_article, github::fetch_github_readme, juejin::fetch_juejin_article, linuxdo::fetch_linuxdo_article, web::fetch_web_content, zhihu::fetch_zhihu_article, }, - utils::url_safety::is_public_http_url, + utils::url_safety::{is_public_http_url, is_url_from_host}, }; use rmcp::{ ServerHandler, @@ -39,24 +40,14 @@ impl WebSearchHandler { let params = p.0; let limit = params.limit.unwrap_or(10).clamp(1, 50) as usize; - // Resolve engine list - let engines: Vec = match params.engines { - Some(list) if !list.is_empty() => list.iter().map(|e| normalize_engine(e)).collect(), - _ => vec![CONFIG.default_search_engine.clone()], - }; - - // Filter against allowed engines - let engines: Vec = engines - .into_iter() - .filter(|e| CONFIG.is_engine_allowed(e)) - .collect(); + let raw: Vec = params.engines.unwrap_or_default(); + let engines = resolve_engines(&raw); if engines.is_empty() { return "No allowed engines specified.".into(); } if engines.len() == 1 { - // Single engine β€” simple path match do_search(&engines[0], 
¶ms.query, limit).await { Ok(results) if results.is_empty() => return "No results found.".into(), Ok(results) => return format_results(&engines[0], &results), @@ -64,34 +55,35 @@ impl WebSearchHandler { } } - // Multi-engine fan-out - let mut handles = Vec::new(); - for engine in engines.clone() { - let query = params.query.clone(); - handles.push(tokio::spawn(async move { - let res = do_search(&engine, &query, limit).await; - (engine, res) - })); + let multi = do_multi_search(&engines, ¶ms.query, limit).await; + + if multi.results.is_empty() { + if multi.errors.is_empty() { + return "No results found.".into(); + } + let err_detail: Vec = multi + .errors + .iter() + .map(|(e, msg)| format!("{}: {}", e, msg)) + .collect(); + return format!("All engines failed:\n{}", err_detail.join("\n")); } let mut output = String::new(); let mut total = 0usize; - for handle in handles { - if let Ok((engine, Ok(results))) = handle.await - && !results.is_empty() - { - total += results.len(); - output.push_str(&format!("## Results from {}\n\n", engine)); - output.push_str(&results_to_text(&results)); - output.push('\n'); + for (engine, results) in &multi.results { + total += results.len(); + output.push_str(&format!("## Results from {}\n\n", engine)); + output.push_str(&results_to_text(results)); + output.push('\n'); + } + if !multi.errors.is_empty() { + output.push_str("\n## Warnings\n\n"); + for (engine, msg) in &multi.errors { + output.push_str(&format!("- Engine '{}' failed: {}\n", engine, msg)); } } - - if output.is_empty() { - "No results found.".into() - } else { - format!("Total results: {}\n\n{}", total, output) - } + format!("Total results: {}\n\n{}", total, output) } /// Fetch the content of a web page @@ -128,7 +120,7 @@ impl WebSearchHandler { )] pub async fn fetch_github_readme_tool(&self, p: Parameters) -> String { let url = p.0.url; - if !url.contains("github.com") { + if !is_url_from_host(&url, "github.com") { return "URL must be from github.com".into(); } match 
fetch_github_readme(&url).await { @@ -144,7 +136,7 @@ impl WebSearchHandler { )] pub async fn fetch_csdn_article_tool(&self, p: Parameters) -> String { let url = p.0.url; - if !url.contains("csdn.net") { + if !is_url_from_host(&url, "csdn.net") { return "URL must be from csdn.net".into(); } match fetch_csdn_article(&url).await { @@ -160,7 +152,7 @@ impl WebSearchHandler { )] pub async fn fetch_juejin_article_tool(&self, p: Parameters) -> String { let url = p.0.url; - if !url.contains("juejin.cn") || !url.contains("/post/") { + if !is_url_from_host(&url, "juejin.cn") || !url.contains("/post/") { return "URL must be from juejin.cn and contain /post/ path".into(); } match fetch_juejin_article(&url).await { @@ -176,7 +168,7 @@ impl WebSearchHandler { )] pub async fn fetch_zhihu_article_tool(&self, p: Parameters) -> String { let url = p.0.url; - if !url.contains("zhihu.com") { + if !is_url_from_host(&url, "zhihu.com") { return "URL must be from zhihu.com".into(); } match fetch_zhihu_article(&url).await { @@ -192,7 +184,7 @@ impl WebSearchHandler { )] pub async fn fetch_linuxdo_article_tool(&self, p: Parameters) -> String { let url = p.0.url; - if !url.contains("linux.do") || !url.contains("/topic/") { + if !is_url_from_host(&url, "linux.do") || !url.contains("/topic/") { return "URL must be from linux.do and contain /topic/ path".into(); } match fetch_linuxdo_article(&url).await { diff --git a/src/tools/helpers.rs b/src/tools/helpers.rs index 10cfbee..b618f26 100644 --- a/src/tools/helpers.rs +++ b/src/tools/helpers.rs @@ -1,4 +1,5 @@ use crate::{ + config::CONFIG, engines::{ baidu::search_baidu, bing::search_bing, @@ -88,3 +89,166 @@ pub async fn do_search( other => anyhow::bail!("Unknown search engine: {}", other), } } + +/// All supported engine names. 
+pub const ALL_ENGINES: &[&str] = &[ + "baidu", + "bing", + "brave", + "csdn", + "duckduckgo", + "exa", + "firecrawl", + "github", + "github_code", + "jina", + "juejin", + "linuxdo", + "tavily", + "zhihu", +]; + +/// Resolve a list of engine names: normalize, filter against allowed list, fall back to default. +pub fn resolve_engines(raw: &[String]) -> Vec<String> { + let engines: Vec<String> = if raw.is_empty() { + vec![CONFIG.default_search_engine.clone()] + } else { + raw.iter().map(|e| normalize_engine(e)).collect() + }; + engines + .into_iter() + .filter(|e| CONFIG.is_engine_allowed(e)) + .collect() +} + +/// Result from a multi-engine search: successes and failures. +pub struct MultiSearchResult { + pub results: Vec<(String, Vec)>, + pub errors: Vec<(String, String)>, +} + +/// Multi-engine concurrent search. Returns successes and failures separately. +pub async fn do_multi_search(engines: &[String], query: &str, limit: usize) -> MultiSearchResult { + let mut handles = Vec::new(); + for engine in engines { + let engine = engine.clone(); + let query = query.to_string(); + handles.push(tokio::spawn(async move { + let res = do_search(&engine, &query, limit).await; + (engine, res) + })); + } + + let mut results = Vec::new(); + let mut errors = Vec::new(); + for handle in handles { + match handle.await { + Ok((engine, Ok(r))) => { + if !r.is_empty() { + results.push((engine, r)); + } + } + Ok((engine, Err(e))) => { + errors.push((engine, e.to_string())); + } + Err(e) => { + errors.push(("unknown".to_string(), e.to_string())); + } + } + } + MultiSearchResult { results, errors } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn normalize_standard_names() { + assert_eq!(normalize_engine("bing"), "bing"); + assert_eq!(normalize_engine("duckduckgo"), "duckduckgo"); + assert_eq!(normalize_engine("brave"), "brave"); + assert_eq!(normalize_engine("github"), "github"); + assert_eq!(normalize_engine("github_code"), "github_code"); + } + + #[test] + fn normalize_aliases() { + 
assert_eq!(normalize_engine("ddg"), "duckduckgo"); + assert_eq!(normalize_engine("duck duck go"), "duckduckgo"); + assert_eq!(normalize_engine("microsoft bing"), "bing"); + assert_eq!(normalize_engine("brave search"), "brave"); + assert_eq!(normalize_engine("github repos"), "github"); + assert_eq!(normalize_engine("github code"), "github_code"); + } + + #[test] + fn normalize_chinese_aliases() { + assert_eq!(normalize_engine("百度"), "baidu"); + assert_eq!(normalize_engine("掘金"), "juejin"); + assert_eq!(normalize_engine("知乎"), "zhihu"); + } + + #[test] + fn normalize_case_insensitive() { + assert_eq!(normalize_engine("BING"), "bing"); + assert_eq!(normalize_engine("DuckDuckGo"), "duckduckgo"); + assert_eq!(normalize_engine(" Brave "), "brave"); + } + + #[test] + fn normalize_unknown_passthrough() { + assert_eq!(normalize_engine("unknown_engine"), "unknown_engine"); + } + + #[test] + fn resolve_engines_uses_default_when_empty() { + let resolved = resolve_engines(&[]); + + if CONFIG.is_engine_allowed(&CONFIG.default_search_engine) { + assert_eq!(resolved, vec![CONFIG.default_search_engine.clone()]); + } else { + assert!(resolved.is_empty()); + } + } + + #[test] + fn resolve_engines_normalizes_and_filters() { + let raw = vec![ + " ddg ".to_string(), + "github code".to_string(), + "unknown_engine".to_string(), + ]; + + let resolved = resolve_engines(&raw); + + let mut expected = Vec::new(); + if CONFIG.is_engine_allowed("duckduckgo") { + expected.push("duckduckgo".to_string()); + } + if CONFIG.is_engine_allowed("github_code") { + expected.push("github_code".to_string()); + } + + assert_eq!(resolved, expected); + assert!( + resolved + .iter() + .all(|engine| CONFIG.is_engine_allowed(engine)) + ); + } + + #[tokio::test] + async fn do_multi_search_collects_unknown_engine_errors() { + let engines = vec!["unknown_engine".to_string(), "another_unknown".to_string()]; + + let result = do_multi_search(&engines, "rust", 3).await; + + assert!(result.results.is_empty()); + 
assert_eq!(result.errors.len(), 2); + assert_eq!(result.errors[0].0, "unknown_engine"); + assert!(result.errors[0].1.contains("Unknown search engine")); + assert_eq!(result.errors[1].0, "another_unknown"); + assert!(result.errors[1].1.contains("Unknown search engine")); + } +} diff --git a/src/tools/mod.rs b/src/tools/mod.rs index c68b633..16d7f08 100644 --- a/src/tools/mod.rs +++ b/src/tools/mod.rs @@ -1,5 +1,5 @@ mod handlers; -mod helpers; +pub mod helpers; mod params; pub use handlers::WebSearchHandler; diff --git a/src/utils/url_safety.rs b/src/utils/url_safety.rs index b97e49a..7dd4a25 100644 --- a/src/utils/url_safety.rs +++ b/src/utils/url_safety.rs @@ -67,3 +67,84 @@ pub fn assert_public_http_url(url: &str) -> anyhow::Result<()> { anyhow::bail!("URL is not a public HTTP/HTTPS URL: {}", url) } } + +/// Check if a URL's host matches the given domain (or is a subdomain of it). +pub fn is_url_from_host(raw: &str, domain: &str) -> bool { + let Ok(url) = Url::parse(raw) else { + return false; + }; + match url.host_str() { + Some(host) => host == domain || host.ends_with(&format!(".{}", domain)), + None => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn public_urls_accepted() { + assert!(is_public_http_url("https://example.com")); + assert!(is_public_http_url("http://example.com/path")); + assert!(is_public_http_url("https://github.com/owner/repo")); + } + + #[test] + fn non_http_rejected() { + assert!(!is_public_http_url("ftp://example.com")); + assert!(!is_public_http_url("file:///etc/passwd")); + assert!(!is_public_http_url("javascript:alert(1)")); + } + + #[test] + fn private_ips_rejected() { + assert!(!is_public_http_url("http://127.0.0.1")); + assert!(!is_public_http_url("http://10.0.0.1/path")); + assert!(!is_public_http_url("http://192.168.1.1")); + assert!(!is_public_http_url("http://172.16.0.1")); + } + + #[test] + fn localhost_rejected() { + assert!(!is_public_http_url("http://localhost")); + 
assert!(!is_public_http_url("http://localhost:8080")); + assert!(!is_public_http_url("http://foo.localhost")); + } + + #[test] + fn invalid_urls_rejected() { + assert!(!is_public_http_url("not a url")); + assert!(!is_public_http_url("")); + } + + #[test] + fn is_url_from_host_exact_match() { + assert!(is_url_from_host( + "https://github.com/owner/repo", + "github.com" + )); + assert!(is_url_from_host( + "https://blog.csdn.net/article", + "csdn.net" + )); + assert!(is_url_from_host( + "https://www.zhihu.com/question/1", + "zhihu.com" + )); + } + + #[test] + fn is_url_from_host_rejects_spoofed() { + assert!(!is_url_from_host( + "http://evil.com/github.com", + "github.com" + )); + assert!(!is_url_from_host( + "http://169.254.169.254/csdn.net", + "csdn.net" + )); + assert!(!is_url_from_host("http://evil.com/zhihu.com", "zhihu.com")); + assert!(!is_url_from_host("not a url", "github.com")); + } +} diff --git a/tests/cli.rs b/tests/cli.rs new file mode 100644 index 0000000..d81a40a --- /dev/null +++ b/tests/cli.rs @@ -0,0 +1,257 @@ +use assert_cmd::Command; +use tempfile::TempDir; + +fn cmd() -> Command { + Command::cargo_bin("ferris-search").unwrap() +} + +#[test] +fn help_flag_succeeds() { + cmd().arg("--help").assert().success(); +} + +#[test] +fn version_flag_succeeds() { + cmd().arg("--version").assert().success(); +} + +#[test] +fn list_engines_text() { + cmd() + .arg("list-engines") + .assert() + .success() + .stdout(predicates::str::contains("bing")); +} + +#[test] +fn list_engines_json() { + cmd() + .args(["list-engines", "--format", "json"]) + .assert() + .success() + .stdout(predicates::str::contains("\"all_engines\"")); +} + +#[test] +fn show_config_text() { + cmd() + .arg("show-config") + .assert() + .success() + .stdout(predicates::str::contains("Default engine")); +} + +#[test] +fn show_config_json() { + cmd() + .args(["show-config", "--format", "json"]) + .assert() + .success() + .stdout(predicates::str::contains("\"default_search_engine\"")); +} + 
+#[test] +fn unknown_subcommand_fails() { + cmd().arg("nonexistent").assert().failure(); +} + +#[test] +fn search_missing_query_fails() { + cmd().arg("search").assert().failure(); +} + +#[test] +fn fetch_missing_url_fails() { + cmd().arg("fetch").assert().failure(); +} + +// ─── index-local ──────────────────────────────────────────────────────────── + +#[test] +fn index_local_missing_path_fails() { + cmd().arg("index-local").assert().failure(); +} + +#[test] +fn index_local_nonexistent_path_fails() { + let dir = TempDir::new().unwrap(); + let bad_path = dir.path().join("does_not_exist"); + + cmd() + .args(["index-local", "--path"]) + .arg(&bad_path) + .assert() + .failure(); +} + +#[test] +fn index_local_indexes_files() { + let docs = TempDir::new().unwrap(); + let idx = TempDir::new().unwrap(); + + std::fs::write(docs.path().join("hello.md"), "# Hello\n\nWorld").unwrap(); + std::fs::write(docs.path().join("notes.txt"), "Some notes").unwrap(); + + cmd() + .arg("index-local") + .arg("--path") + .arg(docs.path()) + .arg("--index-path") + .arg(idx.path()) + .assert() + .success() + .stdout(predicates::str::contains("Indexed 2 documents")); +} + +#[test] +fn index_local_json_output_is_valid() { + let docs = TempDir::new().unwrap(); + let idx = TempDir::new().unwrap(); + + std::fs::write(docs.path().join("a.md"), "# A\n\nContent").unwrap(); + + let output = cmd() + .arg("index-local") + .arg("--path") + .arg(docs.path()) + .arg("--index-path") + .arg(idx.path()) + .args(["--format", "json"]) + .output() + .unwrap(); + + assert!(output.status.success()); + + let json: serde_json::Value = + serde_json::from_slice(&output.stdout).expect("index-local JSON output must be valid JSON"); + assert_eq!(json["indexed"], 1); + assert!(json["index_path"].is_string()); +} + +// ─── search-local ─────────────────────────────────────────────────────────── + +#[test] +fn search_local_missing_query_fails() { + cmd().arg("search-local").assert().failure(); +} + +#[test] +fn 
search_local_missing_index_fails() { + let dir = TempDir::new().unwrap(); + let bad_idx = dir.path().join("no_index_here"); + + cmd() + .args(["search-local", "anything", "--index-path"]) + .arg(&bad_idx) + .assert() + .failure(); +} + +#[test] +fn search_local_returns_results() { + let docs = TempDir::new().unwrap(); + let idx = TempDir::new().unwrap(); + + std::fs::write( + docs.path().join("rust.md"), + "# Rust Guide\n\nRust is a systems programming language focused on safety.", + ) + .unwrap(); + + // Build index first + cmd() + .arg("index-local") + .arg("--path") + .arg(docs.path()) + .arg("--index-path") + .arg(idx.path()) + .assert() + .success(); + + // Search + cmd() + .args(["search-local", "rust systems", "--index-path"]) + .arg(idx.path()) + .assert() + .success() + .stdout(predicates::str::contains("Rust Guide")); +} + +#[test] +fn search_local_json_output_is_valid() { + let docs = TempDir::new().unwrap(); + let idx = TempDir::new().unwrap(); + + std::fs::write( + docs.path().join("test.md"), + "# Test Doc\n\nSome searchable content here.", + ) + .unwrap(); + + cmd() + .arg("index-local") + .arg("--path") + .arg(docs.path()) + .arg("--index-path") + .arg(idx.path()) + .assert() + .success(); + + let output = cmd() + .args([ + "search-local", + "searchable content", + "--format", + "json", + "--index-path", + ]) + .arg(idx.path()) + .output() + .unwrap(); + + assert!(output.status.success()); + + let json: serde_json::Value = serde_json::from_slice(&output.stdout) + .expect("search-local JSON output must be valid JSON"); + assert!(json.is_array()); + let arr = json.as_array().unwrap(); + assert!(!arr.is_empty()); + assert!(arr[0]["title"].is_string()); + assert!(arr[0]["path"].is_string()); + assert!(arr[0]["score"].is_number()); +} + +// ─── existing JSON output structure validation ────────────────────────────── + +#[test] +fn list_engines_json_is_valid() { + let output = cmd() + .args(["list-engines", "--format", "json"]) + .output() + .unwrap(); + 
+ assert!(output.status.success()); + + let json: serde_json::Value = + serde_json::from_slice(&output.stdout).expect("list-engines JSON must be valid JSON"); + assert!(json["default_engine"].is_string()); + assert!(json["allowed_engines"].is_array()); + assert!(json["all_engines"].is_array()); +} + +#[test] +fn show_config_json_is_valid() { + let output = cmd() + .args(["show-config", "--format", "json"]) + .output() + .unwrap(); + + assert!(output.status.success()); + + let json: serde_json::Value = + serde_json::from_slice(&output.stdout).expect("show-config JSON must be valid JSON"); + assert!(json["default_search_engine"].is_string()); + assert!(json["allowed_search_engines"].is_array()); + assert!(json["local_docs_index_path"].is_string()); +}