diff --git a/Cargo.toml b/Cargo.toml index daff399a..63692a3a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,24 +11,9 @@ keywords = ["semantic-search", "document-parsing", "cli", "pdf", "search"] readme = "README.md" [[bin]] -name = "parse" -path = "src/bin/parse.rs" -required-features = ["parse"] - -[[bin]] -name = "search" -path = "src/bin/search.rs" -required-features = ["search"] - -[[bin]] -name = "workspace" -path = "src/bin/workspace.rs" -required-features = ["workspace", "search"] - -[[bin]] -name = "ask" -path = "src/bin/ask.rs" -required-features = ["ask", "search"] +name = "semtools" +path = "src/bin/semtools.rs" +required-features = ["ask", "search", "workspace", "parse"] [dependencies] # Common dependencies diff --git a/README.md b/README.md index cdb08e35..076a80f6 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,12 @@ > Semantic search and document parsing tools for the command line -A collection of high-performance CLI tools for document processing and semantic search, built with Rust for speed and reliability. +A high-performance CLI tool for document processing and semantic search, built with Rust for speed and reliability. -- **`parse`** - Parse documents (PDF, DOCX, etc.) using, by default, the LlamaParse API into markdown format -- **`search`** - Local semantic keyword search using multilingual embeddings with cosine similarity matching and per-line context matching -- **`ask`** - AI agent with search and read tools for answering questions over document collections (defaults to OpenAI, but see the [config section](#configuration) to learn more about connecting to any OpenAI-Compatible API) -- **`workspace`** - Workspace management for accelerating search over large collections +- **`semtools parse`** - Parse documents (PDF, DOCX, etc.) 
using, by default, the LlamaParse API into markdown format +- **`semtools search`** - Local semantic keyword search using multilingual embeddings with cosine similarity matching and per-line context matching +- **`semtools ask`** - AI agent with search and read tools for answering questions over document collections (defaults to OpenAI, but see the [config section](#configuration) to learn more about connecting to any OpenAI-Compatible API) +- **`semtools workspace`** - Workspace management for accelerating search over large collections **NOTE:** By default, `parse` uses LlamaParse as a backend. Get your API key today for free at [https://cloud.llamaindex.ai](https://cloud.llamaindex.ai). `search` and `workspace` remain local-only. `ask` requires an OpenAI API key. @@ -25,7 +25,8 @@ A collection of high-performance CLI tools for document processing and semantic Prerequisites: -- For the `parse` tool: LlamaIndex Cloud API key +- For the `parse` subcommand: LlamaIndex Cloud API key +- For the `ask` subcommand: OpenAI API key Install: @@ -53,41 +54,41 @@ Basic Usage: ```bash # Parse some files -parse my_dir/*.pdf +semtools parse my_dir/*.pdf # Search some (text-based) files -search "some keywords" *.txt --max-distance 0.3 --n-lines 5 +semtools search "some keywords" *.txt --max-distance 0.3 --n-lines 5 # Ask questions about your documents using an AI agent -ask "What are the main findings?" papers/*.txt +semtools ask "What are the main findings?" papers/*.txt # Combine parsing and search -parse my_docs/*.pdf | xargs search "API endpoints" +semtools parse my_docs/*.pdf | xargs semtools search "API endpoints" # Ask a question to a set of files -ask "Some question?" *.txt +semtools ask "Some question?" *.txt # Combine parsing with the ask agent -parse research_papers/*.pdf | xargs ask "Summarize the key methodologies" +semtools parse research_papers/*.pdf | xargs semtools ask "Summarize the key methodologies" # Ask based on stdin content -cat README.md | ask "How do I install SemTools?" 
+cat README.md | semtools ask "How do I install SemTools?" ``` Advanced Usage: ```bash # Combine with grep for exact-match pre-filtering and distance thresholding -parse *.pdf | xargs cat | grep -i "error" | search "network error" --max-distance 0.3 +semtools parse *.pdf | xargs cat | grep -i "error" | semtools search "network error" --max-distance 0.3 # Pipeline with content search (note the 'xargs' on search to search files instead of stdin) -find . -name "*.md" | xargs parse | xargs search "installation" +find . -name "*.md" | xargs semtools parse | xargs semtools search "installation" # Combine with grep for filtering (grep could be before or after parse/search!) -parse docs/*.pdf | xargs search "API" | grep -A5 "authentication" +semtools parse docs/*.pdf | xargs semtools search "API" | grep -A5 "authentication" # Save search results from stdin search -parse report.pdf | xargs cat | search "summary" > results.txt +semtools parse report.pdf | xargs cat | semtools search "summary" > results.txt ``` Using Workspaces: @@ -95,7 +96,7 @@ Using Workspaces: ```bash # Create or select a workspace # Workspaces are stored in ~/.semtools/workspaces/ -workspace use my-workspace +semtools workspace use my-workspace > Workspace 'my-workspace' configured. 
> To activate it, run: > export SEMTOOLS_WORKSPACE=my-workspace @@ -107,17 +108,17 @@ export SEMTOOLS_WORKSPACE=my-workspace # All search commands will now use the workspace for caching embeddings # The initial command is used to initialize the workspace -search "some keywords" ./some_large_dir/*.txt --n-lines 5 --top-k 10 +semtools search "some keywords" ./some_large_dir/*.txt --n-lines 5 --top-k 10 # If documents change, they are automatically re-embedded and cached echo "some new content" > ./some_large_dir/some_file.txt -search "some keywords" ./some_large_dir/*.txt --n-lines 5 --top-k 10 +semtools search "some keywords" ./some_large_dir/*.txt --n-lines 5 --top-k 10 # If documents are removed, you can run prune to clean up stale files -workspace prune +semtools workspace prune # You can see the stats of a workspace at any time -workspace status +semtools workspace status > Active workspace: arxiv > Root: /Users/loganmarkewich/.semtools/workspaces/arxiv > Documents: 3000 @@ -127,10 +128,10 @@ workspace status ## CLI Help ```bash -$ parse --help +$ semtools parse --help A CLI tool for parsing documents using various backends -Usage: parse [OPTIONS] ... +Usage: semtools parse [OPTIONS] ... Arguments: ... Files to parse @@ -140,33 +141,32 @@ Options: -b, --backend The backend type to use for parsing. Defaults to `llama-parse` [default: llama-parse] -v, --verbose Verbose output while parsing -h, --help Print help - -V, --version Print version ``` ```bash -$ search --help +$ semtools search --help A CLI tool for fast semantic keyword search -Usage: search [OPTIONS] [FILES]... +Usage: semtools search [OPTIONS] [FILES]... Arguments: Query to search for (positional argument) - [FILES]... Files or directories to search + [FILES]... 
Files to search, optional if using stdin Options: -n, --n-lines How many lines before/after to return as context [default: 3] --top-k The top-k files or texts to return (ignored if max_distance is set) [default: 3] -m, --max-distance Return all results with distance below this threshold (0.0+) -i, --ignore-case Perform case-insensitive search (default is false) + -j, --json Output results in JSON format -h, --help Print help - -V, --version Print version ``` ```bash -$ workspace --help +$ semtools workspace --help Manage semtools workspaces -Usage: workspace +Usage: semtools workspace [OPTIONS] Commands: use Use or create a workspace (prints export command to run) @@ -175,15 +175,15 @@ Commands: help Print this message or the help of the given subcommand(s) Options: - -h, --help Print help - -V, --version Print version + -j, --json Output results in JSON format + -h, --help Print help ``` ```bash -$ ask --help -A CLI tool for fast semantic keyword search +$ semtools ask --help +A CLI tool for document-based question-answering -Usage: ask [OPTIONS] [FILES]... +Usage: semtools ask [OPTIONS] [FILES]... Arguments: Query to prompt the agent with @@ -194,8 +194,9 @@ Options: --api-key OpenAI API key (overrides config file and env var) --base-url OpenAI base URL (overrides config file) -m, --model Model to use for the agent (overrides config file) + --api-mode API mode to use: 'chat' or 'responses' (overrides config file) + -j, --json Output results in JSON or text format -h, --help Print help - -V, --version Print version ``` ## Configuration @@ -260,11 +261,11 @@ Configuration values are resolved in the following priority order (highest to lo This allows you to set common defaults in the config file while overriding them on a per-command basis when needed. -### Tool-Specific Configuration +### Subcommand-Specific Configuration -#### Parse Tool +#### Parse Subcommand -The `parse` tool requires a LlamaParse API key. 
Get your free API key at [https://cloud.llamaindex.ai](https://cloud.llamaindex.ai). +The `parse` subcommand requires a LlamaParse API key. Get your free API key at [https://cloud.llamaindex.ai](https://cloud.llamaindex.ai). Configuration options: - `api_key`: Your LlamaParse API key @@ -273,9 +274,9 @@ Configuration options: - `parse_kwargs`: Additional parsing parameters - `check_interval`, `max_timeout`, `max_retries`, `retry_delay_ms`, `backoff_multiplier`: Retry and timeout settings -#### Ask Tool +#### Ask Subcommand -The `ask` tool requires an OpenAI API key for the agent's LLM. +The `ask` subcommand requires an OpenAI API key for the agent's LLM. Configuration options: - `api_key`: Your OpenAI API key @@ -285,7 +286,7 @@ Configuration options: You can also override these per-command: ```bash -ask "What is this about?" docs/*.txt --model gpt-4o --api-key sk-... +semtools ask "What is this about?" docs/*.txt --model gpt-4o --api-key sk-... ``` ## Agent Use Case Examples diff --git a/examples/example_CLAUDE.md b/examples/example_CLAUDE.md index e6cf3bab..75c942c0 100644 --- a/examples/example_CLAUDE.md +++ b/examples/example_CLAUDE.md @@ -1,58 +1,58 @@ # Augmented CLI Tooling -If executing bash commands, you have three very helpful utilities installed +If executing bash commands, you have three very helpful subcommands installed within `semtools`: - `parse` -- converts any non grep-able format into markdown, outputs a filepath for a converted markdown file for every input file to stdin - `search` -- performs a search using static embeddings on either stdin or a list of files (very similar to grep). Works best with keyword based search queries. Only works with text-based files so it may require the `parse` tool to help preprocess into markdown. 
- `workspace` - Workspace management for accelerating search over large collections These commands, combined with other CLI commands, you can ensure that you can search large amounts of files efficiently, while handling various formats of documents. Both `parse` and `search` can scale to hundreds of thousands of documents. -## Parse CLI Help +## Parse Subcommand Help ```bash -parse --help +$ semtools parse --help A CLI tool for parsing documents using various backends -Usage: parse [OPTIONS] ... +Usage: semtools parse [OPTIONS] ... Arguments: ... Files to parse Options: - -c, --parse-config Path to the config file. Defaults to ~/.parse_config.json - -b, --backend The backend type to use for parsing. Defaults to `llama-parse` [default: llama-parse] - -h, --help Print help - -V, --version Print version + -c, --config Path to the config file. Defaults to ~/.semtools_config.json + -b, --backend The backend type to use for parsing. Defaults to `llama-parse` [default: llama-parse] + -v, --verbose Verbose output while parsing + -h, --help Print help ``` -## Search CLI Help +## Search Subcommand Help ```bash -search --help +$ semtools search --help A CLI tool for fast semantic keyword search -Usage: search [OPTIONS] [FILES]... +Usage: semtools search [OPTIONS] [FILES]... Arguments: Query to search for (positional argument) - [FILES]... Files or directories to search + [FILES]... 
Files to search, optional if using stdin Options: -n, --n-lines How many lines before/after to return as context [default: 3] --top-k The top-k files or texts to return (ignored if max_distance is set) [default: 3] -m, --max-distance Return all results with distance below this threshold (0.0+) -i, --ignore-case Perform case-insensitive search (default is false) + -j, --json Output results in JSON format -h, --help Print help - -V, --version Print version ``` -## Workspaces CLI Help +## Workspaces Subcommand Help ```bash -workspace --help +$ semtools workspace --help Manage semtools workspaces -Usage: workspace +Usage: semtools workspace [OPTIONS] Commands: use Use or create a workspace (prints export command to run) @@ -61,55 +61,55 @@ Commands: help Print this message or the help of the given subcommand(s) Options: - -h, --help Print help - -V, --version Print version + -j, --json Output results in JSON format + -h, --help Print help ``` ## Common Usage Patterns +Here's how to convert those standalone commands to `semtools` subcommands: + ### Using Parse and Search ```bash # Parse a PDF and search for specific content -parse document.pdf | xargs cat | search "error handling" +semtools parse document.pdf | xargs cat | semtools search "error handling" # Search within many files after parsing -parse my_docs/*.pdf | xargs search "API endpoints" +semtools parse my_docs/*.pdf | xargs semtools search "API endpoints" # Search with custom context and thresholds or distance thresholds -search "machine learning" *.txt --n-lines 5 --max-distance 0.3 +semtools search "machine learning" *.txt --n-lines 5 --max-distance 0.3 # Search from stdin -echo "some text content" | search "content" +echo "some text content" | semtools search "content" # Parse multiple documents -parse report.pdf data.xlsx presentation.pptx +semtools parse report.pdf data.xlsx presentation.pptx # Chain parsing with semantic search -parse *.pdf | xargs search "financial projections" --n-lines 3 +semtools 
parse *.pdf | xargs semtools search "financial projections" --n-lines 3 # Search with distance threshold (lower = more similar) -parse document.pdf | xargs cat | search "revenue" --max-distance 0.2 +semtools parse document.pdf | xargs cat | semtools search "revenue" --max-distance 0.2 # Search multiple files directly -search "error handling" src/*.rs --top-k 5 +semtools search "error handling" src/*.rs --top-k 5 # Combine with grep for exact-match pre-filtering and distance thresholding -parse *.pdf | xargs cat | grep -i "error" | search "network error" --max-distance 0.3 +semtools parse *.pdf | xargs cat | grep -i "error" | semtools search "network error" --max-distance 0.3 # Pipeline with content search (note the 'cat') -find . -name "*.md" | xargs parse | xargs search "installation" +find . -name "*.md" | xargs semtools parse | xargs semtools search "installation" ``` - - ### Using with Workspaces ```bash # Create or select a workspace # Workspaces are stored in ~/.semtools/workspaces/ -workspace use my-workspace +semtools workspace use my-workspace > Workspace 'my-workspace' configured. 
> To activate it, run: > export SEMTOOLS_WORKSPACE=my-workspace @@ -121,44 +121,44 @@ export SEMTOOLS_WORKSPACE=my-workspace # All search commands will now use the workspace for caching embeddings # The initial command is used to initialize the workspace -search "some keywords" ./some_large_dir/*.txt --n-lines 5 --top-k 10 +semtools search "some keywords" ./some_large_dir/*.txt --n-lines 5 --top-k 10 # If documents change, they are automatically re-embedded and cached echo "some new content" > ./some_large_dir/some_file.txt -search "some keywords" ./some_large_dir/*.txt --n-lines 5 --top-k 10 - +semtools search "some keywords" ./some_large_dir/*.txt --n-lines 5 --top-k 10 # A workspace example if you are using with parse # create a workspace -workspace use my-workspace2 +semtools workspace use my-workspace2 export SEMTOOLS_WORKSPACE=my-workspace2 + # parse files, and then search over the parsed files, and cache the file embeddings -parse *.pdf | xargs search "financial projections" --n-lines 3 +semtools parse *.pdf | xargs semtools search "financial projections" --n-lines 3 + # if you run the command with a different query (see option a and b), over the same set of files, then search will operate # over the cached file embeddings # option a - parse won't rerun since files already cached -parse *.pdf | xargs search "balance sheet" --n-lines 3 +semtools parse *.pdf | xargs semtools search "balance sheet" --n-lines 3 + # option b - run search directly over the parse cache -xargs search "balance sheet" /Users/jerryliu/.parse/*.pdf.md --n-lines 3 +xargs semtools search "balance sheet" /Users/jerryliu/.parse/*.pdf.md --n-lines 3 # If documents are removed, you can run prune to clean up stale files -workspace prune +semtools workspace prune # You can see the stats of a workspace at any time -workspace status +semtools workspace status > Active workspace: arxiv > Root: /Users/loganmarkewich/.semtools/workspaces/arxiv > Documents: 3000 > Index: Yes (IVF_PQ) - ``` - ## Tips 
for using these tools -- If you have run / plan on running repeated `search` queries over the same file or set of files, you SHOULD create a workspace (`workspace use`) before running parse/search commands - otherwise you will be re-embedding the same document collections from scratch every time. Make sure the environment variable is set before downstream commands. -- Before you create a workspace, you can check current workspace through `workspace status` which will also give the directory where all workspaces are stored. -- You can choose to add a new workspace or prune an existing one if you are changing to a different collection of files. You can check the status through `workspace status`. +- If you have run / plan on running repeated `search` queries over the same file or set of files, you SHOULD create a workspace (`semtools workspace use`) before running parse/search commands - otherwise you will be re-embedding the same document collections from scratch every time. Make sure the environment variable is set before downstream commands. +- Before you create a workspace, you can check current workspace through `semtools workspace status` which will also give the directory where all workspaces are stored. +- You can choose to add a new workspace or prune an existing one if you are changing to a different collection of files. You can check the status through `semtools workspace status`. - `parse` will always output paths of parsed files to stdin. These parsed files represent the markdown version of their original file (for example, parsing a PDF or DOCX file into markdown). - ALWAYS call `parse` first when interacting with PDF (or similar) formats so that you can get the paths to the markdown versions of those files - `search` only works with text-based files (like markdown). 
It's a common pattern to first call `parse` and either feed files into `search` or cat files and search from stdin diff --git a/examples/use_with_coding_agents.md b/examples/use_with_coding_agents.md index 93e23c18..2f3818c0 100644 --- a/examples/use_with_coding_agents.md +++ b/examples/use_with_coding_agents.md @@ -42,10 +42,10 @@ In this example, I have a folder of 900+ papers (as PDFs) from an AI conference. First, I parse the PDFs (which caches them to disk at `~/.parse`): ```bash -parse ./papers +semtools parse ./papers ``` -Next, I invoke `claude` and ask it to summarize its takes on aspects of the papers. Using `parse`, `search`, and other common CLI commands, Claude is able to gather specific information from all of the papers. +Next, I invoke `claude` and ask it to summarize its takes on aspects of the papers. Using `semtools parse`, `semtools search`, and other common CLI commands, Claude is able to gather specific information from all of the papers. Below is the full trace from Claude-Code, which demonstrates it's ability to use `semtools` like `parse` and `search` to gather information from your files. @@ -82,7 +82,7 @@ Below is the full trace from Claude-Code, which demonstrates it's ability to use ⏺ I can see there are 910 ACL 2023 papers in PDF format. Now I'll parse all these PDFs to make them searchable, then look for papers discussing LLMs and evaluations. -⏺ Bash(parse *.pdf) +⏺ Bash(semtools parse *.pdf) ⎿  /Users/loganmarkewich/.parse/2023.acl-long.0.pdf.md /Users/loganmarkewich/.parse/2023.acl-long.1.pdf.md /Users/loganmarkewich/.parse/2023.acl-long.10.pdf.md @@ -96,7 +96,7 @@ Below is the full trace from Claude-Code, which demonstrates it's ability to use ⏺ Great! Now I have parsed all the PDFs. Let me search for papers related to LLMs and evaluations using the search tool. 
-⏺ Bash(parse *.pdf | xargs cat | search "large language model, evaluation, assessment, benchmark" --n-lines 5 --max-distance 0.4) +⏺ Bash(semtools parse *.pdf | xargs cat | semtools search "large language model, evaluation, assessment, benchmark" --n-lines 5 --max-distance 0.4) ⎿  :447094::447105 (0.16900031210055844) 447095: for Computational Linguistics. 447096: @@ -104,7 +104,7 @@ Below is the full trace from Claude-Code, which demonstrates it's ability to use ⏺ Now let me search for more specific terms to get a broader view of LLM evaluation research: -⏺ Bash(parse *.pdf | xargs cat | search "GPT, BERT, language model evaluation, performance evaluation, model comparison" --n-lines 5 +⏺ Bash(semtools parse *.pdf | xargs cat | semtools search "GPT, BERT, language model evaluation, performance evaluation, model comparison" --n-lines 5 --max-distance 0.35) ⎿  :244769::244780 (0.2719643874147169) 244770: @@ -113,7 +113,7 @@ Below is the full trace from Claude-Code, which demonstrates it's ability to use ⏺ Now let me search for specific types of evaluation studies and findings: -⏺ Bash(parse *.pdf | xargs cat | search "robustness, bias, fairness, hallucination, reliability" --n-lines 4 --max-distance 0.3) +⏺ Bash(semtools parse *.pdf | xargs cat | semtools search "robustness, bias, fairness, hallucination, reliability" --n-lines 4 --max-distance 0.3) ⎿  Using cached result for: 2023.acl-long.0.pdf Using cached result for: 2023.acl-long.1.pdf Using cached result for: 2023.acl-long.10.pdf @@ -129,7 +129,7 @@ Below is the full trace from Claude-Code, which demonstrates it's ability to use ⏺ Let me search for paper titles and abstracts to get a better understanding of the specific papers discussing LLM evaluation: -⏺ Bash(parse *.pdf | xargs cat | search "Abstract" --n-lines 10 | grep -A 10 -B 2 -i "large language model\|evaluation\|benchmark\|GPT\|BERT") +⏺ Bash(semtools parse *.pdf | xargs cat | semtools search "Abstract" --n-lines 10 | grep -A 10 -B 2 -i "large 
language model\|evaluation\|benchmark\|GPT\|BERT") ⎿  :94456::94477 (0.00000000009809730805202874) 94457: Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought 94458: Reasoning by Large Language Models @@ -203,5 +203,5 @@ This run cost $0.68 and took about 4 minutes. Not bad for researching across 900 ## Tips - Your `CLAUDE.md` / `AGENTS.md` / etc. is critical to ensure your agent knows about `semtools`. Giving it good examples of usage (maybe from this guide) is a good idea. -- Parsing files can take a while. I recommend using `parse` ahead of time if you intend to give your agent a lot of files to work with that need to be parsed. Results will be cached to disk at `~/.parse`. +- Parsing files can take a while. I recommend using `semtools parse` ahead of time if you intend to give your agent a lot of files to work with that need to be parsed. Results will be cached to disk at `~/.parse`. - Giving your agent any context on the files it has access to will also help it understand how to help you with them. diff --git a/examples/use_with_mcp.md b/examples/use_with_mcp.md index b06a386d..be21ac93 100644 --- a/examples/use_with_mcp.md +++ b/examples/use_with_mcp.md @@ -59,7 +59,7 @@ Here's an example where I prompt it to interact with a directory of 900+ PDF fil First, I parse the PDFs (which caches them to disk at `~/.parse`): ```bash -parse ./papers +semtools parse ./papers ``` Then I can write a script to call an agent to interact with the files and search for information: @@ -109,13 +109,13 @@ Which outputs: $ python ./agent.py Calling tool execute_bash({'command': 'find . -name "*.pdf" | head -10'}) Calling tool execute_bash({'command': 'find . -name "*.pdf" | wc -l'}) -Calling tool execute_bash({'command': 'find . 
-name "*.pdf" | xargs parse'}) -Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs search "large language model, LLM, evaluation, benchmark" --top-k 10 --n-lines 5'}) -Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs search "GPT, ChatGPT, instruction tuning, evaluation benchmark, model evaluation" --top-k 15 --n-lines 4'}) -Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs search "human evaluation, automatic evaluation, BLEU, ROUGE, BERTScore, evaluation metric" --top-k 10 --n-lines 4'}) -Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs search "scaling laws, emergent abilities, few-shot learning, in-context learning, prompt engineering" --top-k 10 --n-lines 4'}) -Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs search "title, abstract" --top-k 5 --n-lines 10 | head -50'}) -Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs search "# " --top-k 20 --n-lines 2 | grep -E "(LLM|evaluation|benchmark|GPT|language model)" | head -20'}) +Calling tool execute_bash({'command': 'find . 
-name "*.pdf" | xargs semtools parse'}) +Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs semtools search "large language model, LLM, evaluation, benchmark" --top-k 10 --n-lines 5'}) +Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs semtools search "GPT, ChatGPT, instruction tuning, evaluation benchmark, model evaluation" --top-k 15 --n-lines 4'}) +Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs semtools search "human evaluation, automatic evaluation, BLEU, ROUGE, BERTScore, evaluation metric" --top-k 10 --n-lines 4'}) +Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs semtools search "scaling laws, emergent abilities, few-shot learning, in-context learning, prompt engineering" --top-k 10 --n-lines 4'}) +Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs semtools search "title, abstract" --top-k 5 --n-lines 10 | head -50'}) +Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | xargs semtools search "# " --top-k 20 --n-lines 2 | grep -E "(LLM|evaluation|benchmark|GPT|language model)" | head -20'}) Calling tool execute_bash({'command': 'find /Users/loganmarkewich/.parse -name "*.md" | head -5 | xargs -I {} sh -c \'echo "=== {} ===" && head -20 "{}"\''}) Based on my analysis of the ACL 2023 papers, I can now provide a comprehensive summary of the key findings about LLMs and evaluations: @@ -182,4 +182,3 @@ The ACL 2023 papers collectively suggest that: The research community is moving toward more nuanced, multi-dimensional evaluation approaches that better capture the complexities of language understanding and generation in real-world applications. 
``` - diff --git a/src/bin/parse.rs b/src/bin/parse.rs deleted file mode 100644 index 8d6c910c..00000000 --- a/src/bin/parse.rs +++ /dev/null @@ -1,68 +0,0 @@ -use anyhow::Result; -use clap::Parser; -use std::path::Path; - -use semtools::{LlamaParseBackend, SemtoolsConfig}; - -#[derive(Parser, Debug)] -#[command(version, about = "A CLI tool for parsing documents using various backends", long_about = None)] -struct Args { - /// Path to the config file. Defaults to ~/.semtools_config.json - #[clap(short = 'c', long)] - config: Option, - - /// The backend type to use for parsing. Defaults to `llama-parse` - #[clap(short, long, default_value = "llama-parse")] - backend: String, - - /// Files to parse - #[clap(required = true)] - files: Vec, - - /// Verbose output while parsing - #[clap(short, long)] - verbose: bool, -} - -#[tokio::main] -async fn main() -> Result<()> { - let args = Args::parse(); - - // Get config file path - let config_path = args - .config - .unwrap_or_else(SemtoolsConfig::default_config_path); - - // Load configuration - let semtools_config = SemtoolsConfig::from_config_file(&config_path)?; - let parse_config = semtools_config.parse.unwrap_or_default(); - - // Validate that files exist - for file in &args.files { - if !Path::new(file).exists() { - eprintln!("Warning: File does not exist: {file}"); - } - } - - // Create backend and process files - match args.backend.as_str() { - "llama-parse" => { - let backend = LlamaParseBackend::new(parse_config, args.verbose)?; - let results = backend.parse(args.files).await?; - - // Output the paths to parsed files, one per line - for result_path in results { - println!("{result_path}"); - } - } - _ => { - eprintln!( - "Error: Unknown backend '{}'. 
Supported backends: llama-parse", - args.backend - ); - std::process::exit(1); - } - } - - Ok(()) -} diff --git a/src/bin/semtools.rs b/src/bin/semtools.rs new file mode 100644 index 00000000..e3ea3ec6 --- /dev/null +++ b/src/bin/semtools.rs @@ -0,0 +1,181 @@ +use clap::{Parser, Subcommand}; +use semtools::cmds::ask::ask_cmd; +use semtools::cmds::parse::parse_cmd; +use semtools::cmds::search::search_cmd; +use semtools::cmds::workspace::{workspace_prune_cmd, workspace_status_cmd, workspace_use_cmd}; + +#[derive(Parser, Debug)] +struct SemtoolsArgs { + #[command(subcommand)] + cmd: Commands, +} + +#[derive(Subcommand, Debug)] +enum WorkspaceCommands { + /// Use or create a workspace (prints export command to run) + Use { name: String }, + /// Show active workspace and basic stats + Status, + /// Remove stale or missing files from store + Prune {}, +} + +#[derive(Subcommand, Debug)] +enum Commands { + #[cfg(feature = "parse")] + /// A CLI tool for parsing documents using various backends + Parse { + /// Path to the config file. Defaults to ~/.semtools_config.json + #[clap(short = 'c', long)] + config: Option, + + /// The backend type to use for parsing. 
Defaults to `llama-parse` + #[clap(short, long, default_value = "llama-parse")] + backend: String, + + /// Files to parse + #[clap(required = true)] + files: Vec, + + /// Verbose output while parsing + #[clap(short, long)] + verbose: bool, + }, + #[cfg(feature = "search")] + /// A CLI tool for fast semantic keyword search + Search { + /// Query to search for (positional argument) + query: String, + + /// Files to search (positional arguments, optional if using stdin) + #[arg(help = "Files to search, optional if using stdin")] + files: Vec, + + /// How many lines before/after to return as context + #[arg(short = 'n', long = "n-lines", alias = "context", default_value_t = 3)] + n_lines: usize, + + /// The top-k files or texts to return (ignored if max_distance is set) + #[arg(long, default_value_t = 3)] + top_k: usize, + + /// Return all results with distance below this threshold (0.0+) + #[arg(short = 'm', long = "max-distance", alias = "threshold")] + max_distance: Option, + + /// Perform case-insensitive search (default is false) + #[arg(short, long, default_value_t = false)] + ignore_case: bool, + + /// Output results in JSON format + #[clap(short, long)] + json: bool, + }, + #[cfg(feature = "ask")] + /// A CLI tool for document-based question-answering + Ask { + /// Query to prompt the agent with + query: String, + + /// Files to search (positional arguments, optional if using stdin) + #[arg(help = "Files to search, optional if using stdin")] + files: Vec, + + /// Path to the config file. 
Defaults to ~/.semtools_config.json + #[clap(short = 'c', long)] + config: Option, + + /// OpenAI API key (overrides config file and env var) + #[clap(long)] + api_key: Option, + + /// OpenAI base URL (overrides config file) + #[clap(long)] + base_url: Option, + + /// Model to use for the agent (overrides config file) + #[clap(short, long)] + model: Option, + + /// API mode to use: 'chat' or 'responses' (overrides config file) + #[clap(long)] + api_mode: Option, + + /// Output results in JSON or text format + #[clap(short, long)] + json: bool, + }, + #[cfg(feature = "workspace")] + /// Manage semtools workspaces + Workspace { + /// Output results in JSON format + #[clap(short, long, global = true)] + json: bool, + + #[command(subcommand)] + command: WorkspaceCommands, + }, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let args = SemtoolsArgs::parse(); + match args.cmd { + Commands::Ask { + query, + files, + config, + api_key, + base_url, + model, + api_mode, + json, + } => { + ask_cmd( + query, files, config, api_key, base_url, model, api_mode, json, + ) + .await?; + } + Commands::Parse { + config, + backend, + files, + verbose, + } => { + parse_cmd(config, backend, files, verbose).await?; + } + Commands::Search { + query, + files, + n_lines, + top_k, + max_distance, + ignore_case, + json, + } => { + search_cmd( + query, + files, + n_lines, + top_k, + max_distance, + ignore_case, + json, + ) + .await?; + } + Commands::Workspace { json, command } => match command { + WorkspaceCommands::Use { name } => { + workspace_use_cmd(name, json).await?; + } + WorkspaceCommands::Prune {} => { + workspace_prune_cmd(json).await?; + } + WorkspaceCommands::Status => { + workspace_status_cmd(json).await?; + } + }, + } + + Ok(()) +} diff --git a/src/bin/workspace.rs b/src/bin/workspace.rs deleted file mode 100644 index b57a1382..00000000 --- a/src/bin/workspace.rs +++ /dev/null @@ -1,200 +0,0 @@ -use anyhow::{Context, Result}; -use clap::{Parser, Subcommand}; - 
-#[cfg(feature = "workspace")] -use semtools::workspace::{Workspace, WorkspaceConfig, store::Store}; - -use semtools::json_mode::{PruneOutput, WorkspaceOutput}; - -#[cfg(not(feature = "workspace"))] -use semtools::json_mode::ErrorOutput; - -#[derive(Parser, Debug)] -#[command(version, about = "Manage semtools workspaces", long_about = None)] -struct Args { - /// Output results in JSON format - #[clap(short, long, global = true)] - json: bool, - - #[command(subcommand)] - command: Commands, -} - -#[derive(Subcommand, Debug)] -enum Commands { - /// Use or create a workspace (prints export command to run) - Use { name: String }, - /// Show active workspace and basic stats - Status, - /// Remove stale or missing files from store - Prune {}, -} - -#[tokio::main] -async fn main() -> Result<()> { - let args = Args::parse(); - - match args.command { - Commands::Use { name } => { - #[cfg(feature = "workspace")] - { - // Initialize new workspace configuration - let ws = Workspace { - config: WorkspaceConfig { - name: name.clone(), - root_dir: Workspace::root_path(&name)?, - ..Default::default() - }, - }; - ws.save()?; - - if args.json { - // Try to get document count from store, or use 0 for new workspace - let total_documents = if let Ok(store) = Store::open(&ws.config.root_dir) { - if let Ok(stats) = store.get_stats() { - stats.total_documents - } else { - 0 - } - } else { - 0 - }; - - let output = WorkspaceOutput { - name: ws.config.name.clone(), - root_dir: ws.config.root_dir.clone(), - total_documents, - }; - let json_output = serde_json::to_string_pretty(&output)?; - println!("{}", json_output); - } else { - println!("Workspace '{name}' configured."); - println!("To activate it, run:"); - println!(" export SEMTOOLS_WORKSPACE={name}"); - println!(); - println!("Or add this to your shell profile (.bashrc, .zshrc, etc.)"); - } - } - #[cfg(not(feature = "workspace"))] - { - if args.json { - let error_output = ErrorOutput { - error: "workspace feature not 
enabled".to_string(), - error_type: "FeatureNotEnabled".to_string(), - }; - let json_output = serde_json::to_string_pretty(&error_output)?; - eprintln!("{}", json_output); - } else { - println!("workspace feature not enabled"); - } - } - } - Commands::Status => { - #[cfg(feature = "workspace")] - { - let _name = Workspace::active().context("No active workspace")?; - let ws = Workspace::open()?; - - // Open store and get stats - let store = Store::open(&ws.config.root_dir)?; - let stats = store.get_stats()?; - - if args.json { - let output = WorkspaceOutput { - name: ws.config.name.clone(), - root_dir: ws.config.root_dir.clone(), - total_documents: stats.total_documents, - }; - let json_output = serde_json::to_string_pretty(&output)?; - println!("{}", json_output); - } else { - println!("Active workspace: {}", ws.config.name); - println!("Root: {}", ws.config.root_dir); - println!("Documents: {}", stats.total_documents); - if stats.has_index { - let index_info = stats.index_type.unwrap_or_else(|| "Unknown".to_string()); - println!("Index: Yes ({index_info})"); - } else { - println!("Index: No"); - } - } - } - #[cfg(not(feature = "workspace"))] - { - if args.json { - let error_output = ErrorOutput { - error: "workspace feature not enabled".to_string(), - error_type: "FeatureNotEnabled".to_string(), - }; - let json_output = serde_json::to_string_pretty(&error_output)?; - eprintln!("{}", json_output); - } else { - println!("workspace feature not enabled"); - } - } - } - Commands::Prune {} => { - #[cfg(feature = "workspace")] - { - let _name = Workspace::active().context("No active workspace")?; - let ws = Workspace::open()?; - let store = Store::open(&ws.config.root_dir)?; - - // Get all document paths from the workspace - let all_paths = store.get_all_document_paths()?; - let total_before = all_paths.len(); - - // Check which files no longer exist - let mut missing_paths = Vec::new(); - for path in &all_paths { - if !std::path::Path::new(path).exists() { - 
missing_paths.push(path.clone()); - } - } - - let files_removed = missing_paths.len(); - let files_remaining = total_before - files_removed; - - if !missing_paths.is_empty() { - // Remove stale documents - store.delete_documents(&missing_paths)?; - } - - if args.json { - let output = PruneOutput { - files_removed, - files_remaining, - }; - let json_output = serde_json::to_string_pretty(&output)?; - println!("{}", json_output); - } else if missing_paths.is_empty() { - println!("No stale documents found. Workspace is clean."); - } else { - println!("Found {} stale documents:", missing_paths.len()); - for path in &missing_paths { - println!(" - {path}"); - } - println!( - "Removed {} stale documents from workspace.", - missing_paths.len() - ); - } - } - #[cfg(not(feature = "workspace"))] - { - if args.json { - let error_output = ErrorOutput { - error: "workspace feature not enabled".to_string(), - error_type: "FeatureNotEnabled".to_string(), - }; - let json_output = serde_json::to_string_pretty(&error_output)?; - eprintln!("{}", json_output); - } else { - println!("workspace feature not enabled"); - } - } - } - } - - Ok(()) -} diff --git a/src/bin/ask.rs b/src/cmds/ask.rs similarity index 62% rename from src/bin/ask.rs rename to src/cmds/ask.rs index 28c07892..1b1c8350 100644 --- a/src/bin/ask.rs +++ b/src/cmds/ask.rs @@ -1,72 +1,40 @@ use anyhow::Result; use async_openai::Client; use async_openai::config::OpenAIConfig; -use clap::Parser; use model2vec_rs::model::StaticModel; use std::io::{self, BufRead, IsTerminal}; -use semtools::SemtoolsConfig; -use semtools::ask::chat_agent::{ask_agent, ask_agent_with_stdin}; -use semtools::ask::responses_agent::{ask_agent_responses, ask_agent_responses_with_stdin}; -use semtools::config::ApiMode; -use semtools::json_mode::ErrorOutput; -use semtools::search::MODEL_NAME; - -#[derive(Parser, Debug)] -#[command(version, about = "A CLI tool for fast semantic keyword search", long_about = None)] -struct Args { - /// Query to prompt the 
agent with - query: String, +use crate::SemtoolsConfig; +use crate::ask::chat_agent::{ask_agent, ask_agent_with_stdin}; +use crate::ask::responses_agent::{ask_agent_responses, ask_agent_responses_with_stdin}; +use crate::config::ApiMode; +use crate::json_mode::ErrorOutput; +use crate::search::MODEL_NAME; - /// Files to search (positional arguments, optional if using stdin) - #[arg(help = "Files to search, optional if using stdin")] - files: Vec, +fn read_from_stdin() -> Result> { + let stdin = io::stdin(); + let lines: Result, _> = stdin.lock().lines().collect(); + Ok(lines?) +} - /// Path to the config file. Defaults to ~/.semtools_config.json - #[clap(short = 'c', long)] +#[allow(clippy::too_many_arguments)] +pub async fn ask_cmd( + query: String, + files: Vec, config: Option, - - /// OpenAI API key (overrides config file and env var) - #[clap(long)] api_key: Option, - - /// OpenAI base URL (overrides config file) - #[clap(long)] base_url: Option, - - /// Model to use for the agent (overrides config file) - #[clap(short, long)] model: Option, - - /// API mode to use: 'chat' or 'responses' (overrides config file) - #[clap(long)] api_mode: Option, - - /// Output results in JSON or text format - #[clap(short, long)] json: bool, -} - -fn read_from_stdin() -> Result> { - let stdin = io::stdin(); - let lines: Result, _> = stdin.lock().lines().collect(); - Ok(lines?) 
-} - -#[tokio::main] -async fn main() -> Result<()> { - let args = Args::parse(); - +) -> Result<()> { // Load configuration - let config_path = args - .config - .unwrap_or_else(SemtoolsConfig::default_config_path); + let config_path = config.unwrap_or_else(SemtoolsConfig::default_config_path); let semtools_config = SemtoolsConfig::from_config_file(&config_path)?; let ask_config = semtools_config.ask.unwrap_or_default(); // Resolve API key with priority: CLI arg > config file > env var > error - let api_key = args - .api_key + let api_key = api_key .or(ask_config.api_key) .or_else(|| std::env::var("OPENAI_API_KEY").ok()) .ok_or_else(|| { @@ -76,11 +44,10 @@ async fn main() -> Result<()> { })?; // Resolve base URL with priority: CLI arg > config file > default - let base_url = args.base_url.or(ask_config.base_url); + let base_url = base_url.or(ask_config.base_url); // Resolve model with priority: CLI arg > config file > default - let model_name = args - .model + let model_name = model .or(ask_config.model) .unwrap_or_else(|| "gpt-4o-mini".to_string()); @@ -88,7 +55,7 @@ async fn main() -> Result<()> { let max_iterations = ask_config.max_iterations; // Resolve API mode with priority: CLI arg > config file > default - let api_mode = if let Some(mode_str) = args.api_mode { + let api_mode = if let Some(mode_str) = api_mode { match mode_str.to_lowercase().as_str() { "chat" => ApiMode::Chat, "responses" => ApiMode::Responses, @@ -111,7 +78,7 @@ async fn main() -> Result<()> { let client = Client::with_config(openai_config); // Check if we have stdin input (no files and stdin is not a terminal) - if args.files.is_empty() && !io::stdin().is_terminal() { + if files.is_empty() && !io::stdin().is_terminal() { let stdin_lines = read_from_stdin()?; if !stdin_lines.is_empty() { let stdin_content = stdin_lines.join("\n"); @@ -119,20 +86,15 @@ async fn main() -> Result<()> { // Run the appropriate agent with stdin content (no tools) let output = match api_mode { ApiMode::Chat => { 
- ask_agent_with_stdin(&stdin_content, &args.query, &client, &model_name).await? + ask_agent_with_stdin(&stdin_content, &query, &client, &model_name).await? } ApiMode::Responses => { - ask_agent_responses_with_stdin( - &stdin_content, - &args.query, - &client, - &model_name, - ) - .await? + ask_agent_responses_with_stdin(&stdin_content, &query, &client, &model_name) + .await? } }; - if args.json { + if json { let json_output = serde_json::to_string_pretty(&output)?; println!("\n{}", json_output); } else { @@ -144,10 +106,10 @@ async fn main() -> Result<()> { } // If no stdin, we need files to search through - if args.files.is_empty() { + if files.is_empty() { let error_msg = "No input provided. Either specify files as arguments or pipe input to stdin."; - if args.json { + if json { let error_output = ErrorOutput { error: error_msg.to_string(), error_type: "NoInput".to_string(), @@ -172,30 +134,14 @@ async fn main() -> Result<()> { // Run the appropriate agent based on API mode let output = match api_mode { ApiMode::Chat => { - ask_agent( - args.files, - &args.query, - &model, - &client, - &model_name, - max_iterations, - ) - .await? + ask_agent(files, &query, &model, &client, &model_name, max_iterations).await? } ApiMode::Responses => { - ask_agent_responses( - args.files, - &args.query, - &model, - &client, - &model_name, - max_iterations, - ) - .await? + ask_agent_responses(files, &query, &model, &client, &model_name, max_iterations).await? 
} }; - if args.json { + if json { let json_output = serde_json::to_string_pretty(&output)?; println!("\n{}", json_output); } else { diff --git a/src/cmds/mod.rs b/src/cmds/mod.rs new file mode 100644 index 00000000..a358ce32 --- /dev/null +++ b/src/cmds/mod.rs @@ -0,0 +1,11 @@ +#[cfg(feature = "ask")] +pub mod ask; + +#[cfg(feature = "parse")] +pub mod parse; + +#[cfg(feature = "search")] +pub mod search; + +#[cfg(feature = "workspace")] +pub mod workspace; diff --git a/src/cmds/parse.rs b/src/cmds/parse.rs new file mode 100644 index 00000000..e97ac728 --- /dev/null +++ b/src/cmds/parse.rs @@ -0,0 +1,47 @@ +use anyhow::Result; +use std::path::Path; + +use crate::{LlamaParseBackend, SemtoolsConfig}; + +pub async fn parse_cmd( + config: Option, + backend: String, + files: Vec, + verbose: bool, +) -> Result<()> { + // Get config file path + let config_path = config.unwrap_or_else(SemtoolsConfig::default_config_path); + + // Load configuration + let semtools_config = SemtoolsConfig::from_config_file(&config_path)?; + let parse_config = semtools_config.parse.unwrap_or_default(); + + // Validate that files exist + for file in &files { + if !Path::new(file).exists() { + eprintln!("Warning: File does not exist: {file}"); + } + } + + // Create backend and process files + match backend.as_str() { + "llama-parse" => { + let backend = LlamaParseBackend::new(parse_config, verbose)?; + let results = backend.parse(files).await?; + + // Output the paths to parsed files, one per line + for result_path in results { + println!("{result_path}"); + } + } + _ => { + eprintln!( + "Error: Unknown backend '{}'. 
Supported backends: llama-parse", + backend + ); + std::process::exit(1); + } + } + + Ok(()) +} diff --git a/src/bin/search.rs b/src/cmds/search.rs similarity index 79% rename from src/bin/search.rs rename to src/cmds/search.rs index 2a2fc24b..399d22ea 100644 --- a/src/bin/search.rs +++ b/src/cmds/search.rs @@ -1,50 +1,18 @@ use anyhow::Result; -use clap::Parser; use model2vec_rs::model::StaticModel; use std::io::{self, BufRead, IsTerminal}; #[cfg(feature = "workspace")] -use semtools::workspace::{Workspace, store::RankedLine}; +use crate::workspace::{Workspace, store::RankedLine}; #[cfg(feature = "workspace")] -use semtools::search::search_with_workspace; +use crate::search::search_with_workspace; -use semtools::json_mode::{ErrorOutput, SearchOutput, SearchResultJSON}; -use semtools::search::{ +use crate::json_mode::{ErrorOutput, SearchOutput, SearchResultJSON}; +use crate::search::{ Document, MODEL_NAME, SearchConfig, SearchResult, search_documents, search_files, }; -#[derive(Parser, Debug)] -#[command(version, about = "A CLI tool for fast semantic keyword search", long_about = None)] -struct Args { - /// Query to search for (positional argument) - query: String, - - /// Files to search (positional arguments, optional if using stdin) - #[arg(help = "Files to search, optional if using stdin")] - files: Vec, - - /// How many lines before/after to return as context - #[arg(short = 'n', long = "n-lines", alias = "context", default_value_t = 3)] - n_lines: usize, - - /// The top-k files or texts to return (ignored if max_distance is set) - #[arg(long, default_value_t = 3)] - top_k: usize, - - /// Return all results with distance below this threshold (0.0+) - #[arg(short = 'm', long = "max-distance", alias = "threshold")] - max_distance: Option, - - /// Perform case-insensitive search (default is false) - #[arg(short, long, default_value_t = false)] - ignore_case: bool, - - /// Output results in JSON format - #[clap(short, long)] - json: bool, -} - fn read_from_stdin() 
-> Result> { let stdin = io::stdin(); let lines: Result, _> = stdin.lock().lines().collect(); @@ -141,10 +109,15 @@ fn print_workspace_search_results(ranked_lines: &[RankedLine], n_lines: usize) { } } -#[tokio::main] -async fn main() -> Result<()> { - let args = Args::parse(); - +pub async fn search_cmd( + query: String, + files: Vec, + n_lines: usize, + top_k: usize, + max_distance: Option, + ignore_case: bool, + json: bool, +) -> Result<()> { let model = StaticModel::from_pretrained( MODEL_NAME, // "minishlab/potion-multilingual-128M", None, // Optional: Hugging Face API token for private models @@ -152,25 +125,25 @@ async fn main() -> Result<()> { None, // Optional: subfolder if model files are not at the root of the repo/path )?; - let query = if args.ignore_case { - args.query.to_lowercase() + let query = if ignore_case { + query.to_lowercase() } else { - args.query.clone() + query.clone() }; let query_embedding = model.encode_single(&query); let config = SearchConfig { - n_lines: args.n_lines, - top_k: args.top_k, - max_distance: args.max_distance, - ignore_case: args.ignore_case, + n_lines, + top_k, + max_distance, + ignore_case, }; // Handle stdin input (non-workspace mode) - if args.files.is_empty() && !io::stdin().is_terminal() { + if files.is_empty() && !io::stdin().is_terminal() { let stdin_lines = read_from_stdin()?; if !stdin_lines.is_empty() { - let lines_for_embedding = if args.ignore_case { + let lines_for_embedding = if ignore_case { stdin_lines.iter().map(|s| s.to_lowercase()).collect() } else { stdin_lines.clone() @@ -186,7 +159,7 @@ async fn main() -> Result<()> { let search_results = search_documents(&documents, &query_embedding, &config); - if args.json { + if json { let output = SearchOutput { results: search_results.iter().map(search_result_to_json).collect(), }; @@ -200,10 +173,10 @@ async fn main() -> Result<()> { } } - if args.files.is_empty() { + if files.is_empty() { let error_msg = "No input provided. 
Either specify files as arguments or pipe input to stdin."; - if args.json { + if json { let error_output = ErrorOutput { error: error_msg.to_string(), error_type: "NoInput".to_string(), @@ -222,21 +195,21 @@ async fn main() -> Result<()> { if Workspace::active().is_ok() { // Workspace mode: use persisted line embeddings for speed let config = SearchConfig { - n_lines: args.n_lines, - top_k: args.top_k, - max_distance: args.max_distance, - ignore_case: args.ignore_case, + n_lines, + top_k, + max_distance, + ignore_case, }; - let ranked_lines = search_with_workspace(&args.files, &query, &model, &config).await?; + let ranked_lines = search_with_workspace(&files, &query, &model, &config).await?; - if args.json { + if json { // Convert workspace results to SearchResultJSON let results: Vec = ranked_lines .iter() .map(|ranked_line| { let match_line_number = ranked_line.line_number as usize; - let start = match_line_number.saturating_sub(args.n_lines); - let end = match_line_number + args.n_lines + 1; + let start = match_line_number.saturating_sub(n_lines); + let end = match_line_number + n_lines + 1; // Read file content for the result let content = @@ -264,12 +237,12 @@ async fn main() -> Result<()> { let json_output = serde_json::to_string_pretty(&output)?; println!("{}", json_output); } else { - print_workspace_search_results(&ranked_lines, args.n_lines); + print_workspace_search_results(&ranked_lines, n_lines); } } else { - let search_results = search_files(&args.files, &query, &model, &config)?; + let search_results = search_files(&files, &query, &model, &config)?; - if args.json { + if json { let output = SearchOutput { results: search_results.iter().map(search_result_to_json).collect(), }; @@ -283,9 +256,9 @@ async fn main() -> Result<()> { #[cfg(not(feature = "workspace"))] { - let search_results = search_files(&args.files, &query, &model, &config)?; + let search_results = search_files(&files, &query, &model, &config)?; - if args.json { + if json { let output = 
SearchOutput { results: search_results.iter().map(search_result_to_json).collect(), }; diff --git a/src/cmds/workspace.rs b/src/cmds/workspace.rs new file mode 100644 index 00000000..c083eb07 --- /dev/null +++ b/src/cmds/workspace.rs @@ -0,0 +1,174 @@ +use anyhow::{Context, Result}; + +#[cfg(feature = "workspace")] +use crate::workspace::{Workspace, WorkspaceConfig, store::Store}; + +use crate::json_mode::{PruneOutput, WorkspaceOutput}; + +#[cfg(not(feature = "workspace"))] +use crate::json_mode::ErrorOutput; + +pub async fn workspace_use_cmd(name: String, json: bool) -> Result<()> { + #[cfg(feature = "workspace")] + { + // Initialize new workspace configuration + let ws = Workspace { + config: WorkspaceConfig { + name: name.clone(), + root_dir: Workspace::root_path(&name)?, + ..Default::default() + }, + }; + ws.save()?; + + if json { + // Try to get document count from store, or use 0 for new workspace + let total_documents = if let Ok(store) = Store::open(&ws.config.root_dir) { + if let Ok(stats) = store.get_stats() { + stats.total_documents + } else { + 0 + } + } else { + 0 + }; + + let output = WorkspaceOutput { + name: ws.config.name.clone(), + root_dir: ws.config.root_dir.clone(), + total_documents, + }; + let json_output = serde_json::to_string_pretty(&output)?; + println!("{}", json_output); + } else { + println!("Workspace '{name}' configured."); + println!("To activate it, run:"); + println!(" export SEMTOOLS_WORKSPACE={name}"); + println!(); + println!("Or add this to your shell profile (.bashrc, .zshrc, etc.)"); + } + } + #[cfg(not(feature = "workspace"))] + { + if json { + let error_output = ErrorOutput { + error: "workspace feature not enabled".to_string(), + error_type: "FeatureNotEnabled".to_string(), + }; + let json_output = serde_json::to_string_pretty(&error_output)?; + eprintln!("{}", json_output); + } else { + println!("workspace feature not enabled"); + } + } + Ok(()) +} + +pub async fn workspace_status_cmd(json: bool) -> Result<()> { + 
#[cfg(feature = "workspace")] + { + let _name = Workspace::active().context("No active workspace")?; + let ws = Workspace::open()?; + + // Open store and get stats + let store = Store::open(&ws.config.root_dir)?; + let stats = store.get_stats()?; + + if json { + let output = WorkspaceOutput { + name: ws.config.name.clone(), + root_dir: ws.config.root_dir.clone(), + total_documents: stats.total_documents, + }; + let json_output = serde_json::to_string_pretty(&output)?; + println!("{}", json_output); + } else { + println!("Active workspace: {}", ws.config.name); + println!("Root: {}", ws.config.root_dir); + println!("Documents: {}", stats.total_documents); + if stats.has_index { + let index_info = stats.index_type.unwrap_or_else(|| "Unknown".to_string()); + println!("Index: Yes ({index_info})"); + } else { + println!("Index: No"); + } + } + } + #[cfg(not(feature = "workspace"))] + { + if json { + let error_output = ErrorOutput { + error: "workspace feature not enabled".to_string(), + error_type: "FeatureNotEnabled".to_string(), + }; + let json_output = serde_json::to_string_pretty(&error_output)?; + eprintln!("{}", json_output); + } else { + println!("workspace feature not enabled"); + } + } + Ok(()) +} + +pub async fn workspace_prune_cmd(json: bool) -> Result<()> { + #[cfg(feature = "workspace")] + { + let _name = Workspace::active().context("No active workspace")?; + let ws = Workspace::open()?; + let store = Store::open(&ws.config.root_dir)?; + + // Get all document paths from the workspace + let all_paths = store.get_all_document_paths()?; + let total_before = all_paths.len(); + + // Check which files no longer exist + let mut missing_paths = Vec::new(); + for path in &all_paths { + if !std::path::Path::new(path).exists() { + missing_paths.push(path.clone()); + } + } + + let files_removed = missing_paths.len(); + let files_remaining = total_before - files_removed; + + if !missing_paths.is_empty() { + // Remove stale documents + 
store.delete_documents(&missing_paths)?; + } + + if json { + let output = PruneOutput { + files_removed, + files_remaining, + }; + let json_output = serde_json::to_string_pretty(&output)?; + println!("{}", json_output); + } else if missing_paths.is_empty() { + println!("No stale documents found. Workspace is clean."); + } else { + println!("Found {} stale documents:", missing_paths.len()); + for path in &missing_paths { + println!(" - {path}"); + } + println!( + "Removed {} stale documents from workspace.", + missing_paths.len() + ); + } + } + #[cfg(not(feature = "workspace"))] + { + if json { + let error_output = ErrorOutput { + error: "workspace feature not enabled".to_string(), + error_type: "FeatureNotEnabled".to_string(), + }; + let json_output = serde_json::to_string_pretty(&error_output)?; + eprintln!("{}", json_output); + } else { + println!("workspace feature not enabled"); + } + } + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs index aa8f64c8..0ca147ae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ pub mod config; pub use config::{AskConfig, SemtoolsConfig}; +pub mod cmds; pub mod json_mode; #[cfg(feature = "parse")]