From f6b4a60a9860d3b809a535699230a1d914d6bbff Mon Sep 17 00:00:00 2001
From: Christopher Madison
Date: Wed, 19 Mar 2025 11:28:07 -0400
Subject: [PATCH 1/4] Add tested functionality (on macOS) to allow
 configuration of storage outside the project root
---
.env.template | 8 +-
README.md | 137 +++++++--
app/api/storage/route.ts | 276 ++++++++++--------
docker/compose/docker-compose.yml | 13 +-
.../src/fast_markdown_mcp/server.py | 149 ++++++----
scripts/docker/docker-start.sh | 21 +-
scripts/general/start.sh | 22 +-
7 files changed, 413 insertions(+), 213 deletions(-)
diff --git a/.env.template b/.env.template
index 7b63528..929f696 100644
--- a/.env.template
+++ b/.env.template
@@ -15,4 +15,10 @@ MAX_CONCURRENT_TASKS=5
# Uncomment and set these if you want to use LLM features in Crawl4AI
# OPENAI_API_KEY=
-# ANTHROPIC_API_KEY=
\ No newline at end of file
+# ANTHROPIC_API_KEY=
+
+# Storage path configuration
+# - For a path inside the project (default): STORAGE_PATH=storage/markdown
+# - For an absolute path outside the project: STORAGE_PATH=/path/to/your/storage
+# The path can be absolute or relative to the project root directory
+STORAGE_PATH=storage/markdown
\ No newline at end of file
diff --git a/README.md b/README.md
index c537b9e..f48c37f 100644
--- a/README.md
+++ b/README.md
@@ -22,38 +22,46 @@
## 🎯 Perfect For
### 🏢 Enterprise Software Developers
+
Skip weeks of reading documentation and dealing with technical debt. Implement ANY technology faster by letting DevDocs handle the heavy lifting of documentation understanding.
### 🕸️ Web Scrapers
+
Pull entire contents of websites with Smart Discovery of Child URLs up to level 5. Perfect for both internal and external website documentation with intelligent crawling.
### 👥 Development Teams
+
Leverage internal documentation with built-in MCP servers and Claude integration for intelligent data querying. Transform your team's knowledge base into an actionable resource.
### 🚀 Indie Hackers
+
DevDocs + VSCode(cline) + Your Idea = Ship products fast with ANY technology. No more getting stuck in documentation hell when building your next big thing.
## ✨ Features
### 🧠 Intelligent Crawling
+
- **Smart Depth Control**: Choose crawl depth from 1-5 levels
- **Automatic Link Discovery**: Finds and categorizes all related content
- **Selective Crawling**: Pick exactly what you want to extract
- **Child URL Detection**: Automatically discovers and maps website structure
### ⚡ Performance & Speed
+
- **Parallel Processing**: Crawl multiple pages simultaneously
- **Smart Caching**: Never waste time on duplicate content
- **Lazy Loading Support**: Handles modern web apps effortlessly
- **Rate Limiting**: Respectful crawling that won't overload servers
### 🎯 Content Processing
+
- **Clean Extraction**: Get content without the fluff
- **Multiple Formats**: Export to MD or JSON for LLM fine-tuning
- **Structured Output**: Logically organized content
- **MCP Server Integration**: Ready for AI processing
### 🛡️ Enterprise Features
+
- **Error Recovery**: Auto-retry on failures
- **Full Logging**: Track every operation
- **API Access**: Integrate with your tools
@@ -62,32 +70,35 @@ DevDocs + VSCode(cline) + Your Idea = Ship products fast with ANY technology. No
## 🤔 Why DevDocs?
### The Problem
+
Documentation is everywhere and LLMs are OUTDATED in their knowledge. Reading it, understanding it, and implementing it takes weeks of research and development even for senior engineers. **We cut down that time to hours.**
### Our Solution
+
DevDocs brings documentation to you. Point it at any tech documentation URL, and watch as it:
+
1. Discovers all related pages to that technology
2. Extracts meaningful content without the fluff
3. Organizes information logically inside an MCP server ready for your LLM to query
4. Presents it in a clean, searchable format in MD or JSON for LLM fine-tuning purposes
-🔥 We want anyone in the world to have the ability to build amazing products quickly using the most cutting edge LLM technology.
+🔥 We want anyone in the world to have the ability to build amazing products quickly using the most cutting edge LLM technology.
## 💰 Pricing Comparison
-| Feature | DevDocs | Firecrawl |
-|---------|---------|-----------|
-| Free Tier | Unlimited pages | None |
-| Starting Price | Free Forever | $16/month |
-| Enterprise Plan | Custom | $333/month |
-| Crawl Speed | 1000/min | 20/min |
-| Depth Levels | Up to 5 | Limited |
-| Team Seats | Unlimited | 1-5 seats |
-| Export Formats | MD, JSON, LLM-ready MCP servers | Limited formats |
-| API Access | Coming Soon | Limited |
-| Model Context Protocol Integration | ✅ | ❌ |
-| Support | Priority Available via Discord | Standard only |
-| Self-hosted (free use) | ✅ | ❌ |
+| Feature | DevDocs | Firecrawl |
+| ---------------------------------- | ------------------------------- | --------------- |
+| Free Tier | Unlimited pages | None |
+| Starting Price | Free Forever | $16/month |
+| Enterprise Plan | Custom | $333/month |
+| Crawl Speed | 1000/min | 20/min |
+| Depth Levels | Up to 5 | Limited |
+| Team Seats | Unlimited | 1-5 seats |
+| Export Formats | MD, JSON, LLM-ready MCP servers | Limited formats |
+| API Access | Coming Soon | Limited |
+| Model Context Protocol Integration | ✅                              | ❌              |
+| Support | Priority Available via Discord | Standard only |
+| Self-hosted (free use)             | ✅                              | ❌              |
## 🚀 Getting Started
@@ -101,6 +112,7 @@ DevDocs is designed to be easy to use with Docker, requiring minimal setup for n
### Quick Start with Docker (Recommended)
For Mac/Linux users:
+
```bash
# Clone the repository
git clone https://github.com/cyberagiinc/DevDocs.git
@@ -113,6 +125,7 @@ cd DevDocs
```
For Windows users:
+
```cmd
# Clone the repository
git clone https://github.com/cyberagiinc/DevDocs.git
@@ -123,6 +136,7 @@ cd DevDocs
# Start all services using Docker
docker-start.bat
```
+
Note for Windows Users
@@ -133,6 +147,7 @@ docker-start.bat
> If you need to manually set permissions, you can do so using either the Windows GUI or command line:
>
> **Using Windows Explorer**:
+>
> 1. Right-click on each directory (logs, storage, crawl_results)
> 2. Select "Properties"
> 3. Go to the "Security" tab
@@ -145,30 +160,84 @@ docker-start.bat
> 10. Click "Apply" and "OK"
>
> **Using Command Prompt (as Administrator)**:
+>
> ```cmd
> icacls logs /grant Everyone:F /T
> icacls storage /grant Everyone:F /T
> icacls crawl_results /grant Everyone:F /T
> ```
-
+
+
Note about docker-compose.yml on Windows
> If you encounter issues with the docker-compose.yml file (such as "Top-level object must be a mapping" error), the `docker-start.bat` script automatically fixes this by ensuring the file has the correct format and encoding. This fix is applied every time you run the script, so you don't need to manually modify the file.
-
-
+
This single command will:
+
1. Create all necessary directories
2. Set appropriate permissions
3. Build and start all Docker containers
4. Monitor the services to ensure they're running properly
+### Storage Configuration
+
+DevDocs now supports configurable storage paths, allowing you to separate your curated data from the source code.
+
+#### Configuration Options
+
+You can configure the storage location by setting the `STORAGE_PATH` environment variable in your `.env` file:
+
+```
+# For a path inside the project (default)
+STORAGE_PATH=storage/markdown
+
+# For an absolute path outside the project
+STORAGE_PATH=/path/to/your/storage
+```
+
+The storage path can be:
+
+- A relative path (relative to the project root directory)
+- An absolute path to store data outside the project directory
+
+#### Usage with Docker
+
+When using Docker, the configured path will be mounted as a volume into the containers. Make sure the path:
+
+- Is accessible by Docker (if using Docker Desktop, enable the necessary file sharing)
+- Has appropriate permissions (the containers run as root by default)
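+
+For reference, the Docker Compose files in this patch consume the variable roughly like this, falling back to the in-project default when `STORAGE_PATH` is unset (abridged from `docker/compose/docker-compose.yml`):
+
+```yaml
+services:
+  mcp:
+    volumes:
+      # host path from STORAGE_PATH (or ./storage/markdown) mapped into the container
+      - ${STORAGE_PATH:-./storage/markdown}:/app/storage/markdown
+    environment:
+      # path as seen from inside the container
+      - STORAGE_PATH=/app/storage/markdown
+```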
+
+#### Gitignore Considerations
+
+If you're using a custom storage path outside the project directory, you don't need to worry about Git tracking your data files. If your storage path is inside the project, consider adding it to your `.gitignore` file to prevent accidentally committing your data.
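+
+For the default in-project location, a one-line addition from the project root is enough (adjust the path if you changed `STORAGE_PATH`):
+
+```bash
+# keep crawled output out of version control (default in-project STORAGE_PATH shown)
+echo "storage/markdown/" >> .gitignore
+```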
+
+#### Troubleshooting Docker File Sharing on macOS
+
+If you encounter an error like this:
+
+```
+Error response from daemon: Mounts denied:
+The path /users/[username]/MCP/devdocs/storage/markdown is not shared from the host and is not known to Docker.
+You can configure shared paths from Docker -> Preferences... -> Resources -> File Sharing.
+```
+
+This means Docker Desktop doesn't have permission to access the directory you're trying to mount. To fix this:
+
+1. Open Docker Desktop
+2. Go to Settings (βοΈ) > Resources > File Sharing
+3. Add the parent directory of your storage path to the list of shared folders
+4. Click "Apply & Restart"
+
+This is particularly important when using custom storage paths outside the default project structure.
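+
+One quick way to confirm Docker Desktop can see a custom directory before starting the full stack is to bind-mount it into a throwaway container:
+
+```bash
+# replace /path/to/your/storage with the absolute path you set as STORAGE_PATH;
+# this should list the directory contents instead of failing with "Mounts denied"
+docker run --rm -v "/path/to/your/storage:/check" alpine ls -la /check
+```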
+
### Accessing DevDocs
Once the services are running:
+
- Frontend UI: http://localhost:3001
- Backend API: http://localhost:24125
- Crawl4AI Service: http://localhost:11235
@@ -178,13 +247,14 @@ Once the services are running:
When using Docker, logs can be accessed:
1. **Container Logs** (recommended for debugging):
+
```bash
# View logs from a specific container
docker logs devdocs-frontend
docker logs devdocs-backend
docker logs devdocs-mcp
docker logs devdocs-crawl4ai
-
+
# Follow logs in real-time
docker logs -f devdocs-backend
```
@@ -196,26 +266,31 @@ To stop all services, press `Ctrl+C` in the terminal where docker-start is runni
DevDocs includes various utility scripts to help with development, testing, and maintenance. Here's a quick reference:
### Startup Scripts
+
- `start.sh` / `start.bat` / `start.ps1` - Start all services (frontend, backend, MCP) for local development.
- `docker-start.sh` / `docker-start.bat` - Start all services using Docker containers.
### MCP Server Scripts
+
- `check_mcp_health.sh` - Verify the MCP server's health and configuration status.
- `restart_and_test_mcp.sh` - Restart Docker containers with updated MCP configuration and test connectivity.
### Crawl4AI Scripts
+
- `check_crawl4ai.sh` - Check the status and health of the Crawl4AI service.
- `debug_crawl4ai.sh` - Run Crawl4AI in debug mode with verbose logging for troubleshooting.
- `test_crawl4ai.py` - Run tests against the Crawl4AI service to verify functionality.
- `test_from_container.sh` - Test the Crawl4AI service from within a Docker container.
### Utility Scripts
+
- `view_result.sh` - Display crawl results in a formatted view.
- `find_empty_folders.sh` - Identify empty directories in the project structure.
- `analyze_empty_folders.sh` - Analyze empty folders and categorize them by risk level.
- `verify_reorganization.sh` - Verify that code reorganization was successful.
These scripts are organized in the following directories:
+
- Root directory: Main scripts for common operations
- `scripts/general/`: General utility scripts
- `scripts/docker/`: Docker-specific scripts
@@ -225,27 +300,31 @@ These scripts are organized in the following directories:
## 🚀 Built for Developers, by Developers
DevDocs is more than a tool; it's your documentation companion that:
+
- **Saves Time**: Turn weeks of research into hours
- **Improves Understanding**: Get clean, organized documentation
- **Enables Innovation**: Build faster with any technology
- **Supports Teams**: Share knowledge efficiently
-- **LLM READY**: Modern times require modern solutions, using devdocs with LLM is extremely easy and intuitive. With minimal configuration you can run Devdocs and Claude App and recognizes DevDocs's MCP server ready to chat with your data.
+- **LLM READY**: Modern times call for modern solutions; using DevDocs with an LLM is easy and intuitive. With minimal configuration you can run DevDocs alongside the Claude app, which recognizes DevDocs's MCP server and is ready to chat with your data.
## 🛠️ Setting Up the Cline/Roo Cline for Rapid software development.
-1. **Open the "Modes" Interface**
+1. **Open the "Modes" Interface**
- In **Roo Code**, click the **+** to create a new Mode-Specific Prompts.
-2. **Name**
- - Give the mode a **Name** (e.g., `Research_MCP`).
+2. **Name**
+ - Give the mode a **Name** (e.g., `Research_MCP`).
3. **Role Definition Prompt**
+
```
Expertise and Personality: Expertise: Developer documentation retrieval, technical synthesis, and documentation search. Personality: Systematic, detail-oriented, and precise. Provide well-structured answers with clear references to documentation sections.
Behavioral Mandate: Always use the Table Of Contents and Section Access tools when addressing any query regarding the MCP documentation. Maintain clarity, accuracy, and traceability in your responses.
```
+
4. **Mode-Specific Custom Instructions Prompt**
+
```
-1. Table Of Contents Tool: Returns a full or filtered list of documentation topics.
+1. Table Of Contents Tool: Returns a full or filtered list of documentation topics.
2. Section Access Tool: Retrieves the detailed content of specific documentation sections.
General Process: Query Interpretation: Parse the user's query to extract key topics, keywords, and context. Identify the likely relevant sections (e.g., API configurations, error handling) from the query.
@@ -258,10 +337,10 @@ Synthesis and Response Formation: Combine the retrieved content into a coherent
Error Handling: If no matching sections are found, adjust the search parameters and retry. Clearly report if the query remains ambiguous or if no relevant documentation is available.
-Mandatory Tool Usage:
+Mandatory Tool Usage:
Enforcement: Every time a query is received that requires information from the MCP server docs, the agent MUST first query the Table Of Contents tool to list potential relevant topics, then use the Section Access tool to retrieve the necessary detailed content.
-Search & Retrieve Workflow:
+Search & Retrieve Workflow:
Interpret and Isolate: Identify the key terms and data points from the user's query.
Index Lookup: Immediately query the Table Of Contents tool to obtain a list of relevant documentation sections.
@@ -276,6 +355,7 @@ Custom Instruction Loading: Additional custom instructions specific to Research_
Final Output Construction: The final answer should be organized, directly address the query, and include clear pointers (e.g., section names or identifiers) back to the MCP documentation. Ensure minimal redundancy while covering all necessary details.
```
+
## 🤝 Join Our Community
- 🌟 [Star us on GitHub](https://github.com/cyberagi/devdocs)
@@ -284,10 +364,9 @@ Final Output Construction: The final answer should be organized, directly addres
## 🏆 Success Stories
-"DevDocs turned our 3-week implementation timeline into 2 days. It's not just a crawler, it's a development accelerator." - *Senior Engineer at Fortune 100 Company*
-
-"Launched my SaaS in half the time by using DevDocs to understand and implement new technologies quickly." - *Successful Indie Hacker*
+"DevDocs turned our 3-week implementation timeline into 2 days. It's not just a crawler, it's a development accelerator." - _Senior Engineer at Fortune 100 Company_
+"Launched my SaaS in half the time by using DevDocs to understand and implement new technologies quickly." - _Successful Indie Hacker_
## 🌐 Technology Partners
@@ -301,4 +380,4 @@ Final Output Construction: The final answer should be organized, directly addres
Make Software Development Better Again Contribute to DevDocs
-
\ No newline at end of file
+
diff --git a/app/api/storage/route.ts b/app/api/storage/route.ts
index 67ac364..5897df6 100644
--- a/app/api/storage/route.ts
+++ b/app/api/storage/route.ts
@@ -1,95 +1,126 @@
-import { NextResponse } from 'next/server'
-import fs from 'fs/promises'
-import path from 'path'
+import { NextResponse } from "next/server";
+import fs from "fs/promises";
+import path from "path";
-const STORAGE_DIR = path.join(process.cwd(), 'storage/markdown')
+// Get storage directory from environment variable with fallback to default
+const getStoragePath = () => {
+ const configuredPath = process.env.STORAGE_PATH || "storage/markdown";
+
+ // If it's an absolute path, use it directly
+ if (path.isAbsolute(configuredPath)) {
+ return configuredPath;
+ }
+
+ // Otherwise, treat as relative to process.cwd()
+ return path.join(process.cwd(), configuredPath);
+};
+
+const STORAGE_DIR = getStoragePath();
export async function POST(request: Request) {
try {
- const { url, content } = await request.json()
-
+ const { url, content } = await request.json();
+
// Create storage directory if it doesn't exist
- await fs.mkdir(STORAGE_DIR, { recursive: true })
-
+ await fs.mkdir(STORAGE_DIR, { recursive: true });
+
// Generate filename from URL
- const filename = url
- .replace(/^https?:\/\//, '')
- .replace(/[^a-z0-9]/gi, '_')
- .toLowerCase() + '.md'
-
- const filePath = path.join(STORAGE_DIR, filename)
- await fs.writeFile(filePath, content, 'utf-8')
-
- return NextResponse.json({ success: true })
+ const filename =
+ url
+ .replace(/^https?:\/\//, "")
+ .replace(/[^a-z0-9]/gi, "_")
+ .toLowerCase() + ".md";
+
+ const filePath = path.join(STORAGE_DIR, filename);
+ await fs.writeFile(filePath, content, "utf-8");
+
+ return NextResponse.json({ success: true });
} catch (error) {
return NextResponse.json(
- { success: false, error: error instanceof Error ? error.message : 'Failed to save markdown' },
+ {
+ success: false,
+ error:
+ error instanceof Error ? error.message : "Failed to save markdown",
+ },
{ status: 500 }
- )
+ );
}
}
export async function GET(request: Request) {
try {
- const { searchParams } = new URL(request.url)
- const url = searchParams.get('url')
-
+ const { searchParams } = new URL(request.url);
+ const url = searchParams.get("url");
+
// Handle list request
if (!url) {
// Only get .md files
- const files = await fs.readdir(STORAGE_DIR)
- const mdFiles = files.filter(f => f.endsWith('.md'))
- const jsonFiles = files.filter(f => f.endsWith('.json'))
-
+ const files = await fs.readdir(STORAGE_DIR);
+ const mdFiles = files.filter((f) => f.endsWith(".md"));
+ const jsonFiles = files.filter((f) => f.endsWith(".json"));
+
// Get disk files
const diskFileDetails = await Promise.all(
mdFiles.map(async (filename) => {
- const mdPath = path.join(STORAGE_DIR, filename)
- const jsonPath = path.join(STORAGE_DIR, filename.replace('.md', '.json'))
- const stats = await fs.stat(mdPath)
- const content = await fs.readFile(mdPath, 'utf-8')
-
+ const mdPath = path.join(STORAGE_DIR, filename);
+ const jsonPath = path.join(
+ STORAGE_DIR,
+ filename.replace(".md", ".json")
+ );
+ const stats = await fs.stat(mdPath);
+ const content = await fs.readFile(mdPath, "utf-8");
+
// Check if this is a consolidated file by examining the JSON metadata
- let isConsolidated = false
- let pagesCount = 0
- let rootUrl = ''
-
- if (jsonFiles.includes(filename.replace('.md', '.json'))) {
+ let isConsolidated = false;
+ let pagesCount = 0;
+ let rootUrl = "";
+
+ if (jsonFiles.includes(filename.replace(".md", ".json"))) {
try {
- const jsonContent = await fs.readFile(jsonPath, 'utf-8')
- const metadata = JSON.parse(jsonContent)
-
+ const jsonContent = await fs.readFile(jsonPath, "utf-8");
+ const metadata = JSON.parse(jsonContent);
+
// If the metadata has a "pages" array or is_consolidated flag, it's a consolidated file
- if ((metadata.pages && Array.isArray(metadata.pages)) || metadata.is_consolidated === true) {
- isConsolidated = true
- pagesCount = metadata.pages ? metadata.pages.length : 1
- rootUrl = metadata.root_url || ''
+ if (
+ (metadata.pages && Array.isArray(metadata.pages)) ||
+ metadata.is_consolidated === true
+ ) {
+ isConsolidated = true;
+ pagesCount = metadata.pages ? metadata.pages.length : 1;
+ rootUrl = metadata.root_url || "";
}
} catch (e) {
- console.error(`Error reading JSON metadata for ${filename}:`, e)
+ console.error(`Error reading JSON metadata for ${filename}:`, e);
// Create a default metadata file if it doesn't exist or is invalid
try {
const defaultMetadata = {
- title: `Documentation for ${filename.replace('.md', '')}`,
+ title: `Documentation for ${filename.replace(".md", "")}`,
timestamp: new Date().toISOString(),
pages: [
{
title: "Main Content",
- url: `file://${filename.replace('.md', '')}`,
+ url: `file://${filename.replace(".md", "")}`,
timestamp: new Date().toISOString(),
internal_links: 0,
- external_links: 0
- }
+ external_links: 0,
+ },
],
is_consolidated: true,
- last_updated: new Date().toISOString()
- }
- await fs.writeFile(jsonPath, JSON.stringify(defaultMetadata, null, 2), 'utf-8')
- console.log(`Created default metadata for ${filename}`)
- isConsolidated = true
- pagesCount = 1
+ last_updated: new Date().toISOString(),
+ };
+ await fs.writeFile(
+ jsonPath,
+ JSON.stringify(defaultMetadata, null, 2),
+ "utf-8"
+ );
+ console.log(`Created default metadata for ${filename}`);
+ isConsolidated = true;
+ pagesCount = 1;
} catch (writeError) {
- console.error(`Error creating default metadata for ${filename}:`, writeError)
+ console.error(
+ `Error creating default metadata for ${filename}:`,
+ writeError
+ );
}
}
} else {
@@ -97,44 +128,51 @@ export async function GET(request: Request) {
try {
// Create a consolidated metadata file by default
const defaultMetadata = {
- title: `Documentation for ${filename.replace('.md', '')}`,
+ title: `Documentation for ${filename.replace(".md", "")}`,
timestamp: new Date().toISOString(),
content,
pages: [
{
title: "Main Content",
- url: `file://${filename.replace('.md', '')}`,
+ url: `file://${filename.replace(".md", "")}`,
timestamp: new Date().toISOString(),
internal_links: 0,
- external_links: 0
- }
+ external_links: 0,
+ },
],
is_consolidated: true,
last_updated: new Date().toISOString(),
metadata: {
wordCount: content.split(/\s+/).length,
charCount: content.length,
- timestamp: stats.mtime
- }
- }
- await fs.writeFile(jsonPath, JSON.stringify(defaultMetadata, null, 2), 'utf-8')
- console.log(`Created consolidated metadata for ${filename}`)
- isConsolidated = true
- pagesCount = 1
+ timestamp: stats.mtime,
+ },
+ };
+ await fs.writeFile(
+ jsonPath,
+ JSON.stringify(defaultMetadata, null, 2),
+ "utf-8"
+ );
+ console.log(`Created consolidated metadata for ${filename}`);
+ isConsolidated = true;
+ pagesCount = 1;
} catch (writeError) {
- console.error(`Error creating metadata for ${filename}:`, writeError)
+ console.error(
+ `Error creating metadata for ${filename}:`,
+ writeError
+ );
}
}
-
+
// Extract sections to count how many pages are included
if (!pagesCount && isConsolidated) {
// Count sections that start with "## " and have a URL: line after them
- const sectionMatches = content.match(/## .+\nURL: .+/g)
- pagesCount = sectionMatches ? sectionMatches.length : 0
+ const sectionMatches = content.match(/## .+\nURL: .+/g);
+ pagesCount = sectionMatches ? sectionMatches.length : 0;
}
-
+
return {
- name: filename.replace('.md', ''),
+ name: filename.replace(".md", ""),
jsonPath,
markdownPath: mdPath,
timestamp: stats.mtime,
@@ -143,12 +181,12 @@ export async function GET(request: Request) {
charCount: content.length,
isConsolidated,
pagesCount: isConsolidated ? pagesCount : 1,
- rootUrl: rootUrl || '',
- isInMemory: false
- }
+ rootUrl: rootUrl || "",
+ isInMemory: false,
+ };
})
- )
-
+ );
+
// Define interface for in-memory file
interface MemoryFile {
name: string;
@@ -161,20 +199,22 @@ export async function GET(request: Request) {
isJson: boolean;
metadata?: any;
}
-
+
// Get in-memory files from the backend
- let memoryFiles = []
+ let memoryFiles = [];
try {
- const memoryResponse = await fetch('http://localhost:24125/api/memory-files')
+ const memoryResponse = await fetch(
+ "http://localhost:24125/api/memory-files"
+ );
if (memoryResponse.ok) {
- const memoryData = await memoryResponse.json()
+ const memoryData = await memoryResponse.json();
if (memoryData.success && Array.isArray(memoryData.files)) {
// Convert in-memory files to the same format as disk files
memoryFiles = memoryData.files
.filter((file: MemoryFile) => !file.isJson) // Only include markdown files
.map((file: MemoryFile) => ({
name: file.name,
- jsonPath: file.path.replace('.md', '.json'),
+ jsonPath: file.path.replace(".md", ".json"),
markdownPath: file.path,
timestamp: new Date(file.timestamp),
size: file.size,
@@ -182,55 +222,63 @@ export async function GET(request: Request) {
charCount: file.charCount,
isConsolidated: false,
pagesCount: 1,
- rootUrl: '',
- isInMemory: true
- }))
+ rootUrl: "",
+ isInMemory: true,
+ }));
}
}
} catch (e) {
- console.error('Error fetching in-memory files:', e)
+ console.error("Error fetching in-memory files:", e);
}
-
+
// Combine disk and memory files
- const allFiles = [...diskFileDetails, ...memoryFiles]
-
+ const allFiles = [...diskFileDetails, ...memoryFiles];
+
// Filter out individual files (non-consolidated files)
// Only show consolidated files in the Stored Files section
- const consolidatedFiles = allFiles.filter(file => file.isConsolidated)
-
+ const consolidatedFiles = allFiles.filter((file) => file.isConsolidated);
+
// Additional filter to exclude files with UUID-like names
// UUID pattern: 8-4-4-4-12 hex digits (e.g., 095104d8-8e90-48f0-8670-9e45c914f115)
- const uuidPattern = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
-
+ const uuidPattern =
+ /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+
// Keep only files with domain-like names (e.g., docs_crawl4ai_com)
// These are files created through the crawling process
- const crawledFiles = consolidatedFiles.filter(file => {
+ const crawledFiles = consolidatedFiles.filter((file) => {
// Check if the filename is NOT a UUID
- return !uuidPattern.test(file.name)
- })
-
- console.log(`Found ${consolidatedFiles.length} consolidated files, ${crawledFiles.length} are crawled files`)
-
+ return !uuidPattern.test(file.name);
+ });
+
+ console.log(
+ `Found ${consolidatedFiles.length} consolidated files, ${crawledFiles.length} are crawled files`
+ );
+
return NextResponse.json({
success: true,
- files: crawledFiles
- })
+ files: crawledFiles,
+ });
}
-
+
// Handle single file request
- const filename = url
- .replace(/^https?:\/\//, '')
- .replace(/[^a-z0-9]/gi, '_')
- .toLowerCase() + '.md'
-
- const filePath = path.join(STORAGE_DIR, filename)
- const content = await fs.readFile(filePath, 'utf-8')
-
- return NextResponse.json({ success: true, content })
+ const filename =
+ url
+ .replace(/^https?:\/\//, "")
+ .replace(/[^a-z0-9]/gi, "_")
+ .toLowerCase() + ".md";
+
+ const filePath = path.join(STORAGE_DIR, filename);
+ const content = await fs.readFile(filePath, "utf-8");
+
+ return NextResponse.json({ success: true, content });
} catch (error) {
return NextResponse.json(
- { success: false, error: error instanceof Error ? error.message : 'Failed to load markdown' },
+ {
+ success: false,
+ error:
+ error instanceof Error ? error.message : "Failed to load markdown",
+ },
{ status: 500 }
- )
+ );
}
-}
\ No newline at end of file
+}
diff --git a/docker/compose/docker-compose.yml b/docker/compose/docker-compose.yml
index 0f24bf1..d9c31a5 100644
--- a/docker/compose/docker-compose.yml
+++ b/docker/compose/docker-compose.yml
@@ -23,13 +23,14 @@ services:
ports:
- "24125:24125"
volumes:
- - ./storage:/app/storage
+ - ${STORAGE_PATH:-./storage/markdown}:/app/storage/markdown
- ./logs:/app/logs
- ./crawl_results:/app/crawl_results
environment:
- MCP_HOST=mcp
- CRAWL4AI_URL=http://crawl4ai:11235
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-devdocs-demo-key}
+ - STORAGE_PATH=/app/storage/markdown
depends_on:
- crawl4ai
- mcp
@@ -43,12 +44,14 @@ services:
dockerfile: docker/dockerfiles/Dockerfile.mcp
container_name: devdocs-mcp
volumes:
- - ./storage/markdown:/app/storage/markdown
+ - ${STORAGE_PATH:-./storage/markdown}:/app/storage/markdown
- ./logs:/app/logs
+ environment:
+ - STORAGE_PATH=/app/storage/markdown
networks:
- devdocs-network
- stdin_open: true # Keep stdin open
- tty: true # Allocate a pseudo-TTY
+ stdin_open: true # Keep stdin open
+ tty: true # Allocate a pseudo-TTY
restart: unless-stopped
crawl4ai:
@@ -75,4 +78,4 @@ services:
networks:
devdocs-network:
- driver: bridge
\ No newline at end of file
+ driver: bridge
diff --git a/fast-markdown-mcp/src/fast_markdown_mcp/server.py b/fast-markdown-mcp/src/fast_markdown_mcp/server.py
index b779ff7..a786227 100644
--- a/fast-markdown-mcp/src/fast_markdown_mcp/server.py
+++ b/fast-markdown-mcp/src/fast_markdown_mcp/server.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3
+from .document_structure import DocumentStructure
import sys
+import os
import logging
import signal
import json
@@ -15,17 +17,16 @@
logger = logging.getLogger(__name__)
-from .document_structure import DocumentStructure
class MarkdownStore:
"""Manages markdown content and metadata."""
-
+
def __init__(self, storage_path: str):
self.base_path = Path(storage_path)
self.content_cache = {}
self.metadata_cache = {}
self.structure_cache = {} # Cache for parsed document structures
-
+
async def sync_all_files(self):
"""Initial sync of all files in the storage directory."""
logger.info("Starting initial sync of all files...")
@@ -37,7 +38,7 @@ async def sync_all_files(self):
except Exception as e:
logger.error(f"Error during initial sync: {e}")
raise
-
+
async def get_content(self, file_id: str) -> str:
"""Get markdown content."""
file_path = self.base_path / f"{file_id}.md"
@@ -57,38 +58,41 @@ async def get_section(self, file_id: str, section_id: str) -> str:
"""Get a specific section from a markdown file."""
try:
if file_id not in self.structure_cache:
- await self.get_content(file_id) # This will parse and cache the structure
-
+ # This will parse and cache the structure
+ await self.get_content(file_id)
+
structure = self.structure_cache[file_id]
section = structure.get_section_by_id(section_id)
-
+
if not section:
return f"Section '{section_id}' not found in {file_id}"
-
+
return f"Section: {section.title}\n\n{section.content}"
except Exception as e:
- logger.error(f"Error getting section {section_id} from {file_id}: {e}")
+ logger.error(
+ f"Error getting section {section_id} from {file_id}: {e}")
return f"Error getting section: {str(e)}"
async def get_table_of_contents(self, file_id: str) -> str:
"""Get table of contents for a markdown file."""
try:
if file_id not in self.structure_cache:
- await self.get_content(file_id) # This will parse and cache the structure
-
+ # This will parse and cache the structure
+ await self.get_content(file_id)
+
structure = self.structure_cache[file_id]
toc = structure.get_table_of_contents()
-
+
result = [f"Table of Contents for {file_id}:"]
for level, title, section_id in toc:
indent = " " * level
result.append(f"{indent}- {title} [{section_id}]")
-
+
return "\n".join(result)
except Exception as e:
logger.error(f"Error getting table of contents for {file_id}: {e}")
return f"Error getting table of contents: {str(e)}"
-
+
async def get_metadata(self, file_id: str) -> dict:
"""Get metadata as a dictionary."""
file_path = self.base_path / f"{file_id}.json"
@@ -124,13 +128,15 @@ async def get_metadata(self, file_id: str) -> dict:
logger.info(f"Created default metadata for {file_id}")
return default_metadata
except Exception as write_error:
- logger.error(f"Error creating default metadata for {file_id}: {write_error}")
- logger.error(f"Error reading metadata for {file_id}: File not found")
+ logger.error(
+ f"Error creating default metadata for {file_id}: {write_error}")
+ logger.error(
+ f"Error reading metadata for {file_id}: File not found")
return {}
except Exception as e:
logger.error(f"Error reading metadata for {file_id}: {e}")
return {}
-
+
async def get_index(self) -> str:
"""Get list of available files."""
try:
@@ -141,7 +147,7 @@ async def get_index(self) -> str:
except Exception as e:
logger.error(f"Error getting index: {e}")
return f"Error getting index: {str(e)}"
-
+
async def sync_file(self, file_id: str) -> str:
"""Force sync a file."""
try:
@@ -149,7 +155,7 @@ async def sync_file(self, file_id: str) -> str:
self.content_cache.pop(file_id, None)
self.metadata_cache.pop(file_id, None)
self.structure_cache.pop(file_id, None)
-
+
# Reload content and metadata
content = await self.get_content(file_id)
metadata = await self.get_metadata(file_id)
@@ -200,7 +206,7 @@ async def search_files(self, query: str) -> str:
file_id = md_file.stem
content = await self.get_content(file_id)
metadata = await self.get_metadata(file_id)
-
+
if query.lower() in content.lower():
# Find the context around the match
lines = content.split('\n')
@@ -208,13 +214,14 @@ async def search_files(self, query: str) -> str:
if query.lower() in line.lower():
context_start = max(0, i - 2)
context_end = min(len(lines), i + 3)
- context = '\n'.join(lines[context_start:context_end])
-
+ context = '\n'.join(
+ lines[context_start:context_end])
+
results.append(f"""Match in {file_id}.md:
Context:
{context}
---""")
-
+
if not results:
return f"No matches found for query: {query}"
return "\n\n".join(results)
@@ -229,16 +236,17 @@ async def search_by_tag(self, tag: str) -> str:
for json_file in self.base_path.glob("*.json"):
file_id = json_file.stem
metadata = await self.get_metadata(file_id)
-
+
# Check both metadata.tags and top-level tags
- tags = metadata.get('metadata', {}).get('tags', []) + metadata.get('tags', [])
-
+ tags = metadata.get('metadata', {}).get(
+ 'tags', []) + metadata.get('tags', [])
+
if tag.lower() in [t.lower() for t in tags]:
results.append(f"""File: {file_id}.md
Tags: {', '.join(tags)}
Last modified: {metadata.get('timestamp', 'Unknown')}
---""")
-
+
if not results:
return f"No files found with tag: {tag}"
return "\n\n".join(results)
@@ -254,25 +262,26 @@ async def get_stats(self) -> str:
total_chars = 0
files_by_month = {}
all_tags = set()
-
+
for json_file in self.base_path.glob("*.json"):
file_id = json_file.stem
metadata = await self.get_metadata(file_id)
-
+
total_files += 1
total_words += metadata.get('stats', {}).get('wordCount', 0)
total_chars += metadata.get('stats', {}).get('charCount', 0)
-
+
# Extract month from timestamp
timestamp = metadata.get('timestamp', '')
if timestamp:
month = timestamp[:7] # YYYY-MM
files_by_month[month] = files_by_month.get(month, 0) + 1
-
+
# Collect all tags
- tags = metadata.get('metadata', {}).get('tags', []) + metadata.get('tags', [])
+ tags = metadata.get('metadata', {}).get(
+ 'tags', []) + metadata.get('tags', [])
all_tags.update(tags)
-
+
stats = f"""Markdown Files Statistics:
Total Files: {total_files}
@@ -290,13 +299,14 @@ async def get_stats(self) -> str:
logger.error(f"Error getting stats: {e}")
return f"Error getting statistics: {str(e)}"
+
class MarkdownEventHandler(FileSystemEventHandler):
"""Handles file system events for markdown files."""
-
+
def __init__(self, store: MarkdownStore, loop: asyncio.AbstractEventLoop):
self.store = store
self.loop = loop
-
+
def sync_file(self, path: str):
"""Sync a file when it's created or modified."""
if path.endswith(('.md', '.json')):
@@ -305,29 +315,32 @@ def sync_file(self, path: str):
self.store.sync_file(file_id),
self.loop
)
-
+
def on_created(self, event):
"""Handle file creation."""
if not event.is_directory:
self.sync_file(event.src_path)
-
+
def on_modified(self, event):
"""Handle file modification."""
if not event.is_directory:
self.sync_file(event.src_path)
+
class FastMarkdownServer:
"""MCP server for markdown content management."""
-
+
def __init__(self, storage_path: str):
- self.server = Server("fast-markdown", version="1.0.0") # Set default version
+ # Set default version
+ self.server = Server("fast-markdown", version="1.0.0")
self.store = MarkdownStore(storage_path)
self.loop = asyncio.get_event_loop()
self.event_handler = MarkdownEventHandler(self.store, self.loop)
self.observer = Observer()
- self.observer.schedule(self.event_handler, storage_path, recursive=False)
+ self.observer.schedule(
+ self.event_handler, storage_path, recursive=False)
self.setup_handlers()
-
+
def setup_handlers(self):
"""Set up request handlers."""
@self.server.list_resources()
@@ -350,14 +363,14 @@ async def read_resource(uri: str) -> str:
"""Read resource content."""
if not uri.startswith("markdown://"):
raise ValueError(f"Invalid resource URI: {uri}")
-
+
parts = uri.split("/")
if len(parts) != 4 or parts[3] not in ["content", "metadata"]:
raise ValueError(f"Invalid resource URI format: {uri}")
-
+
file_id = parts[2]
resource_type = parts[3]
-
+
if resource_type == "content":
return await self.store.get_content(file_id)
else:
@@ -537,10 +550,10 @@ async def run(self):
logger.info("Starting server...")
# Start the file observer
self.observer.start()
-
+
# Initial sync of all files
await self.store.sync_all_files()
-
+
try:
# Keep the server running
while True:
@@ -560,51 +573,65 @@ async def run(self):
self.observer.join()
logger.info("Server shutdown complete")
+
def setup_logging():
"""Configure logging."""
# Get the project root directory
root_dir = Path(__file__).parents[3].resolve()
log_dir = root_dir / "logs"
log_dir.mkdir(exist_ok=True)
-
+
# Use absolute path for log file
log_path = log_dir / "mcp.log"
-
+
# Configure file handler for all logs
file_handler = logging.FileHandler(str(log_path))
file_handler.setLevel(logging.INFO)
- file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
-
+ file_handler.setFormatter(logging.Formatter(
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+
# Configure console handler with higher log level to reduce noise
console_handler = logging.StreamHandler()
- console_handler.setLevel(logging.WARNING) # Only show WARNING and above in console
- console_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
-
+ # Only show WARNING and above in console
+ console_handler.setLevel(logging.WARNING)
+ console_handler.setFormatter(
+ logging.Formatter('%(levelname)s: %(message)s'))
+
# Configure root logger
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.addHandler(file_handler)
root_logger.addHandler(console_handler)
-
+
# Specifically set higher log level for MCP SDK's internal logging
mcp_logger = logging.getLogger('mcp.server.lowlevel')
mcp_logger.setLevel(logging.WARNING)
+
def handle_sigterm(signum, frame):
"""Handle SIGTERM signal."""
logger.info("Received shutdown signal")
sys.exit(0)
+
async def main() -> None:
"""Main entry point."""
- if len(sys.argv) != 2:
- print("Usage: fast-markdown-mcp ")
- sys.exit(1)
-
setup_logging()
signal.signal(signal.SIGTERM, handle_sigterm)
- storage_path = sys.argv[1]
-
+
+ # Get storage path from command line arguments or environment variable
+ if len(sys.argv) >= 2:
+ storage_path = sys.argv[1]
+ logger.info(f"Using storage path from command line: {storage_path}")
+ else:
+ # Try to get from environment variable with default fallback
+ storage_path = os.environ.get('STORAGE_PATH', '/app/storage/markdown')
+ logger.info(
+ f"Using storage path from environment variable: {storage_path}")
+
+ # Create the directory if it doesn't exist
+ Path(storage_path).mkdir(parents=True, exist_ok=True)
+
try:
server = FastMarkdownServer(storage_path)
logger.info(f"Starting server with storage path: {storage_path}")
@@ -619,4 +646,4 @@ async def main() -> None:
if __name__ == "__main__":
import asyncio
- asyncio.run(main())
\ No newline at end of file
+ asyncio.run(main())
diff --git a/scripts/docker/docker-start.sh b/scripts/docker/docker-start.sh
index adc0ffa..8bdb843 100755
--- a/scripts/docker/docker-start.sh
+++ b/scripts/docker/docker-start.sh
@@ -10,11 +10,28 @@ NC='\033[0m' # No Color
ROOT_DIR="$(pwd)"
echo -e "${BLUE}Project root directory: ${ROOT_DIR}${NC}"
+# Get storage path from .env file if it exists, otherwise use default
+STORAGE_PATH="storage/markdown"
+if [ -f ".env" ]; then
+ STORAGE_PATH_FROM_ENV=$(grep -e "^STORAGE_PATH=" .env | cut -d '=' -f 2)
+ if [ ! -z "$STORAGE_PATH_FROM_ENV" ]; then
+ STORAGE_PATH="$STORAGE_PATH_FROM_ENV"
+ fi
+fi
+
+# If not absolute, make it relative to current directory
+if [[ "$STORAGE_PATH" != /* ]]; then
+ STORAGE_PATH="$ROOT_DIR/$STORAGE_PATH"
+fi
+
+echo -e "${BLUE}Using storage path: $STORAGE_PATH${NC}"
+export STORAGE_PATH
+
# Create necessary directories with proper permissions
mkdir -p logs
-mkdir -p storage/markdown
+mkdir -p "$STORAGE_PATH"
mkdir -p crawl_results
-chmod -R 777 logs storage crawl_results
+chmod -R 777 logs "$STORAGE_PATH" crawl_results
# Start Docker containers
echo -e "${BLUE}Starting Docker containers...${NC}"
diff --git a/scripts/general/start.sh b/scripts/general/start.sh
index ae39494..cabb70c 100755
--- a/scripts/general/start.sh
+++ b/scripts/general/start.sh
@@ -81,9 +81,29 @@ cd "$ROOT_DIR"
# Activate MCP server's virtual environment and start it
echo -e "${BLUE}Starting MCP server...${NC}"
source fast-markdown-mcp/venv/bin/activate
+
+# Get storage path from .env file if it exists, otherwise use default
+STORAGE_PATH="$ROOT_DIR/storage/markdown"
+if [ -f ".env" ]; then
+ # Extract STORAGE_PATH from .env file
+ ENV_STORAGE_PATH=$(grep -e "^STORAGE_PATH=" .env | cut -d '=' -f 2)
+ if [ ! -z "$ENV_STORAGE_PATH" ]; then
+ # If path is relative (doesn't start with /), prepend ROOT_DIR
+ if [[ "$ENV_STORAGE_PATH" != /* ]]; then
+ STORAGE_PATH="$ROOT_DIR/$ENV_STORAGE_PATH"
+ else
+ STORAGE_PATH="$ENV_STORAGE_PATH"
+ fi
+ fi
+fi
+
+# Create storage directory if it doesn't exist
+mkdir -p "$STORAGE_PATH"
+echo -e "${BLUE}Using storage path: $STORAGE_PATH${NC}"
+
PYTHONPATH="$ROOT_DIR/fast-markdown-mcp/src" \
fast-markdown-mcp/venv/bin/python -m fast_markdown_mcp.server \
- "$ROOT_DIR/storage/markdown" > logs/mcp.log 2>&1 &
+ "$STORAGE_PATH" > logs/mcp.log 2>&1 &
MCP_PID=$!
# Wait for services to be ready
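For a local (non-Docker) run, the net effect of these script changes can be sketched as the manual invocation below; the storage path is a hypothetical example, and the venv/logs layout is the one start.sh already sets up:

```bash
# resolve a custom storage location and launch the MCP server, mirroring start.sh
STORAGE_PATH=/data/devdocs/markdown   # hypothetical absolute path
mkdir -p "$STORAGE_PATH"
PYTHONPATH="$PWD/fast-markdown-mcp/src" \
  fast-markdown-mcp/venv/bin/python -m fast_markdown_mcp.server "$STORAGE_PATH" > logs/mcp.log 2>&1 &
```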
From 4baa6c3a5df5a91999ea7c21e50816bdc0d11b0e Mon Sep 17 00:00:00 2001
From: Christopher Madison
Date: Wed, 19 Mar 2025 12:09:44 -0400
Subject: [PATCH 2/4] fix: use configurable storage path in download route
This commit fixes an inconsistency in how storage paths are handled between
different parts of the application. Previously, the download route was using
a hardcoded storage path while the main API route was using the configurable
path from environment variables.
The change ensures the download route uses the same storage path resolution
logic, allowing files to be properly accessed when using custom storage
locations. This resolves the issue where consolidated files weren't appearing
in the frontend after setting a custom STORAGE_PATH.
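The resolution rule both routes now share looks like this (a sketch excerpted from app/api/storage/route.ts, with the import added for context):

```typescript
import path from "path";

// Relative STORAGE_PATH values resolve against the working directory,
// absolute values are used as-is, with an in-project default as fallback.
const getStoragePath = () => {
  const configuredPath = process.env.STORAGE_PATH || "storage/markdown";

  if (path.isAbsolute(configuredPath)) {
    return configuredPath;
  }

  return path.join(process.cwd(), configuredPath);
};
```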
---
app/api/storage/download/route.ts | 108 ++++++++++++++++++------------
1 file changed, 66 insertions(+), 42 deletions(-)
diff --git a/app/api/storage/download/route.ts b/app/api/storage/download/route.ts
index 384880f..ba28d78 100644
--- a/app/api/storage/download/route.ts
+++ b/app/api/storage/download/route.ts
@@ -1,92 +1,116 @@
-import { NextResponse } from 'next/server'
-import fs from 'fs/promises'
-import path from 'path'
+import { NextResponse } from "next/server";
+import fs from "fs/promises";
+import path from "path";
+
+// Get storage directory from environment variable with fallback to default
+const getStoragePath = () => {
+ const configuredPath = process.env.STORAGE_PATH || "storage/markdown";
+
+ // If it's an absolute path, use it directly
+ if (path.isAbsolute(configuredPath)) {
+ return configuredPath;
+ }
+
+ // Otherwise, treat as relative to process.cwd()
+ return path.join(process.cwd(), configuredPath);
+};
+
+const STORAGE_DIR = getStoragePath();
export async function GET(request: Request) {
try {
- const { searchParams } = new URL(request.url)
- const filePath = searchParams.get('path')
+ const { searchParams } = new URL(request.url);
+ const filePath = searchParams.get("path");
if (!filePath) {
return NextResponse.json(
- { success: false, error: 'No file path provided' },
+ { success: false, error: "No file path provided" },
{ status: 400 }
- )
+ );
}
- console.log(`Download requested for file: ${filePath}`)
+ console.log(`Download requested for file: ${filePath}`);
// Security check to ensure the path is within the storage directory
- const storagePath = path.join(process.cwd(), 'storage/markdown')
- const normalizedPath = path.normalize(filePath)
- if (!normalizedPath.startsWith(storagePath)) {
- console.error(`Security check failed: ${normalizedPath} is outside of ${storagePath}`)
+ const normalizedPath = path.normalize(filePath);
+ if (!normalizedPath.startsWith(STORAGE_DIR)) {
+ console.error(
+ `Security check failed: ${normalizedPath} is outside of ${STORAGE_DIR}`
+ );
return NextResponse.json(
- { success: false, error: 'Invalid file path' },
+ { success: false, error: "Invalid file path" },
{ status: 403 }
- )
+ );
}
// Check if file exists
try {
- await fs.access(normalizedPath)
+ await fs.access(normalizedPath);
} catch {
- console.error(`File not found: ${normalizedPath}`)
+ console.error(`File not found: ${normalizedPath}`);
return NextResponse.json(
- { success: false, error: 'File not found' },
+ { success: false, error: "File not found" },
{ status: 404 }
- )
+ );
}
// Read the file
- const content = await fs.readFile(normalizedPath, 'utf-8')
- const fileSize = Buffer.byteLength(content, 'utf8')
- console.log(`File read successfully: ${normalizedPath} (${fileSize} bytes)`)
+ const content = await fs.readFile(normalizedPath, "utf-8");
+ const fileSize = Buffer.byteLength(content, "utf8");
+ console.log(
+ `File read successfully: ${normalizedPath} (${fileSize} bytes)`
+ );
// If it's a JSON file, verify it's valid JSON and check if it's a consolidated file
- if (path.extname(filePath) === '.json') {
+ if (path.extname(filePath) === ".json") {
try {
- const jsonData = JSON.parse(content)
-
+ const jsonData = JSON.parse(content);
+
// Check if this is a consolidated file
if (jsonData.pages && Array.isArray(jsonData.pages)) {
- console.log(`Consolidated JSON file detected with ${jsonData.pages.length} pages`)
+ console.log(
+ `Consolidated JSON file detected with ${jsonData.pages.length} pages`
+ );
}
} catch (e) {
- console.error(`Invalid JSON file: ${normalizedPath}`, e)
+ console.error(`Invalid JSON file: ${normalizedPath}`, e);
return NextResponse.json(
- { success: false, error: 'Invalid JSON file' },
+ { success: false, error: "Invalid JSON file" },
{ status: 500 }
- )
+ );
}
- } else if (path.extname(filePath) === '.md') {
+ } else if (path.extname(filePath) === ".md") {
// For markdown files, check if it's a consolidated file by looking for section markers
- const sectionMatches = content.match(/## .+\nURL: .+/g)
+ const sectionMatches = content.match(/## .+\nURL: .+/g);
if (sectionMatches && sectionMatches.length > 0) {
- console.log(`Consolidated Markdown file detected with ${sectionMatches.length} sections`)
+ console.log(
+ `Consolidated Markdown file detected with ${sectionMatches.length} sections`
+ );
}
}
-
+
// Determine content type based on file extension
- const contentType = path.extname(filePath) === '.json'
- ? 'application/json'
- : 'text/markdown'
+ const contentType =
+ path.extname(filePath) === ".json" ? "application/json" : "text/markdown";
// Create response with appropriate headers for download
return new NextResponse(content, {
headers: {
- 'Content-Type': contentType,
- 'Content-Disposition': `attachment; filename="${path.basename(filePath)}"`,
+ "Content-Type": contentType,
+ "Content-Disposition": `attachment; filename="${path.basename(
+ filePath
+ )}"`,
},
- })
+ });
} catch (error) {
- console.error('Error downloading file:', error)
+ console.error("Error downloading file:", error);
return NextResponse.json(
{
success: false,
- error: error instanceof Error ? error.message : 'Failed to download file'
+ error:
+ error instanceof Error ? error.message : "Failed to download file",
},
{ status: 500 }
- )
+ );
}
-}
\ No newline at end of file
+}
From f7b73a4ae96705f5e0c620c0c005849bd180c91b Mon Sep 17 00:00:00 2001
From: Christopher Madison
Date: Wed, 19 Mar 2025 12:45:22 -0400
Subject: [PATCH 3/4] fix(docker): add storage volume mount to frontend
container
The frontend container was trying to access a storage path that doesn't exist
within the container's filesystem, causing ENOENT errors when listing files.
This change:
- Adds a volume mount to map host storage to the frontend container
- Sets consistent STORAGE_PATH environment variable
- Ensures all containers (frontend, backend, MCP) share storage access
---
app/api/storage/route.ts | 117 +++++++++++++++++++++++++++---
docker/compose/docker-compose.yml | 3 +
2 files changed, 111 insertions(+), 9 deletions(-)
diff --git a/app/api/storage/route.ts b/app/api/storage/route.ts
index 5897df6..d53f12f 100644
--- a/app/api/storage/route.ts
+++ b/app/api/storage/route.ts
@@ -12,7 +12,17 @@ const getStoragePath = () => {
}
// Otherwise, treat as relative to process.cwd()
- return path.join(process.cwd(), configuredPath);
+ const storagePath = path.join(process.cwd(), configuredPath);
+
+ // TEMP-DEBUG: Log the storage path being used
+ console.log("TEMP-DEBUG: Storage path resolved to:", storagePath);
+ console.log("TEMP-DEBUG: Current working directory:", process.cwd());
+ console.log(
+ "TEMP-DEBUG: Environment STORAGE_PATH:",
+ process.env.STORAGE_PATH
+ );
+
+ return storagePath;
};
const STORAGE_DIR = getStoragePath();
@@ -49,17 +59,47 @@ export async function POST(request: Request) {
export async function GET(request: Request) {
try {
+ // TEMP-DEBUG: Log the request URL
+ console.log("TEMP-DEBUG: GET request URL:", request.url);
+
const { searchParams } = new URL(request.url);
const url = searchParams.get("url");
+ console.log("TEMP-DEBUG: URL param:", url);
+ console.log("TEMP-DEBUG: STORAGE_DIR:", STORAGE_DIR);
+
// Handle list request
if (!url) {
- // Only get .md files
- const files = await fs.readdir(STORAGE_DIR);
- const mdFiles = files.filter((f) => f.endsWith(".md"));
- const jsonFiles = files.filter((f) => f.endsWith(".json"));
+ // TEMP-DEBUG: Listing files
+ console.log("TEMP-DEBUG: Listing files from directory:", STORAGE_DIR);
+
+ try {
+ // Only get .md files
+ const files = await fs.readdir(STORAGE_DIR);
+ console.log("TEMP-DEBUG: Directory contents:", files);
+
+ const mdFiles = files.filter((f) => f.endsWith(".md"));
+ const jsonFiles = files.filter((f) => f.endsWith(".json"));
+
+ console.log("TEMP-DEBUG: Found MD files:", mdFiles.length);
+ console.log("TEMP-DEBUG: Found JSON files:", jsonFiles.length);
+ } catch (dirError) {
+ console.error("TEMP-DEBUG: Error reading directory:", dirError);
+ return NextResponse.json(
+ {
+ success: false,
+ error: `Failed to read storage directory: ${dirError.message}`,
+ debug: {
+ storageDir: STORAGE_DIR,
+ exists: false,
+ },
+ },
+ { status: 500 }
+ );
+ }
// Get disk files
+ console.log("TEMP-DEBUG: Starting to process disk files");
const diskFileDetails = await Promise.all(
mdFiles.map(async (filename) => {
const mdPath = path.join(STORAGE_DIR, filename);
@@ -67,8 +107,16 @@ export async function GET(request: Request) {
STORAGE_DIR,
filename.replace(".md", ".json")
);
+
+ console.log(`TEMP-DEBUG: Processing file: ${filename}`);
+ console.log(`TEMP-DEBUG: MD Path: ${mdPath}`);
+ console.log(`TEMP-DEBUG: JSON Path: ${jsonPath}`);
+
const stats = await fs.stat(mdPath);
const content = await fs.readFile(mdPath, "utf-8");
+ console.log(
+ `TEMP-DEBUG: File size: ${stats.size}, Modified: ${stats.mtime}`
+ );
// Check if this is a consolidated file by examining the JSON metadata
let isConsolidated = false;
@@ -233,31 +281,82 @@ export async function GET(request: Request) {
// Combine disk and memory files
const allFiles = [...diskFileDetails, ...memoryFiles];
+ console.log("TEMP-DEBUG: All files combined count:", allFiles.length);
+
+ // Log details of each file
+ console.log("TEMP-DEBUG: File details:");
+ allFiles.forEach((file, index) => {
+ console.log(`TEMP-DEBUG: File ${index + 1}:`, {
+ name: file.name,
+ isConsolidated: file.isConsolidated,
+ pagesCount: file.pagesCount,
+ isInMemory: file.isInMemory,
+ size: file.size,
+ });
+ });
// Filter out individual files (non-consolidated files)
// Only show consolidated files in the Stored Files section
const consolidatedFiles = allFiles.filter((file) => file.isConsolidated);
+ console.log(
+ "TEMP-DEBUG: Consolidated files count:",
+ consolidatedFiles.length
+ );
// Additional filter to exclude files with UUID-like names
// UUID pattern: 8-4-4-4-12 hex digits (e.g., 095104d8-8e90-48f0-8670-9e45c914f115)
const uuidPattern =
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+ // Log consolidated files before UUID filtering
+ console.log("TEMP-DEBUG: Consolidated files before UUID filtering:");
+ consolidatedFiles.forEach((file, index) => {
+ console.log(`TEMP-DEBUG: Consolidated file ${index + 1}:`, {
+ name: file.name,
+ isUUID: uuidPattern.test(file.name),
+ pagesCount: file.pagesCount,
+ });
+ });
+
// Keep only files with domain-like names (e.g., docs_crawl4ai_com)
// These are files created through the crawling process
const crawledFiles = consolidatedFiles.filter((file) => {
// Check if the filename is NOT a UUID
- return !uuidPattern.test(file.name);
+ const isNotUUID = !uuidPattern.test(file.name);
+ console.log(
+ `TEMP-DEBUG: File ${file.name} is ${isNotUUID ? "NOT" : "IS"} a UUID`
+ );
+ return isNotUUID;
});
console.log(
- `Found ${consolidatedFiles.length} consolidated files, ${crawledFiles.length} are crawled files`
+ `TEMP-DEBUG: Found ${consolidatedFiles.length} consolidated files, ${crawledFiles.length} are crawled files`
);
- return NextResponse.json({
+ // Log the final files being returned to the frontend
+ console.log("TEMP-DEBUG: Final files being returned to frontend:");
+ crawledFiles.forEach((file, index) => {
+ console.log(`TEMP-DEBUG: Returned file ${index + 1}:`, {
+ name: file.name,
+ isConsolidated: file.isConsolidated,
+ pagesCount: file.pagesCount,
+ size: file.size,
+ rootUrl: file.rootUrl || "none",
+ });
+ });
+
+ // Return the files
+ const response = {
success: true,
files: crawledFiles,
- });
+ };
+
+ console.log(
+ "TEMP-DEBUG: Final response:",
+ JSON.stringify(response, null, 2).substring(0, 1000) + "..."
+ );
+
+ return NextResponse.json(response);
}
// Handle single file request
diff --git a/docker/compose/docker-compose.yml b/docker/compose/docker-compose.yml
index d9c31a5..fc24a06 100644
--- a/docker/compose/docker-compose.yml
+++ b/docker/compose/docker-compose.yml
@@ -6,9 +6,12 @@ services:
container_name: devdocs-frontend
ports:
- "3001:3001"
+ volumes:
+ - ${STORAGE_PATH:-./storage/markdown}:/app/storage/markdown
environment:
- BACKEND_URL=http://backend:24125
- MCP_HOST=mcp
+ - STORAGE_PATH=/app/storage/markdown
depends_on:
- backend
networks:
From cec71c0ddde5275ed1da5bc3fd39c1e8df8f12d0 Mon Sep 17 00:00:00 2001
From: Christopher Madison
Date: Wed, 19 Mar 2025 13:03:59 -0400
Subject: [PATCH 4/4] fix(docker): configure storage volume mount for frontend
container
The frontend container was attempting to access files in a directory path
that existed on the host but not inside the container, resulting in
"ENOENT: no such file or directory" errors when scanning the storage directory.
Changes:
- Add volume mount to frontend container mapping host storage to container path
- Set consistent STORAGE_PATH environment variable across all containers
- Fix syntax errors and improve error handling in storage/route.ts
- Clean up temporary debug statements after diagnosis
This ensures all containers (frontend, backend, MCP) share access to the same
storage directory, allowing files created by one container to be accessible
by others, which fixes file listing and retrieval in the UI.
---
app/api/storage/route.ts | 364 ++++++++++++++++-----------------------
1 file changed, 153 insertions(+), 211 deletions(-)
diff --git a/app/api/storage/route.ts b/app/api/storage/route.ts
index d53f12f..aa28437 100644
--- a/app/api/storage/route.ts
+++ b/app/api/storage/route.ts
@@ -13,15 +13,6 @@ const getStoragePath = () => {
// Otherwise, treat as relative to process.cwd()
const storagePath = path.join(process.cwd(), configuredPath);
-
- // TEMP-DEBUG: Log the storage path being used
- console.log("TEMP-DEBUG: Storage path resolved to:", storagePath);
- console.log("TEMP-DEBUG: Current working directory:", process.cwd());
- console.log(
- "TEMP-DEBUG: Environment STORAGE_PATH:",
- process.env.STORAGE_PATH
- );
-
return storagePath;
};
@@ -59,32 +50,22 @@ export async function POST(request: Request) {
export async function GET(request: Request) {
try {
- // TEMP-DEBUG: Log the request URL
- console.log("TEMP-DEBUG: GET request URL:", request.url);
-
const { searchParams } = new URL(request.url);
const url = searchParams.get("url");
- console.log("TEMP-DEBUG: URL param:", url);
- console.log("TEMP-DEBUG: STORAGE_DIR:", STORAGE_DIR);
-
// Handle list request
if (!url) {
- // TEMP-DEBUG: Listing files
- console.log("TEMP-DEBUG: Listing files from directory:", STORAGE_DIR);
+ // Initialize these variables outside the try block so they're accessible in the outer scope
+ let mdFiles = [];
+ let jsonFiles = [];
+ let diskFileDetails = [];
try {
// Only get .md files
const files = await fs.readdir(STORAGE_DIR);
- console.log("TEMP-DEBUG: Directory contents:", files);
-
- const mdFiles = files.filter((f) => f.endsWith(".md"));
- const jsonFiles = files.filter((f) => f.endsWith(".json"));
-
- console.log("TEMP-DEBUG: Found MD files:", mdFiles.length);
- console.log("TEMP-DEBUG: Found JSON files:", jsonFiles.length);
+ mdFiles = files.filter((f) => f.endsWith(".md"));
+ jsonFiles = files.filter((f) => f.endsWith(".json"));
} catch (dirError) {
- console.error("TEMP-DEBUG: Error reading directory:", dirError);
return NextResponse.json(
{
success: false,
@@ -99,141 +80,149 @@ export async function GET(request: Request) {
}
// Get disk files
- console.log("TEMP-DEBUG: Starting to process disk files");
- const diskFileDetails = await Promise.all(
- mdFiles.map(async (filename) => {
- const mdPath = path.join(STORAGE_DIR, filename);
- const jsonPath = path.join(
- STORAGE_DIR,
- filename.replace(".md", ".json")
- );
-
- console.log(`TEMP-DEBUG: Processing file: ${filename}`);
- console.log(`TEMP-DEBUG: MD Path: ${mdPath}`);
- console.log(`TEMP-DEBUG: JSON Path: ${jsonPath}`);
-
- const stats = await fs.stat(mdPath);
- const content = await fs.readFile(mdPath, "utf-8");
- console.log(
- `TEMP-DEBUG: File size: ${stats.size}, Modified: ${stats.mtime}`
- );
-
- // Check if this is a consolidated file by examining the JSON metadata
- let isConsolidated = false;
- let pagesCount = 0;
- let rootUrl = "";
+ try {
+ diskFileDetails = await Promise.all(
+ mdFiles.map(async (filename) => {
+ const mdPath = path.join(STORAGE_DIR, filename);
+ const jsonPath = path.join(
+ STORAGE_DIR,
+ filename.replace(".md", ".json")
+ );
- if (jsonFiles.includes(filename.replace(".md", ".json"))) {
try {
- const jsonContent = await fs.readFile(jsonPath, "utf-8");
- const metadata = JSON.parse(jsonContent);
-
- // If the metadata has a "pages" array or is_consolidated flag, it's a consolidated file
- if (
- (metadata.pages && Array.isArray(metadata.pages)) ||
- metadata.is_consolidated === true
- ) {
- isConsolidated = true;
- pagesCount = metadata.pages ? metadata.pages.length : 1;
- rootUrl = metadata.root_url || "";
- }
- } catch (e) {
- console.error(`Error reading JSON metadata for ${filename}:`, e);
- // Create a default metadata file if it doesn't exist or is invalid
+ const stats = await fs.stat(mdPath);
+ let content = "";
try {
- const defaultMetadata = {
- title: `Documentation for ${filename.replace(".md", "")}`,
- timestamp: new Date().toISOString(),
- pages: [
- {
- title: "Main Content",
- url: `file://${filename.replace(".md", "")}`,
+ content = await fs.readFile(mdPath, "utf-8");
+ } catch (readError) {
+ content = ""; // Default to empty content if file can't be read
+ }
+
+ // Check if this is a consolidated file by examining the JSON metadata
+ let isConsolidated = false;
+ let pagesCount = 0;
+ let rootUrl = "";
+
+ if (jsonFiles.includes(filename.replace(".md", ".json"))) {
+ try {
+ const jsonContent = await fs.readFile(jsonPath, "utf-8");
+ const metadata = JSON.parse(jsonContent);
+
+ // If the metadata has a "pages" array or is_consolidated flag, it's a consolidated file
+ if (
+ (metadata.pages && Array.isArray(metadata.pages)) ||
+ metadata.is_consolidated === true
+ ) {
+ isConsolidated = true;
+ pagesCount = metadata.pages ? metadata.pages.length : 1;
+ rootUrl = metadata.root_url || "";
+ }
+ } catch (e) {
+ // Create a default metadata file if it doesn't exist or is invalid
+ try {
+ const defaultMetadata = {
+ title: `Documentation for ${filename.replace(".md", "")}`,
timestamp: new Date().toISOString(),
- internal_links: 0,
- external_links: 0,
+ pages: [
+ {
+ title: "Main Content",
+ url: `file://${filename.replace(".md", "")}`,
+ timestamp: new Date().toISOString(),
+ internal_links: 0,
+ external_links: 0,
+ },
+ ],
+ is_consolidated: true,
+ last_updated: new Date().toISOString(),
+ };
+ await fs.writeFile(
+ jsonPath,
+ JSON.stringify(defaultMetadata, null, 2),
+ "utf-8"
+ );
+ isConsolidated = true;
+ pagesCount = 1;
+ } catch (writeError) {
+ // Failed to create metadata file, continue with default values
+ }
+ }
+ } else {
+ // Create JSON file if it doesn't exist
+ try {
+ // Create a consolidated metadata file by default
+ const defaultMetadata = {
+ title: `Documentation for ${filename.replace(".md", "")}`,
+ timestamp: new Date().toISOString(),
+ content,
+ pages: [
+ {
+ title: "Main Content",
+ url: `file://${filename.replace(".md", "")}`,
+ timestamp: new Date().toISOString(),
+ internal_links: 0,
+ external_links: 0,
+ },
+ ],
+ is_consolidated: true,
+ last_updated: new Date().toISOString(),
+ metadata: {
+ wordCount: content.split(/\s+/).length,
+ charCount: content.length,
+ timestamp: stats.mtime,
},
- ],
- is_consolidated: true,
- last_updated: new Date().toISOString(),
- };
- await fs.writeFile(
- jsonPath,
- JSON.stringify(defaultMetadata, null, 2),
- "utf-8"
- );
- console.log(`Created default metadata for ${filename}`);
- isConsolidated = true;
- pagesCount = 1;
- } catch (writeError) {
- console.error(
- `Error creating default metadata for ${filename}:`,
- writeError
- );
+ };
+ await fs.writeFile(
+ jsonPath,
+ JSON.stringify(defaultMetadata, null, 2),
+ "utf-8"
+ );
+ isConsolidated = true;
+ pagesCount = 1;
+ } catch (writeError) {
+ // Failed to create metadata file, continue with default values
+ }
}
- }
- } else {
- // Create JSON file if it doesn't exist
- try {
- // Create a consolidated metadata file by default
- const defaultMetadata = {
- title: `Documentation for ${filename.replace(".md", "")}`,
- timestamp: new Date().toISOString(),
- content,
- pages: [
- {
- title: "Main Content",
- url: `file://${filename.replace(".md", "")}`,
- timestamp: new Date().toISOString(),
- internal_links: 0,
- external_links: 0,
- },
- ],
- is_consolidated: true,
- last_updated: new Date().toISOString(),
- metadata: {
- wordCount: content.split(/\s+/).length,
- charCount: content.length,
- timestamp: stats.mtime,
- },
+
+ // Extract sections to count how many pages are included
+ if (!pagesCount && isConsolidated) {
+ // Count sections that start with "## " and have a URL: line after them
+ const sectionMatches = content.match(/## .+\nURL: .+/g);
+ pagesCount = sectionMatches ? sectionMatches.length : 0;
+ }
+
+ return {
+ name: filename.replace(".md", ""),
+ jsonPath,
+ markdownPath: mdPath,
+ timestamp: stats.mtime,
+ size: stats.size,
+ wordCount: content.split(/\s+/).length,
+ charCount: content.length,
+ isConsolidated,
+ pagesCount: isConsolidated ? pagesCount : 1,
+ rootUrl: rootUrl || "",
+ isInMemory: false,
};
- await fs.writeFile(
+ } catch (error) {
+ return {
+ name: filename.replace(".md", ""),
jsonPath,
- JSON.stringify(defaultMetadata, null, 2),
- "utf-8"
- );
- console.log(`Created consolidated metadata for ${filename}`);
- isConsolidated = true;
- pagesCount = 1;
- } catch (writeError) {
- console.error(
- `Error creating metadata for ${filename}:`,
- writeError
- );
+ markdownPath: mdPath,
+ timestamp: new Date(),
+ size: 0,
+ wordCount: 0,
+ charCount: 0,
+ isConsolidated: false,
+ pagesCount: 0,
+ rootUrl: "",
+ isInMemory: false,
+ };
}
- }
-
- // Extract sections to count how many pages are included
- if (!pagesCount && isConsolidated) {
- // Count sections that start with "## " and have a URL: line after them
- const sectionMatches = content.match(/## .+\nURL: .+/g);
- pagesCount = sectionMatches ? sectionMatches.length : 0;
- }
-
- return {
- name: filename.replace(".md", ""),
- jsonPath,
- markdownPath: mdPath,
- timestamp: stats.mtime,
- size: stats.size,
- wordCount: content.split(/\s+/).length,
- charCount: content.length,
- isConsolidated,
- pagesCount: isConsolidated ? pagesCount : 1,
- rootUrl: rootUrl || "",
- isInMemory: false,
- };
- })
- );
+ })
+ );
+ } catch (promiseError) {
+ diskFileDetails = [];
+ }
// Define interface for in-memory file
interface MemoryFile {
@@ -276,100 +265,53 @@ export async function GET(request: Request) {
}
}
} catch (e) {
- console.error("Error fetching in-memory files:", e);
+ // Error fetching in-memory files, continue with empty array
}
// Combine disk and memory files
const allFiles = [...diskFileDetails, ...memoryFiles];
- console.log("TEMP-DEBUG: All files combined count:", allFiles.length);
-
- // Log details of each file
- console.log("TEMP-DEBUG: File details:");
- allFiles.forEach((file, index) => {
- console.log(`TEMP-DEBUG: File ${index + 1}:`, {
- name: file.name,
- isConsolidated: file.isConsolidated,
- pagesCount: file.pagesCount,
- isInMemory: file.isInMemory,
- size: file.size,
- });
- });
// Filter out individual files (non-consolidated files)
// Only show consolidated files in the Stored Files section
const consolidatedFiles = allFiles.filter((file) => file.isConsolidated);
- console.log(
- "TEMP-DEBUG: Consolidated files count:",
- consolidatedFiles.length
- );
// Additional filter to exclude files with UUID-like names
// UUID pattern: 8-4-4-4-12 hex digits (e.g., 095104d8-8e90-48f0-8670-9e45c914f115)
const uuidPattern =
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
- // Log consolidated files before UUID filtering
- console.log("TEMP-DEBUG: Consolidated files before UUID filtering:");
- consolidatedFiles.forEach((file, index) => {
- console.log(`TEMP-DEBUG: Consolidated file ${index + 1}:`, {
- name: file.name,
- isUUID: uuidPattern.test(file.name),
- pagesCount: file.pagesCount,
- });
- });
-
// Keep only files with domain-like names (e.g., docs_crawl4ai_com)
// These are files created through the crawling process
const crawledFiles = consolidatedFiles.filter((file) => {
// Check if the filename is NOT a UUID
const isNotUUID = !uuidPattern.test(file.name);
- console.log(
- `TEMP-DEBUG: File ${file.name} is ${isNotUUID ? "NOT" : "IS"} a UUID`
- );
return isNotUUID;
});
- console.log(
- `TEMP-DEBUG: Found ${consolidatedFiles.length} consolidated files, ${crawledFiles.length} are crawled files`
- );
-
- // Log the final files being returned to the frontend
- console.log("TEMP-DEBUG: Final files being returned to frontend:");
- crawledFiles.forEach((file, index) => {
- console.log(`TEMP-DEBUG: Returned file ${index + 1}:`, {
- name: file.name,
- isConsolidated: file.isConsolidated,
- pagesCount: file.pagesCount,
- size: file.size,
- rootUrl: file.rootUrl || "none",
- });
- });
-
// Return the files
const response = {
success: true,
files: crawledFiles,
};
- console.log(
- "TEMP-DEBUG: Final response:",
- JSON.stringify(response, null, 2).substring(0, 1000) + "..."
- );
-
return NextResponse.json(response);
}
// Handle single file request
- const filename =
- url
- .replace(/^https?:\/\//, "")
- .replace(/[^a-z0-9]/gi, "_")
- .toLowerCase() + ".md";
+ if (url) {
+ const filename =
+ url
+ .replace(/^https?:\/\//, "")
+ .replace(/[^a-z0-9]/gi, "_")
+ .toLowerCase() + ".md";
- const filePath = path.join(STORAGE_DIR, filename);
- const content = await fs.readFile(filePath, "utf-8");
+ const filePath = path.join(STORAGE_DIR, filename);
+ const content = await fs.readFile(filePath, "utf-8");
+
+ return NextResponse.json({ success: true, content });
+ }
- return NextResponse.json({ success: true, content });
+ return NextResponse.json({ success: false, error: "No URL provided" });
} catch (error) {
return NextResponse.json(
{