
Commit

Merge branch 'main' into nsc/concurrency-prod
nickscamara committed Jan 30, 2025
2 parents 124e58e + e2c3932 commit 425378b
Showing 127 changed files with 20,635 additions and 1,567 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/ci.yml
@@ -40,6 +40,12 @@ jobs:
- 6379:6379
steps:
- uses: actions/checkout@v3
- name: Tailscale
uses: tailscale/github-action@v3
with:
oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }}
oauth-secret: ${{ secrets.TS_OAUTH_SECRET }}
tags: tag:ci
- name: Set up Node.js
uses: actions/setup-node@v3
with:
@@ -57,7 +63,7 @@ jobs:
run: npm run workers &
working-directory: ./apps/api
id: start_workers
-      - name: Run E2E tests
+      - name: Run snippet tests
run: |
npm run test:prod
npm run test:snips
working-directory: ./apps/api
2 changes: 2 additions & 0 deletions .gitignore
@@ -38,3 +38,5 @@ apps/js-sdk/firecrawl/dist
/apps/api/debug/*

.vscode
llm-links.txt
mapped-links.txt
91 changes: 81 additions & 10 deletions README.md
@@ -72,6 +72,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and get an API key.
- [**Scrape**](#scraping): scrapes a URL and gets its content in LLM-ready format (markdown, structured data via [LLM Extract](#llm-extraction-beta), screenshot, HTML)
- [**Crawl**](#crawling): crawls all the URLs of a website and returns their content in LLM-ready format
- [**Map**](#map-alpha): input a website and get all of its URLs - extremely fast
- [**Extract**](#extract): get structured data from a single page, multiple pages, or entire websites with AI

### Powerful Capabilities
- **LLM-ready formats**: markdown, structured data, screenshot, HTML, links, metadata
@@ -240,6 +241,76 @@ Response will be an ordered list from the most relevant to the least relevant.
}
```

### Extract

Get structured data from entire websites with a prompt and/or a schema.

You can extract structured data from one or multiple URLs, including wildcards:

- **Single page**, e.g. `https://firecrawl.dev/some-page`
- **Multiple pages / full domain**, e.g. `https://firecrawl.dev/*`

When you use `/*`, Firecrawl will automatically crawl and parse all URLs it can discover in that domain, then extract the requested data.

```bash
curl -X POST https://api.firecrawl.dev/v1/extract \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"urls": [
"https://firecrawl.dev/*",
"https://docs.firecrawl.dev/",
"https://www.ycombinator.com/companies"
],
"prompt": "Extract the company mission, whether it is open source, and whether it is in Y Combinator from the page.",
"schema": {
"type": "object",
"properties": {
"company_mission": {
"type": "string"
},
"is_open_source": {
"type": "boolean"
},
"is_in_yc": {
"type": "boolean"
}
},
"required": [
"company_mission",
"supports_sso",
"is_open_source",
"is_in_yc"
]
}
}'
```

```json
{
"success": true,
"id": "44aa536d-f1cb-4706-ab87-ed0386685740",
"urlTrace": []
}
```
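
The extract endpoint is asynchronous: the response above only contains a job `id`. Here is a minimal polling sketch in Python, assuming the job status is exposed at `GET /v1/extract/{id}` and reports a `status` field (check the API reference for the exact contract):

```python
import time

import requests

API_KEY = "YOUR_API_KEY"
job_id = "44aa536d-f1cb-4706-ab87-ed0386685740"  # id returned by POST /v1/extract

# Poll until the job leaves the processing state. The endpoint path and the
# "processing" status value are assumptions based on the async response above.
while True:
    resp = requests.get(
        f"https://api.firecrawl.dev/v1/extract/{job_id}",
        headers={"Authorization": f"Bearer {API_KEY}"},
    )
    body = resp.json()
    if body.get("status") != "processing":
        break
    time.sleep(2)

print(body.get("data"))
```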

If you are using the SDKs, they handle this polling for you and return the extracted data:

```json
{
"success": true,
"data": {
"company_mission": "Firecrawl is the easiest way to extract data from the web. Developers use us to reliably convert URLs into LLM-ready markdown or structured data with a single API call.",
"supports_sso": false,
"is_open_source": true,
"is_in_yc": true
}
}
```
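
For instance, with the Python SDK (a sketch assuming `FirecrawlApp` exposes an `extract` method that accepts the same `prompt` and `schema` options as `POST /v1/extract` and polls internally):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="YOUR_API_KEY")

# extract() is assumed to mirror POST /v1/extract: same urls/prompt/schema
# inputs, with the SDK polling the job until the extracted data is ready.
result = app.extract(
    ["https://firecrawl.dev/*", "https://docs.firecrawl.dev/"],
    {
        "prompt": "Extract the company mission and whether it is open source.",
        "schema": {
            "type": "object",
            "properties": {
                "company_mission": {"type": "string"},
                "is_open_source": {"type": "boolean"},
            },
            "required": ["company_mission", "is_open_source"],
        },
    },
)
print(result["data"])
```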

### LLM Extraction (Beta)

Used to extract structured data from scraped pages.
@@ -250,8 +321,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://www.mendable.ai/",
"formats": ["extract"],
"extract": {
"formats": ["json"],
"jsonOptions": {
"schema": {
"type": "object",
"properties": {
@@ -296,7 +367,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
"ogSiteName": "Mendable",
"sourceURL": "https://mendable.ai/"
},
"llm_extraction": {
"json": {
"company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
"supports_sso": true,
"is_open_source": false,
@@ -316,8 +387,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://docs.firecrawl.dev/",
"formats": ["extract"],
"extract": {
"formats": ["json"],
"jsonOptions": {
"prompt": "Extract the company mission from the page."
}
}'
@@ -447,12 +518,12 @@ class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
+    'formats': ['json'],
+    'jsonOptions': {
'schema': TopArticlesSchema.model_json_schema()
}
})
print(data["extract"])
print(data["json"])
```

## Using the Node SDK
@@ -526,10 +597,10 @@ const schema = z.object({
});

const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: schema },
+  formats: ["json"],
+  jsonOptions: { schema: schema },
});

console.log(scrapeResult.data["llm_extraction"]);
console.log(scrapeResult.data["json"]);
```

## Open Source vs Cloud Offering
10 changes: 10 additions & 0 deletions apps/api/Dockerfile
@@ -27,10 +27,20 @@ RUN cd /app/sharedLibs/go-html-to-md && \
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \
chmod +x html-to-markdown.so

# Install Rust
FROM rust:1-bullseye AS rust-base
COPY sharedLibs/html-transformer /app/sharedLibs/html-transformer

# Build the Rust html-transformer shared library
RUN cd /app/sharedLibs/html-transformer && \
cargo build --release && \
chmod +x target/release/libhtml_transformer.so

FROM base
COPY --from=prod-deps /app/node_modules /app/node_modules
COPY --from=build /app /app
COPY --from=go-base /app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so
COPY --from=rust-base /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so

# Start the server by default, this can be overwritten at runtime
EXPOSE 8080
3 changes: 3 additions & 0 deletions apps/api/docker-entrypoint.sh
@@ -13,6 +13,9 @@ if [ $FLY_PROCESS_GROUP = "app" ]; then
elif [ $FLY_PROCESS_GROUP = "worker" ]; then
echo "RUNNING worker"
node --max-old-space-size=8192 dist/src/services/queue-worker.js
elif [ $FLY_PROCESS_GROUP = "index-worker" ]; then
echo "RUNNING index worker"
node --max-old-space-size=8192 dist/src/services/indexing/index-worker.js
else
echo "NO FLY PROCESS GROUP"
node --max-old-space-size=8192 dist/src/index.js
11 changes: 9 additions & 2 deletions apps/api/package.json
@@ -14,9 +14,12 @@
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
"test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth|src/scraper/scrapeURL)'",
"test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
"workers": "nodemon --exec ts-node src/services/queue-worker.ts",
"worker:production": "node dist/src/services/queue-worker.js",
"index-worker": "nodemon --exec ts-node src/services/indexing/index-worker.ts",
"index-worker:production": "node dist/src/services/indexing/index-worker.js",
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
"mongo-docker-console": "docker exec -it mongodb mongosh",
"run-example": "npx ts-node src/example.ts",
@@ -35,8 +38,10 @@
"@types/escape-html": "^1.0.4",
"@types/express": "^4.17.17",
"@types/jest": "^29.5.12",
"@types/lodash": "^4.17.14",
"@types/node": "^20.14.1",
"@types/pdf-parse": "^1.1.4",
"@types/supertest": "^6.0.2",
"body-parser": "^1.20.1",
"express": "^4.18.2",
"jest": "^29.6.3",
@@ -52,6 +57,7 @@
},
"dependencies": {
"@anthropic-ai/sdk": "^0.24.3",
"@apidevtools/json-schema-ref-parser": "^11.7.3",
"@brillout/import": "^0.2.2",
"@bull-board/api": "^5.20.5",
"@bull-board/express": "^5.20.5",
@@ -71,7 +77,7 @@
"axios": "^1.3.4",
"axios-retry": "^4.5.0",
"bottleneck": "^2.19.5",
"bullmq": "^5.11.0",
"bullmq": "^5.36.0",
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12",
"cohere": "^1.1.1",
@@ -94,6 +100,7 @@
"koffi": "^2.9.0",
"langchain": "^0.2.8",
"languagedetect": "^2.0.0",
"lodash": "^4.17.21",
"logsnag": "^1.0.0",
"luxon": "^3.4.3",
"marked": "^14.1.2",