refactor: evaluator (#66)

* refactor: evaluator * refactor: evaluator * refactor: evaluator * refactor: evaluator
jina-ai · Feb 20, 2025 · 3c0dffc · 3c0dffc
1 parent 40ee042
commit 3c0dffc
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 143 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # DeepResearch
 
-[UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [API](https://jina.ai/deepsearch) | [Evaluation](#evaluation)
+[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Official API](https://jina.ai/deepsearch) | [Evaluation](#evaluation)
 
 Keep searching, reading webpages, reasoning until an answer is found (or the token budget is exceeded). Useful for deeply investigating a query.
 
@@ -54,15 +54,21 @@ export JINA_API_KEY=jina_...  # free jina api key, get from https://jina.ai/read
 npm run dev $QUERY
 ```
 
+### Official Site
+
+You can try it on [our official site](https://search.jina.ai).
+
 ### Official API
 
-You can also use our official DeepSearch API, hosted and optimized by Jina AI:
+You can also use [our official DeepSearch API](https://jina.ai/deepsearch):
 
 ```
 https://deepsearch.jina.ai/v1/chat/completions
 ```
 
-You can use it with any OpenAI-compatible client. For the authentication Bearer, get your Jina API key from https://jina.ai
+You can use it with any OpenAI-compatible client. 
+
+For the authentication Bearer, API key, rate limit, get from https://jina.ai/deepsearch.
 
 #### Client integration guidelines
 

diff --git a/src/agent.ts b/src/agent.ts
@@ -576,7 +576,7 @@ But then you realized you have asked them before. You decided to to think out of
       let {queries: keywordsQueries} = await rewriteQuery(thisStep, context.tokenTracker);
 
       // add the original query before rewrite to the keywordsQueries
-      keywordsQueries.push(thisStep.searchQuery)
+      keywordsQueries.push(question)
 
       const oldKeywords = keywordsQueries;
       // avoid exisitng searched queries

diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts
@@ -21,9 +21,7 @@ const freshnessSchema = z.object({
   ...baseSchema,
   type: z.literal('freshness'),
   freshness_analysis: z.object({
-    likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
-    dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
-    current_time: z.string().describe('Current system time when evaluation was performed'),
+    days_ago: z.number().describe('Inferred dates or timeframes mentioned in the answer and relative to the current time'),
     max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
   })
 });
@@ -32,8 +30,6 @@ const pluralitySchema = z.object({
   ...baseSchema,
   type: z.literal('plurality'),
   plurality_analysis: z.object({
-    expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
-    provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
     count_expected: z.number().optional().describe('Number of items expected if specified in question'),
     count_provided: z.number().describe('Number of items provided in answer')
   })
@@ -162,69 +158,61 @@ Answer: ${JSON.stringify(answer)}`;
 }
 
 function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
-  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
+  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates (or implied datetime) and current system time: ${currentTime}
 
 <rules>
-1. Date Analysis:
-   - Extract all dates mentioned in the answer
-   - Compare against current system time: ${currentTime}
-   - Consider content outdated if:
-     * It refers to a "latest" or "current" state from more than 30 days ago
-     * It mentions specific dates/events that have been superseded
-     * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
-   - For product versions, releases, or announcements, max age is 30 days
-   - For company positions, leadership, or general facts, max age is 90 days
-
-2. Context Hints:
-   - Words indicating recency: "latest", "current", "newest", "just released", "recently"
-   - Time-sensitive terms: "CEO", "price", "version", "release"
-   - Future dates should be ignored in outdated calculation
+Question-Answer Freshness Checker Guidelines
+
+# Revised QA Type Maximum Age Table
+
+| QA Type                  | Max Age (Days) | Notes                                                                 |
+|--------------------------|--------------|-----------------------------------------------------------------------|
+| Financial Data (Real-time)| 0.1        | Stock prices, exchange rates, crypto (real-time preferred)             |
+| Breaking News            | 1           | Immediate coverage of major events                                     |
+| News/Current Events      | 1           | Time-sensitive news, politics, or global events                        |
+| Weather Forecasts        | 1           | Accuracy drops significantly after 24 hours                            |
+| Sports Scores/Events     | 1           | Live updates required for ongoing matches                              |
+| Security Advisories      | 1           | Critical security updates and patches                                  |
+| Social Media Trends      | 1           | Viral content, hashtags, memes                                         |
+| Cybersecurity Threats    | 7           | Rapidly evolving vulnerabilities/patches                               |
+| Tech News                | 7           | Technology industry updates and announcements                          |
+| Political Developments   | 7           | Legislative changes, political statements                              |
+| Political Elections      | 7           | Poll results, candidate updates                                        |
+| Sales/Promotions         | 7           | Limited-time offers and marketing campaigns                            |
+| Travel Restrictions      | 7           | Visa rules, pandemic-related policies                                  |
+| Entertainment News       | 14          | Celebrity updates, industry announcements                              |
+| Product Launches         | 14          | New product announcements and releases                                 |
+| Market Analysis          | 14          | Market trends and competitive landscape                                |
+| Competitive Intelligence | 21          | Analysis of competitor activities and market position                  |
+| Product Recalls          | 30          | Safety alerts or recalls from manufacturers                            |
+| Industry Reports         | 30          | Sector-specific analysis and forecasting                               |
+| Software Version Info    | 30          | Updates, patches, and compatibility information                        |
+| Legal/Regulatory Updates | 30          | Laws, compliance rules (jurisdiction-dependent)                        |
+| Economic Forecasts       | 30          | Macroeconomic predictions and analysis                                 |
+| Consumer Trends          | 45          | Shifting consumer preferences and behaviors                            |
+| Scientific Discoveries   | 60          | New research findings and breakthroughs (includes all scientific research) |
+| Healthcare Guidelines    | 60          | Medical recommendations and best practices (includes medical guidelines)|
+| Environmental Reports    | 60          | Climate and environmental status updates                               |
+| Best Practices           | 90          | Industry standards and recommended procedures                          |
+| API Documentation        | 90          | Technical specifications and implementation guides                     |
+| Tutorial Content         | 180         | How-to guides and instructional materials (includes educational content)|
+| Tech Product Info        | 180         | Product specs, release dates, or pricing                               |
+| Statistical Data         | 180         | Demographic and statistical information                                |
+| Reference Material       | 180         | General reference information and resources                            |
+| Historical Content       | 365         | Events and information from the past year                              |
+| Cultural Trends          | 730         | Shifts in language, fashion, or social norms                           |
+| Entertainment Releases   | 730         | Movie/TV show schedules, media catalogs                                |
+| Factual Knowledge        | ∞           | Static facts (e.g., historical events, geography, physical constants)   |
+
+### Implementation Notes:
+1. **Contextual Adjustment**: Freshness requirements may change during crises or rapid developments in specific domains.
+2. **Tiered Approach**: Consider implementing urgency levels (critical, important, standard) alongside age thresholds.
+3. **User Preferences**: Allow customization of thresholds for specific query types or user needs.
+4. **Source Reliability**: Pair freshness metrics with source credibility scores for better quality assessment.
+5. **Domain Specificity**: Some specialized fields (medical research during pandemics, financial data during market volatility) may require dynamically adjusted thresholds.
+6. **Geographic Relevance**: Regional considerations may alter freshness requirements for local regulations or events.
 </rules>
 
-<examples>
-Question: "What was Jina AI's closing stock price yesterday?"
-Answer: "Jina AI's stock closed at $45.30 per share at yesterday's market close."
-Current Time: "2024-03-07T14:30:00Z"
-Evaluation: {
-  "pass": true,
-  "think": "The question specifically asks for yesterday's closing price, and the answer provides exactly that information. Since it's asking for a historical data point rather than current price, yesterday's closing price is the correct timeframe.",
-  "freshness_analysis": {
-    "likely_outdated": false,
-    "dates_mentioned": ["2024-03-06"],
-    "current_time": "2024-03-07T14:30:00Z",
-    "max_age_days": 1
-  }
-}
-
-Question: "What is Jina AI's latest embedding model?"
-Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
-Current Time: "2024-10-06T00:00:00Z"
-Evaluation: {
-  "pass": false,
-  "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
-  "freshness_analysis": {
-    "likely_outdated": true,
-    "dates_mentioned": ["2024-03-15"],
-    "current_time": "2024-10-06T00:00:00Z",
-    "max_age_days": 30
-  }
-}
-
-Question: "Who is OpenAI's CEO?"
-Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
-Current Time: "2024-02-06T00:00:00Z"
-Evaluation: {
-  "pass": true,
-  "think": "The answer is about company leadership and is within the 60-day threshold for such information",
-  "freshness_analysis": {
-    "likely_outdated": false,
-    "dates_mentioned": ["2023-12"],
-    "current_time": "2024-02-06T00:00:00Z",
-    "max_age_days": 90
-  }
-}
-</examples>
-
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;
@@ -234,77 +222,38 @@ function getPluralityPrompt(question: string, answer: string): string {
   return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
 
 <rules>
-1. Question Analysis:
-   - Check if question asks for multiple items using indicators like:
-     * Plural nouns: "companies", "people", "names"
-     * Quantifiers: "all", "many", "several", "various", "multiple"
-     * List requests: "list", "enumerate", "name all", "give me all"
-     * Numbers: "5 examples", "top 10"
-   - Otherwise skip the analysis and return pass to true
-
-2. Answer Analysis:
-   - Count distinct items provided in the answer
-   - Check if answer uses limiting words like "only", "just", "single"
-   - Identify if answer acknowledges there are more items but only provides some
-
-3. Definitiveness Rules:
-   - If question asks for multiple items but answer provides only one → NOT definitive
-   - If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive
-   - If answer clearly states it's providing a partial list → NOT definitive
-   - If question asks for "all" or "every" but answer seems incomplete → NOT definitive
+Question Type Reference Table
+
+| Question Type | Expected Items | Evaluation Rules |
+|---------------|----------------|------------------|
+| Explicit Count | Exact match to number specified | Provide exactly the requested number of distinct, non-redundant items relevant to the query. |
+| Numeric Range | Any number within specified range | Ensure count falls within given range with distinct, non-redundant items. For "at least N" queries, meet minimum threshold. |
+| Implied Multiple | ≥ 2 | Provide multiple items (typically 2-4 unless context suggests more) with balanced detail and importance. |
+| "Few" | 2-4 | Offer 2-4 substantive items prioritizing quality over quantity. |
+| "Several" | 3-7 | Include 3-7 items with comprehensive yet focused coverage, each with brief explanation. |
+| "Many" | 7+ | Present 7+ items demonstrating breadth, with concise descriptions per item. |
+| "Most important" | Top 3-5 by relevance | Prioritize by importance, explain ranking criteria, and order items by significance. |
+| "Top N" | Exactly N, ranked | Provide exactly N items ordered by importance/relevance with clear ranking criteria. |
+| "Pros and Cons" | ≥ 2 of each category | Present balanced perspectives with at least 2 items per category addressing different aspects. |
+| "Compare X and Y" | ≥ 3 comparison points | Address at least 3 distinct comparison dimensions with balanced treatment covering major differences/similarities. |
+| "Steps" or "Process" | All essential steps | Include all critical steps in logical order without missing dependencies. |
+| "Examples" | ≥ 3 unless specified | Provide at least 3 diverse, representative, concrete examples unless count specified. |
+| "Comprehensive" | 10+ | Deliver extensive coverage (10+ items) across major categories/subcategories demonstrating domain expertise. |
+| "Brief" or "Quick" | 1-3 | Present concise content (1-3 items) focusing on most important elements described efficiently. |
+| "Complete" | All relevant items | Provide exhaustive coverage within reasonable scope without major omissions, using categorization if needed. |
+| "Thorough" | 7-10 | Offer detailed coverage addressing main topics and subtopics with both breadth and depth. |
+| "Overview" | 3-5 | Cover main concepts/aspects with balanced coverage focused on fundamental understanding. |
+| "Summary" | 3-5 key points | Distill essential information capturing main takeaways concisely yet comprehensively. |
+| "Main" or "Key" | 3-7 | Focus on most significant elements fundamental to understanding, covering distinct aspects. |
+| "Essential" | 3-7 | Include only critical, necessary items without peripheral or optional elements. |
+| "Basic" | 2-5 | Present foundational concepts accessible to beginners focusing on core principles. |
+| "Detailed" | 5-10 with elaboration | Provide in-depth coverage with explanations beyond listing, including specific information and nuance. |
+| "Common" | 4-8 most frequent | Focus on typical or prevalent items, ordered by frequency when possible, that are widely recognized. |
+| "Primary" | 2-5 most important | Focus on dominant factors with explanation of their primacy and outsized impact. |
+| "Secondary" | 3-7 supporting items | Present important but not critical items that complement primary factors and provide additional context. |
+| Unspecified Analysis | 3-5 key points | Default to 3-5 main points covering primary aspects with balanced breadth and depth. |
 </rules>
 
-<examples>
-Question: "Who works in Jina AI's sales team?"
-Answer: "John Smith is a sales representative at Jina AI."
-Evaluation: {
-  "pass": true,
-  "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
-  "plurality_analysis": {
-    "expects_multiple": false,
-    "provides_multiple": false,
-    "count_provided": 1
-  }
-}
-
-Question: "List all the salespeople who work at Jina AI"
-Answer: "John Smith is a sales representative at Jina AI."
-Evaluation: {
-  "pass": false,
-  "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
-  "plurality_analysis": {
-    "expects_multiple": true,
-    "provides_multiple": false,
-    "count_provided": 1
-  }
-}
-
-Question: "Name the top 3 products sold by Jina AI"
-Answer: "Jina AI's product lineup includes DocArray and Jina."
-Evaluation: {
-  "pass": false,
-  "think": "The question asks for top 3 products but only 2 are provided.",
-  "plurality_analysis": {
-    "expects_multiple": true,
-    "provides_multiple": true,
-    "count_expected": 3,
-    "count_provided": 2
-  }
-}
-
-Question: "List as many AI companies in Berlin as you can find"
-Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
-Evaluation: {
-  "pass": false,
-  "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
-  "plurality_analysis": {
-    "expects_multiple": true,
-    "provides_multiple": true,
-    "count_provided": 5
-  }
-}
-</examples>
-
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;

diff --git a/src/tools/read.ts b/src/tools/read.ts
@@ -23,7 +23,8 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response
         'Content-Type': 'application/json',
         'Content-Length': data.length,
         'X-Retain-Images': 'none',
-        'X-Return-Format': 'markdown'
+        'X-Return-Format': 'markdown',
+        'X-Engine': 'direct'
       }
     };
 

diff --git a/src/types.ts b/src/types.ts
@@ -143,14 +143,10 @@ export type EvaluationResponse = {
   think: string;
   type?: 'definitive' | 'freshness' | 'plurality' | 'attribution';
   freshness_analysis?: {
-    likely_outdated: boolean;
-    dates_mentioned: string[];
-    current_time: string;
+    days_ago: number;
     max_age_days?: number;
   };
   plurality_analysis?: {
-    expects_multiple: boolean;
-    provides_multiple: boolean;
     count_expected?: number;
     count_provided: number;
   };