doc:add evaluation doc and example
Aries-ckt committed Oct 15, 2024
1 parent 8abf9d2 commit c47c472
Showing 7 changed files with 336 additions and 17 deletions.
28 changes: 28 additions & 0 deletions dbgpt/client/evaluation.py
@@ -0,0 +1,28 @@
"""Evaluation."""
from typing import List

from dbgpt.core.schema.api import Result

from ..core.interface.evaluation import EvaluationResult
from ..serve.evaluate.api.schemas import EvaluateServeRequest
from .client import Client, ClientException


async def run_evaluation(
    client: Client, request: EvaluateServeRequest
) -> List[EvaluationResult]:
    """Run evaluation.

    Args:
        client (Client): The dbgpt client.
        request (EvaluateServeRequest): The Evaluate Request.
    """
    try:
        res = await client.post("/evaluate/evaluation", request.dict())
        result: Result = res.json()
        if result["success"]:
            return list(result["data"])
        else:
            raise ClientException(status=result["err_code"], reason=result)
    except Exception as e:
        raise ClientException(f"Failed to run evaluation: {e}")
10 changes: 1 addition & 9 deletions dbgpt/serve/evaluate/api/endpoints.py
@@ -2,15 +2,7 @@
from functools import cache
from typing import List, Optional

-from fastapi import (
-    APIRouter,
-    Depends,
-    File,
-    Form,
-    HTTPException,
-    Query,
-    UploadFile,
-)
+from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse, StreamingResponse
from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer

5 changes: 1 addition & 4 deletions dbgpt/serve/evaluate/models/models_dataset.py
@@ -5,10 +5,7 @@

from dbgpt.storage.metadata import BaseDao, Model, db

-from ..api.schemas import (
-    DatasetServeRequest,
-    DatasetServeResponse,
-)
+from ..api.schemas import DatasetServeRequest, DatasetServeResponse
from ..config import SERVER_APP_TABLE_NAME, ServeConfig


12 changes: 8 additions & 4 deletions dbgpt/serve/evaluate/service/service_dataset.py
@@ -82,8 +82,10 @@ async def upload_content_dataset(
         datasets_df = pd.DataFrame(datasets_dicts)

         if EVALUATE_FILE_COL_QUESTION not in datasets_df.columns:
-            raise ValueError(f"cannot be recognized and columns are missing "
-                             f"{EVALUATE_FILE_COL_QUESTION}")
+            raise ValueError(
+                f"cannot be recognized and columns are missing "
+                f"{EVALUATE_FILE_COL_QUESTION}"
+            )

         have_answer = False
         if EVALUATE_FILE_COL_ANSWER in datasets_df.columns:
@@ -211,8 +213,10 @@ async def get_dataset_json_record(
                     encoding=encoding,
                 )
             else:
-                raise ValueError(f"Evaluate does not support the current file "
-                                 f"type {dataset_info.file_type}.")
+                raise ValueError(
+                    f"Evaluate does not support the current file "
+                    f"type {dataset_info.file_type}."
+                )

             return dataset_info, df_tmp.to_dict(orient="records")
         elif dataset_info.storage_type == DatasetStorageType.DB.value:
205 changes: 205 additions & 0 deletions docs/docs/api/evaluation.md
@@ -0,0 +1,205 @@
# Evaluation

Get started with the Evaluation API


### Create Evaluation

```python
POST /api/v2/serve/evaluate/evaluation
```
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

<Tabs
defaultValue="curl_evaluation"
groupId="chat1"
values={[
{label: 'Curl', value: 'curl_evaluation'},
{label: 'Python', value: 'python_evaluation'},
]
}>

<TabItem value="curl_evaluation">

```shell
DBGPT_API_KEY=dbgpt

curl -X POST "http://localhost:5670/api/v2/serve/evaluate/evaluation" \
    -H "Authorization: Bearer $DBGPT_API_KEY" \
    -H "accept: application/json" \
    -H "Content-Type: application/json" \
    -d '{
        "scene_key": "recall",
        "scene_value": "147",
        "context": {"top_k": 5},
        "sys_code": "xx",
        "evaluate_metrics": ["RetrieverHitRateMetric", "RetrieverMRRMetric", "RetrieverSimilarityMetric"],
        "datasets": [{
            "query": "what awel talked about",
            "doc_name": "awel.md"
        }]
    }'

```
</TabItem>

<TabItem value="python_evaluation">


```python
from dbgpt.client import Client
from dbgpt.client.evaluation import run_evaluation
from dbgpt.serve.evaluate.api.schemas import EvaluateServeRequest

DBGPT_API_KEY = "dbgpt"
client = Client(api_key=DBGPT_API_KEY)
request = EvaluateServeRequest(
    # The scene type of the evaluation, e.g. "app" or "recall"
    scene_key="recall",
    # e.g. the app id (when scene_key is "app") or the space id (when scene_key is "recall")
    scene_value="147",
    context={"top_k": 5},
    evaluate_metrics=[
        "RetrieverHitRateMetric",
        "RetrieverMRRMetric",
        "RetrieverSimilarityMetric",
    ],
    datasets=[
        {
            "query": "what awel talked about",
            "doc_name": "awel.md",
        }
    ],
)
data = await run_evaluation(client, request=request)

```

</TabItem>
</Tabs>

#### Request body
Request <a href="#the-evaluation-request-object">The Evaluation Request Object</a>

When `scene_key` is `app`, the request body should look like this:
```json
{
    "scene_key": "app",
    "scene_value": "2c76eea2-83b6-11ef-b482-acde48001122",
    "context": {"top_k": 5, "prompt": "942acd7e33b54ce28565f89f9b278044", "model": "zhipu_proxyllm"},
    "sys_code": "xx",
    "evaluate_metrics": ["AnswerRelevancyMetric"],
    "datasets": [{
        "query": "what awel talked about",
        "doc_name": "awel.md"
    }]
}
```

When `scene_key` is `recall`, the request body should look like this:
```json
{
    "scene_key": "recall",
    "scene_value": "2c76eea2-83b6-11ef-b482-acde48001122",
    "context": {"top_k": 5, "prompt": "942acd7e33b54ce28565f89f9b278044", "model": "zhipu_proxyllm"},
    "evaluate_metrics": ["RetrieverHitRateMetric", "RetrieverMRRMetric", "RetrieverSimilarityMetric"],
    "datasets": [{
        "query": "what awel talked about",
        "doc_name": "awel.md"
    }]
}
```

#### Response body
Returns a list of <a href="#the-evaluation-result">Evaluation Result</a> objects.
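
A minimal sketch of consuming the returned results (an illustration only: it reuses `client` and `request` from the Python example above and assumes each item exposes the fields documented in <a href="#the-evaluation-result">The Evaluation Result</a>; depending on the server version the data may be nested per metric):

```python
# Illustrative post-processing of the evaluation results. Field names follow
# "The Evaluation Result" section below; adjust if your server returns a
# nested structure (e.g. one list per metric).
data = await run_evaluation(client, request=request)

for item in data:
    # EvaluationResult may be a pydantic model or a plain dict; normalize to a dict.
    record = item.dict() if hasattr(item, "dict") else dict(item)
    print(
        f"query={record.get('query')!r} "
        f"metric={record.get('metric_name')} "
        f"score={record.get('score')} "
        f"passing={record.get('passing')}"
    )
```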


### The Evaluation Request Object

________
<b>scene_key</b> <font color="gray"> string </font> <font color="red"> Required </font>

The scene type of the evaluation. Supported values are `app` and `recall`.

--------
<b>scene_value</b> <font color="gray"> string </font> <font color="red"> Required </font>

The scene value of the evaluation, e.g. the app id (when `scene_key` is `app`) or the space id (when `scene_key` is `recall`)

--------
<b>context</b> <font color="gray"> object </font> <font color="red"> Required </font>

The context of the evaluation
- top_k <font color="gray"> int </font> <font color="red"> Required </font>
- prompt <font color="gray"> string </font> The prompt code
- model <font color="gray"> string </font> The LLM model name

--------
<b>evaluate_metrics</b> <font color="gray"> array </font> <font color="red"> Required </font>

The evaluation metrics to run (a short illustrative sketch of the retrieval metrics follows this list),
e.g.
- <b>AnswerRelevancyMetric</b>: the answer relevancy metric (when scene_key is app)
- <b>RetrieverHitRateMetric</b>: Hit rate calculates the fraction of queries where the correct answer is found
within the top-k retrieved documents. In simpler terms, it’s about how often our
system gets it right within the top few guesses. (when scene_key is recall)
- <b>RetrieverMRRMetric</b>: For each query, MRR evaluates the system’s accuracy by looking at the rank of the
highest-placed relevant document. Specifically, it’s the average of the reciprocals
of these ranks across all the queries. So, if the first relevant document is the
top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2,
and so on. (when scene_key is recall)
- <b>RetrieverSimilarityMetric</b>: Embedding Similarity Metric (when scene_key is recall)
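
As a rough illustration of the retrieval metrics above, here is a standalone sketch of the standard hit-rate and MRR definitions (not DB-GPT's exact implementation):

```python
# Illustrative only: standard hit rate and MRR computed from the 1-based rank of
# the first relevant document per query (None means it was not retrieved at all).
def hit_rate(ranks, k=5):
    """Fraction of queries whose relevant document appears in the top-k results."""
    return sum(1 for r in ranks if r is not None and r <= k) / len(ranks)


def mrr(ranks):
    """Mean reciprocal rank: average of 1/rank over all queries (0 when missing)."""
    return sum(1.0 / r if r is not None else 0.0 for r in ranks) / len(ranks)


ranks = [1, 2, None, 5]       # ranks of the first relevant doc for four queries
print(hit_rate(ranks, k=5))   # 0.75  -> 3 of 4 queries hit within the top 5
print(mrr(ranks))             # 0.425 -> (1 + 1/2 + 0 + 1/5) / 4
```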

--------
<b>datasets</b> <font color="gray"> array </font> <font color="red"> Required </font>

The datasets to evaluate. Each item is an object; in the examples above it contains a `query` and, for the recall scene, the `doc_name` of the expected document. A sketch of building this list from a local file follows.
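
A hedged sketch of assembling the `datasets` list from a local CSV file (the file path and column names here are illustrative assumptions, not part of the API):

```python
import csv

# Build the `datasets` payload from a local CSV that has "query" and "doc_name"
# columns. Both the path and the column names are assumptions for illustration.
datasets = []
with open("recall_dataset.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        datasets.append({"query": row["query"], "doc_name": row["doc_name"]})
```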


--------


### The Evaluation Result

________
<b>prediction</b> <font color="gray">string</font>

The prediction result
________
<b>contexts</b> <font color="gray">string</font>

The context chunks returned by the RAG retriever
________
<b>score</b> <font color="gray">float</font>

The score of the prediction
________
<b>passing</b> <font color="gray">bool</font>

Whether the prediction passes the evaluation
________
<b>metric_name</b> <font color="gray">string</font>

The metric name of the evaluation
________
<b>prediction_cost</b> <font color="gray">int</font>

The prediction cost of the evaluation
________
<b>query</b> <font color="gray">string</font>

The query of the evaluation
________
<b>raw_dataset</b> <font color="gray">object</font>

The raw dataset of the evaluation
________
<b>feedback</b> <font color="gray">string</font>

The feedback from the LLM evaluation
________
3 changes: 3 additions & 0 deletions docs/sidebars.js
@@ -421,6 +421,9 @@ const sidebars = {
       },{
         type: 'doc',
         id: 'api/datasource'
+      },{
+        type: 'doc',
+        id: 'api/evaluation'
       },
     ],
link: {
90 changes: 90 additions & 0 deletions examples/client/client_evaluation.py
@@ -0,0 +1,90 @@
"""Client: run evaluation example.
This example demonstrates how to use the dbgpt client to evaluate with the rag recall
and app answer.
Example:
.. code-block:: python
DBGPT_API_KEY = "dbgpt"
client = Client(api_key=DBGPT_API_KEY)
# 1. evaluate with rag recall
request = EvaluateServeRequest(
# The scene type of the evaluation, e.g. support app, recall
scene_key="recall",
# e.g. app id(when scene_key is app), space id(when scene_key is recall)
scene_value="147",
context={"top_k": 5},
evaluate_metrics=[
"RetrieverHitRateMetric",
"RetrieverMRRMetric",
"RetrieverSimilarityMetric",
],
datasets=[
{
"query": "what awel talked about",
"doc_name": "awel.md",
}
],
)
# 2. evaluate with app answer
request = EvaluateServeRequest(
# The scene type of the evaluation, e.g. support app, recall
scene_key="app",
# e.g. app id(when scene_key is app), space id(when scene_key is recall)
scene_value="2c76eea2-83b6-11ef-b482-acde48001122",
"context"={
"top_k": 5,
"prompt": "942acd7e33b54ce28565f89f9b278044",
"model": "zhipu_proxyllm",
},
evaluate_metrics=[
"AnswerRelevancyMetric",
],
datasets=[
{
"query": "what awel talked about",
"doc_name": "awel.md",
}
],
)
data = await run_evaluation(client, request=request)
print(data)
"""

import asyncio

from dbgpt.client import Client
from dbgpt.client.evaluation import run_evaluation
from dbgpt.serve.evaluate.api.schemas import EvaluateServeRequest


async def main():
    # initialize client
    DBGPT_API_KEY = "dbgpt"
    client = Client(api_key=DBGPT_API_KEY)
    request = EvaluateServeRequest(
        # The scene type of the evaluation, e.g. "app" or "recall"
        scene_key="recall",
        # e.g. the app id (when scene_key is "app") or the space id (when scene_key is "recall")
        scene_value="147",
        context={"top_k": 5},
        evaluate_metrics=[
            "RetrieverHitRateMetric",
            "RetrieverMRRMetric",
            "RetrieverSimilarityMetric",
        ],
        datasets=[
            {
                "query": "what awel talked about",
                "doc_name": "awel.md",
            }
        ],
    )
    data = await run_evaluation(client, request=request)
    print(data)


if __name__ == "__main__":
    asyncio.run(main())
