|
25 | 25 | "nest_asyncio.apply()"
|
26 | 26 | ]
|
27 | 27 | },
|
| 28 | + { |
| 29 | + "cell_type": "code", |
| 30 | + "execution_count": 2, |
| 31 | + "id": "8333f65e", |
| 32 | + "metadata": {}, |
| 33 | + "outputs": [], |
| 34 | + "source": [ |
| 35 | + "%load_ext autoreload\n", |
| 36 | + "%autoreload 2" |
| 37 | + ] |
| 38 | + }, |
28 | 39 | {
|
29 | 40 | "cell_type": "markdown",
|
30 | 41 | "id": "842e32dc",
|
|
35 | 46 | },
|
36 | 47 | {
|
37 | 48 | "cell_type": "code",
|
38 |
| - "execution_count": 2, |
| 49 | + "execution_count": 3, |
39 | 50 | "id": "4aa9a986",
|
40 | 51 | "metadata": {},
|
41 | 52 | "outputs": [],
|
|
51 | 62 | "\n",
|
52 | 63 | "llm = ChatOpenAI()\n",
|
53 | 64 | "qa_chain = RetrievalQA.from_chain_type(\n",
|
54 |
| - " llm, retriever=index.vectorstore.as_retriever(), return_source_documents=True\n", |
| 65 | + " llm, retriever=index.vectorstore.as_retriever(), return_source_documents=True,\n", |
55 | 66 | ")"
|
56 | 67 | ]
|
57 | 68 | },
|
58 | 69 | {
|
59 | 70 | "cell_type": "code",
|
60 |
| - "execution_count": 3, |
| 71 | + "execution_count": 4, |
61 | 72 | "id": "b0ebdf8d",
|
62 | 73 | "metadata": {},
|
63 | 74 | "outputs": [
|
64 | 75 | {
|
65 | 76 | "data": {
|
66 | 77 | "text/plain": [
|
67 |
| - "'New York City was named in honor of the Duke of York, who would become King James II of England. King Charles II appointed the Duke as proprietor of the former territory of New Netherland, including the city of New Amsterdam, when England seized it from Dutch control.'" |
| 78 | + "'New York City got its name in 1664 when it was renamed after the Duke of York, who later became King James II of England. The city was originally called New Amsterdam by Dutch colonists and was renamed New York when it came under British control.'" |
68 | 79 | ]
|
69 | 80 | },
|
70 |
| - "execution_count": 3, |
| 81 | + "execution_count": 4, |
71 | 82 | "metadata": {},
|
72 | 83 | "output_type": "execute_result"
|
73 | 84 | }
|
|
90 | 101 | },
|
91 | 102 | {
|
92 | 103 | "cell_type": "code",
|
93 |
| - "execution_count": 4, |
| 104 | + "execution_count": 5, |
94 | 105 | "id": "e67ce0e0",
|
95 | 106 | "metadata": {},
|
96 | 107 | "outputs": [],
|
|
103 | 114 | " \"What is the significance of the Statue of Liberty in New York City?\",\n",
|
104 | 115 | "]\n",
|
105 | 116 | "\n",
|
106 |
| - "queries = [{\"query\": q} for q in eval_questions]" |
| 117 | + "eval_answers = [\n", |
| 118 | + " \"8,804,000\", # incorrect answer\n", |
| 119 | + " \"Queens\", # incorrect answer\n", |
| 120 | + " \"New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.\",\n", |
| 121 | + " \"New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.\",\n", |
| 122 | + " 'The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.',\n", |
| 123 | + "]\n", |
| 124 | + "\n", |
| 125 | + "examples = [{\"query\": q, \"ground_truths\": [eval_answers[i]]} \n", |
| 126 | + " for i, q in enumerate(eval_questions)]" |
107 | 127 | ]
|
108 | 128 | },
|
109 | 129 | {
|
|
126 | 146 | },
|
127 | 147 | {
|
128 | 148 | "cell_type": "code",
|
129 |
| - "execution_count": 5, |
| 149 | + "execution_count": 10, |
| 150 | + "id": "8f89d719", |
| 151 | + "metadata": {}, |
| 152 | + "outputs": [ |
| 153 | + { |
| 154 | + "data": { |
| 155 | + "text/plain": [ |
| 156 | + "'The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.'" |
| 157 | + ] |
| 158 | + }, |
| 159 | + "execution_count": 10, |
| 160 | + "metadata": {}, |
| 161 | + "output_type": "execute_result" |
| 162 | + } |
| 163 | + ], |
| 164 | + "source": [ |
| 165 | + "result = qa_chain({\"query\": eval_questions[4]})\n", |
| 166 | + "result[\"result\"]" |
| 167 | + ] |
| 168 | + }, |
| 169 | + { |
| 170 | + "cell_type": "code", |
| 171 | + "execution_count": 16, |
| 172 | + "id": "81fa9c47", |
| 173 | + "metadata": {}, |
| 174 | + "outputs": [ |
| 175 | + { |
| 176 | + "data": { |
| 177 | + "text/plain": [ |
| 178 | + "'The borough of Brooklyn (Kings County) has the highest population in New York City.'" |
| 179 | + ] |
| 180 | + }, |
| 181 | + "execution_count": 16, |
| 182 | + "metadata": {}, |
| 183 | + "output_type": "execute_result" |
| 184 | + } |
| 185 | + ], |
| 186 | + "source": [ |
| 187 | + "result = qa_chain(examples[1])\n", |
| 188 | + "result[\"result\"]" |
| 189 | + ] |
| 190 | + }, |
| 191 | + { |
| 192 | + "cell_type": "code", |
| 193 | + "execution_count": 8, |
130 | 194 | "id": "1d9266d4",
|
131 | 195 | "metadata": {},
|
132 | 196 | "outputs": [],
|
133 | 197 | "source": [
|
134 | 198 | "from ragas.langchain.evalchain import RagasEvaluatorChain\n",
|
135 |
| - "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy\n", |
| 199 | + "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy, context_recall\n", |
136 | 200 | "\n",
|
137 | 201 | "# create evaluation chains\n",
|
138 | 202 | "faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)\n",
|
139 | 203 | "answer_rel_chain = RagasEvaluatorChain(metric=answer_relevancy)\n",
|
140 |
| - "context_rel_chain = RagasEvaluatorChain(metric=context_relevancy)" |
| 204 | + "context_rel_chain = RagasEvaluatorChain(metric=context_relevancy)\n", |
| 205 | + "context_recall_chain = RagasEvaluatorChain(metric=context_recall)" |
141 | 206 | ]
|
142 | 207 | },
|
143 | 208 | {
|
|
152 | 217 | },
|
153 | 218 | {
|
154 | 219 | "cell_type": "code",
|
155 |
| - "execution_count": 6, |
| 220 | + "execution_count": 17, |
156 | 221 | "id": "5ede32cd",
|
157 | 222 | "metadata": {},
|
158 | 223 | "outputs": [
|
159 | 224 | {
|
160 | 225 | "data": {
|
161 | 226 | "text/plain": [
|
162 |
| - "1.0" |
| 227 | + "0.5" |
163 | 228 | ]
|
164 | 229 | },
|
165 |
| - "execution_count": 6, |
| 230 | + "execution_count": 17, |
166 | 231 | "metadata": {},
|
167 | 232 | "output_type": "execute_result"
|
168 | 233 | }
|
|
172 | 237 | "eval_result[\"faithfulness_score\"]"
|
173 | 238 | ]
|
174 | 239 | },
|
| 240 | + { |
| 241 | + "cell_type": "code", |
| 242 | + "execution_count": 18, |
| 243 | + "id": "94b5544e", |
| 244 | + "metadata": {}, |
| 245 | + "outputs": [ |
| 246 | + { |
| 247 | + "data": { |
| 248 | + "text/plain": [ |
| 249 | + "0.0" |
| 250 | + ] |
| 251 | + }, |
| 252 | + "execution_count": 18, |
| 253 | + "metadata": {}, |
| 254 | + "output_type": "execute_result" |
| 255 | + } |
| 256 | + ], |
| 257 | + "source": [ |
| 258 | + "eval_result = context_recall_chain(result)\n", |
| 259 | + "eval_result[\"context_recall_score\"]" |
| 260 | + ] |
| 261 | + }, |
175 | 262 | {
|
176 | 263 | "cell_type": "markdown",
|
177 | 264 | "id": "f11295b5",
|
|
184 | 271 | },
|
185 | 272 | {
|
186 | 273 | "cell_type": "code",
|
187 |
| - "execution_count": 7, |
| 274 | + "execution_count": 24, |
188 | 275 | "id": "1ce7bff1",
|
189 | 276 | "metadata": {},
|
190 | 277 | "outputs": [
|
|
199 | 286 | "name": "stderr",
|
200 | 287 | "output_type": "stream",
|
201 | 288 | "text": [
|
202 |
| - "100%|█████████████████████████████████████████████████████████████| 1/1 [00:38<00:00, 38.77s/it]\n" |
| 289 | + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:57<00:00, 57.41s/it]\n" |
203 | 290 | ]
|
204 | 291 | },
|
205 | 292 | {
|
206 | 293 | "data": {
|
207 | 294 | "text/plain": [
|
208 | 295 | "[{'faithfulness_score': 1.0},\n",
|
209 | 296 | " {'faithfulness_score': 0.5},\n",
|
210 |
| - " {'faithfulness_score': 0.75},\n", |
| 297 | + " {'faithfulness_score': 1.0},\n", |
211 | 298 | " {'faithfulness_score': 1.0},\n",
|
212 | 299 | " {'faithfulness_score': 1.0}]"
|
213 | 300 | ]
|
214 | 301 | },
|
215 |
| - "execution_count": 7, |
| 302 | + "execution_count": 24, |
216 | 303 | "metadata": {},
|
217 | 304 | "output_type": "execute_result"
|
218 | 305 | }
|
219 | 306 | ],
|
220 | 307 | "source": [
|
221 | 308 | "# run the queries as a batch for efficiency\n",
|
222 |
| - "predictions = qa_chain.batch(queries)\n", |
| 309 | + "predictions = qa_chain.batch(examples)\n", |
223 | 310 | "\n",
|
224 | 311 | "# evaluate\n",
|
225 | 312 | "print(\"evaluating...\")\n",
|
226 |
| - "r = faithfulness_chain.evaluate(queries, predictions)\n", |
| 313 | + "r = faithfulness_chain.evaluate(examples, predictions)\n", |
| 314 | + "r" |
| 315 | + ] |
| 316 | + }, |
| 317 | + { |
| 318 | + "cell_type": "code", |
| 319 | + "execution_count": 25, |
| 320 | + "id": "55299f14", |
| 321 | + "metadata": {}, |
| 322 | + "outputs": [ |
| 323 | + { |
| 324 | + "name": "stdout", |
| 325 | + "output_type": "stream", |
| 326 | + "text": [ |
| 327 | + "evaluating...\n" |
| 328 | + ] |
| 329 | + }, |
| 330 | + { |
| 331 | + "name": "stderr", |
| 332 | + "output_type": "stream", |
| 333 | + "text": [ |
| 334 | + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:54<00:00, 54.21s/it]\n" |
| 335 | + ] |
| 336 | + }, |
| 337 | + { |
| 338 | + "data": { |
| 339 | + "text/plain": [ |
| 340 | + "[{'context_recall_score': 0.9333333333333333},\n", |
| 341 | + " {'context_recall_score': 0.0},\n", |
| 342 | + " {'context_recall_score': 1.0},\n", |
| 343 | + " {'context_recall_score': 1.0},\n", |
| 344 | + " {'context_recall_score': 1.0}]" |
| 345 | + ] |
| 346 | + }, |
| 347 | + "execution_count": 25, |
| 348 | + "metadata": {}, |
| 349 | + "output_type": "execute_result" |
| 350 | + } |
| 351 | + ], |
| 352 | + "source": [ |
| 353 | + "# evaluate context recall\n", |
| 354 | + "print(\"evaluating...\")\n", |
| 355 | + "r = context_recall_chain.evaluate(examples, predictions)\n", |
227 | 356 | "r"
|
228 | 357 | ]
|
229 | 358 | },
|
|
244 | 373 | },
|
245 | 374 | {
|
246 | 375 | "cell_type": "code",
|
247 |
| - "execution_count": 8, |
| 376 | + "execution_count": 48, |
248 | 377 | "id": "e75144c5",
|
249 | 378 | "metadata": {},
|
250 | 379 | "outputs": [
|
251 | 380 | {
|
252 | 381 | "name": "stdout",
|
253 | 382 | "output_type": "stream",
|
254 | 383 | "text": [
|
255 |
| - "using existing dataset: NYC test\n" |
| 384 | + "Created a new dataset: NYC test\n" |
256 | 385 | ]
|
257 | 386 | }
|
258 | 387 | ],
|
|
274 | 403 | " dataset = client.create_dataset(\n",
|
275 | 404 | " dataset_name=dataset_name, description=\"NYC test dataset\"\n",
|
276 | 405 | " )\n",
|
277 |
| - " for q in eval_questions:\n", |
| 406 | + " for e in examples:\n", |
278 | 407 | " client.create_example(\n",
|
279 |
| - " inputs={\"query\": q},\n", |
| 408 | + " inputs={\"query\": e[\"query\"]},\n", |
| 409 | + " outputs={\"ground_truths\": e[\"ground_truths\"]},\n", |
280 | 410 | " dataset_id=dataset.id,\n",
|
281 | 411 | " )\n",
|
282 | 412 | "\n",
|
|
297 | 427 | },
|
298 | 428 | {
|
299 | 429 | "cell_type": "code",
|
300 |
| - "execution_count": 9, |
| 430 | + "execution_count": 27, |
301 | 431 | "id": "3a6decc6",
|
302 | 432 | "metadata": {},
|
303 | 433 | "outputs": [],
|
|
322 | 452 | },
|
323 | 453 | {
|
324 | 454 | "cell_type": "code",
|
325 |
| - "execution_count": 10, |
| 455 | + "execution_count": 49, |
326 | 456 | "id": "25f7992f",
|
327 | 457 | "metadata": {},
|
328 | 458 | "outputs": [
|
329 | 459 | {
|
330 | 460 | "name": "stdout",
|
331 | 461 | "output_type": "stream",
|
332 | 462 | "text": [
|
333 |
| - "View the evaluation results for project '2023-08-22-19-28-17-RetrievalQA' at:\n", |
334 |
| - "https://smith.langchain.com/projects/p/2133d672-b69a-4091-bc96-a4e39d150db5?eval=true\n" |
| 463 | + "View the evaluation results for project '2023-08-24-03-36-45-RetrievalQA' at:\n", |
| 464 | + "https://smith.langchain.com/projects/p/9fb78371-150e-49cc-a927-b1247fdb9e8d?eval=true\n" |
335 | 465 | ]
|
336 | 466 | }
|
337 | 467 | ],
|
338 | 468 | "source": [
|
339 | 469 | "from langchain.smith import RunEvalConfig, run_on_dataset\n",
|
340 | 470 | "\n",
|
341 | 471 | "evaluation_config = RunEvalConfig(\n",
|
342 |
| - " custom_evaluators=[faithfulness_chain, answer_rel_chain, context_rel_chain],\n", |
| 472 | + " custom_evaluators=[faithfulness_chain, answer_rel_chain, context_rel_chain, context_recall_chain],\n", |
343 | 473 | " prediction_key=\"result\",\n",
|
344 | 474 | ")\n",
|
345 |
| - "\n", |
| 475 | + " \n", |
346 | 476 | "result = run_on_dataset(\n",
|
347 | 477 | " client,\n",
|
348 | 478 | " dataset_name,\n",
|
|
0 commit comments