diff --git a/.github/workflows/combine_nb_to_docs_testing.sh b/.github/workflows/combine_nb_to_docs_testing.sh index fa08345f2..519d0a9ba 100755 --- a/.github/workflows/combine_nb_to_docs_testing.sh +++ b/.github/workflows/combine_nb_to_docs_testing.sh @@ -8,14 +8,14 @@ rm -rf all_tools.ipynb # IF MOVING ANY IPYNB, MAKE SURE TO RE-SYMLINK. MANY IPYNB REFERENCED HERE LIVE # IN OTHER PATHS ALL_NOTEBOOKS=( - langchain_quickstart.ipynb - llama_index_quickstart.ipynb - quickstart.ipynb - prototype_evals.ipynb - human_feedback.ipynb - groundtruth_evals.ipynb - logging.ipynb - custom_feedback_functions.ipynb + ./getting_started/quickstarts/langchain_quickstart.ipynb + ./getting_started/quickstarts/llama_index_quickstart.ipynb + ./getting_started/quickstarts/quickstart.ipynb + ./getting_started/quickstarts/prototype_evals.ipynb + ./getting_started/quickstarts/human_feedback.ipynb + ./getting_started/quickstarts/groundtruth_evals.ipynb + ./tracking/logging/logging.ipynb + ./evaluation/feedback_implementations/custom_feedback_functions.ipynb ) echo "Merging notebooks to all_tools.ipynb: ${ALL_NOTEBOOKS[@]}" nbmerge ${ALL_NOTEBOOKS[@]} --output all_tools.ipynb @@ -28,16 +28,11 @@ printf "\n\n" >> break.md cat gh_top_intro.md break.md ../trulens_explain/gh_top_intro.md > TOP_README.md # Create non-jupyter scripts -OUT_DIR=./py_script_quickstarts -mkdir -p $OUT_DIR -NOTEBOOKS=( - quickstart.ipynb - langchain_quickstart.ipynb - llama_index_quickstart.ipynb - text2text_quickstart.ipynb - all_tools.ipynb -) - +OUT_DIR=./getting_started/quickstarts/ +if [ -f "all_tools.ipynb" ]; then + echo "converting notebook all_tools.ipynb to script" + jupyter nbconvert --to script --output-dir $OUT_DIR all_tools.ipynb +fi # gnu sed/gsed needed on mac: SED=`which -a gsed sed | head -n1` echo "sed=$SED" @@ -45,34 +40,24 @@ echo "sed=$SED" # Fix nbmerge ids field invalid for ipynb $SED -i -e '/\"id\":/d' all_tools.ipynb -for NOTEBOOK in ${NOTEBOOKS[@]} -do - echo "converting notebook $NOTEBOOK to script" - jupyter nbconvert --to script --output-dir $OUT_DIR $NOTEBOOK -done +if [ -f "all_tools.ipynb" ]; then + echo "converting notebook all_tools.ipynb to script" + jupyter nbconvert --to script --output-dir $OUT_DIR all_tools.ipynb +fi -PY_FILES=( - $OUT_DIR/quickstart.py - $OUT_DIR/langchain_quickstart.py - $OUT_DIR/llama_index_quickstart.py - $OUT_DIR/text2text_quickstart.py - $OUT_DIR/all_tools.py -) -for FILE in ${PY_FILES[@]} -do - echo "fixing $FILE" +if [ -f "all_tools.py" ]; then + echo "fixing all_tools.py" ## Remove ipynb JSON calls - $SED'' -i -e "/JSON/d" $FILE + $SED'' -i -e "/JSON/d" all_tools.py ## Replace jupyter display with python print - $SED'' -i -e "s/display/print/g" $FILE + $SED'' -i -e "s/display/print/g" all_tools.py ## Remove cell metadata - $SED'' -i -e "/\# In\[/d" $FILE + $SED'' -i -e "/\# In\[/d" all_tools.py ## Remove single # lines - $SED'' -i -e "/\#$/d" $FILE + $SED'' -i -e "/\#$/d" all_tools.py ## Collapse multiple empty line from sed replacements with a single line - $SED'' -i -e "/./b" -e ":n" -e "N;s/\\n$//;tn" $FILE -done - + $SED'' -i -e "/./b" -e ":n" -e "N;s/\\n$//;tn" all_tools.py +fi # Move generated files to their end locations # EVERYTHING BELOW IS LINKED TO DOCUMENTATION OR TESTS; MAKE SURE YOU UPDATE @@ -82,10 +67,5 @@ done mv README.md ../../trulens_eval/README.md mv TOP_README.md ../../README.md -# Links are referenced in intro.md and gh_intro.md -# There are symlinks from ../../trulens_eval/generated_files/ to these scripts for testing -mkdir -p 
../../trulens_eval/examples/quickstart/py_script_quickstarts/ -mv ./py_script_quickstarts/*.py ../../trulens_eval/examples/quickstart/py_script_quickstarts/ - # Trulens tests run off of these files -mv all_tools* ../../trulens_eval/generated_files/ +mv ./getting_started/quickstarts/all_tools* ../../trulens_eval/generated_files/ diff --git a/docs/trulens_eval/getting_started/quickstarts/index.md b/docs/trulens_eval/getting_started/quickstarts/index.md index 2d9869df8..0ef2d3656 100644 --- a/docs/trulens_eval/getting_started/quickstarts/index.md +++ b/docs/trulens_eval/getting_started/quickstarts/index.md @@ -12,3 +12,4 @@ Quickstart notebooks in this section: - trulens_eval/text2text_quickstart.ipynb - trulens_eval/groundtruth_evals.ipynb - trulens_eval/human_feedback.ipynb +- trulens_eval/prototype_evals.ipynb diff --git a/docs/trulens_eval/getting_started/quickstarts/prototype_evals.ipynb b/docs/trulens_eval/getting_started/quickstarts/prototype_evals.ipynb new file mode 120000 index 000000000..0d3b4f1c1 --- /dev/null +++ b/docs/trulens_eval/getting_started/quickstarts/prototype_evals.ipynb @@ -0,0 +1 @@ +../../../../trulens_eval/examples/quickstart/prototype_evals.ipynb \ No newline at end of file diff --git a/docs/trulens_eval/prototype_evals.ipynb b/docs/trulens_eval/prototype_evals.ipynb deleted file mode 120000 index c02af3ad3..000000000 --- a/docs/trulens_eval/prototype_evals.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../trulens_eval/examples/quickstart/prototype_evals.ipynb \ No newline at end of file diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/all_tools.py b/trulens_eval/examples/quickstart/py_script_quickstarts/all_tools.py deleted file mode 100644 index 8a60fcaa2..000000000 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/all_tools.py +++ /dev/null @@ -1,1308 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # ๐ Langchain Quickstart -# -# In this quickstart you will create a simple LLM Chain and learn how to log it and get feedback on an LLM response. -# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/langchain_quickstart.ipynb) - -# ## Setup -# ### Add API keys -# For this quickstart you will need Open AI and Huggingface keys - - - -# ! pip install trulens_eval openai langchain chromadb langchainhub bs4 - - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." 
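The setup cell above hardcodes a placeholder key. A minimal alternative sketch for this step, using only the standard library (os and getpass — neither is required by the original quickstart), reads the key from the environment and falls back to an interactive prompt so the secret never lands in source control:

import os
from getpass import getpass

# Reuse a key already exported in the shell; otherwise prompt for it once.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")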
- - -# ### Import from LangChain and TruLens - - - -# Imports main tools: -from trulens_eval import TruChain, Feedback, Tru -tru = Tru() -tru.reset_database() - -# Imports from langchain to build app -import bs4 -from langchain import hub -from langchain.chat_models import ChatOpenAI -from langchain.document_loaders import WebBaseLoader -from langchain.embeddings import OpenAIEmbeddings -from langchain.schema import StrOutputParser -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.vectorstores import Chroma -from langchain_core.runnables import RunnablePassthrough - - -# ### Load documents - - - -loader = WebBaseLoader( - web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",), - bs_kwargs=dict( - parse_only=bs4.SoupStrainer( - class_=("post-content", "post-title", "post-header") - ) - ), -) -docs = loader.load() - - -# ### Create Vector Store - - - -text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, - chunk_overlap=200 -) - -splits = text_splitter.split_documents(docs) - -vectorstore = Chroma.from_documents( - documents=splits, - embedding=OpenAIEmbeddings() -) - - -# ### Create RAG - - - -retriever = vectorstore.as_retriever() - -prompt = hub.pull("rlm/rag-prompt") -llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) - -def format_docs(docs): - return "\n\n".join(doc.page_content for doc in docs) - -rag_chain = ( - {"context": retriever | format_docs, "question": RunnablePassthrough()} - | prompt - | llm - | StrOutputParser() -) - - -# ### Send your first request - - - -rag_chain.invoke("What is Task Decomposition?") - - -# ## Initialize Feedback Function(s) - - - -from trulens_eval.feedback.provider import OpenAI -import numpy as np - -# Initialize provider class -openai = OpenAI() - -# select context to be used in feedback. the location of context is app specific. -from trulens_eval.app import App -context = App.select_context(rag_chain) - -from trulens_eval.feedback import Groundedness -grounded = Groundedness(groundedness_provider=OpenAI()) -# Define a groundedness feedback function -f_groundedness = ( - Feedback(grounded.groundedness_measure_with_cot_reasons) - .on(context.collect()) # collect context chunks into a list - .on_output() - .aggregate(grounded.grounded_statements_aggregator) -) - -# Question/answer relevance between overall question and answer. -f_qa_relevance = Feedback(openai.relevance).on_input_output() -# Question/statement relevance between question and each context chunk. 
-f_context_relevance = ( - Feedback(openai.qs_relevance) - .on_input() - .on(context) - .aggregate(np.mean) -) - - -# ## Instrument chain for logging with TruLens - - - -tru_recorder = TruChain(rag_chain, - app_id='Chain1_ChatApplication', - feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness]) - - - - -response, tru_record = tru_recorder.with_record(rag_chain.invoke, "What is Task Decomposition?") - - - - -json_like = tru_record.layout_calls_as_app() - - - - -json_like - - - - -from ipytree import Tree, Node - -def print_call_stack(data): - tree = Tree() - tree.add_node(Node('Record ID: {}'.format(data['record_id']))) - tree.add_node(Node('App ID: {}'.format(data['app_id']))) - tree.add_node(Node('Cost: {}'.format(data['cost']))) - tree.add_node(Node('Performance: {}'.format(data['perf']))) - tree.add_node(Node('Timestamp: {}'.format(data['ts']))) - tree.add_node(Node('Tags: {}'.format(data['tags']))) - tree.add_node(Node('Main Input: {}'.format(data['main_input']))) - tree.add_node(Node('Main Output: {}'.format(data['main_output']))) - tree.add_node(Node('Main Error: {}'.format(data['main_error']))) - - calls_node = Node('Calls') - tree.add_node(calls_node) - - for call in data['calls']: - call_node = Node('Call') - calls_node.add_node(call_node) - - for step in call['stack']: - step_node = Node('Step: {}'.format(step['path'])) - call_node.add_node(step_node) - if 'expanded' in step: - expanded_node = Node('Expanded') - step_node.add_node(expanded_node) - for expanded_step in step['expanded']: - expanded_step_node = Node('Step: {}'.format(expanded_step['path'])) - expanded_node.add_node(expanded_step_node) - - return tree - -# Usage -tree = print_call_stack(json_like) -tree - - - - -tree - - - - -with tru_recorder as recording: - llm_response = rag_chain.invoke("What is Task Decomposition?") - -print(llm_response) - - -# ## Retrieve records and feedback - - - -# The record of the app invocation can be retrieved from the `recording`: - -rec = recording.get() # use .get if only one record -# recs = recording.records # use .records if multiple - -print(rec) - - - - -# The results of the feedback functions can be rertireved from -# `Record.feedback_results` or using the `wait_for_feedback_result` method. The -# results if retrieved directly are `Future` instances (see -# `concurrent.futures`). You can use `as_completed` to wait until they have -# finished evaluating or use the utility method: - -for feedback, feedback_result in rec.wait_for_feedback_results().items(): - print(feedback.name, feedback_result.result) - -# See more about wait_for_feedback_results: -# help(rec.wait_for_feedback_results) - - - - -records, feedback = tru.get_records_and_feedback(app_ids=["Chain1_ChatApplication"]) - -records.head() - - - - -tru.get_leaderboard(app_ids=["Chain1_ChatApplication"]) - - -# ## Explore in a Dashboard - - - -tru.run_dashboard() # open a local streamlit app to explore - -# tru.stop_dashboard() # stop if needed - - -# Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. - -# Note: Feedback functions evaluated in the deferred manner can be seen in the "Progress" page of the TruLens dashboard. - -# # ๐ Llama-Index Quickstart -# -# In this quickstart you will create a simple Llama Index app and learn how to log it and get feedback on an LLM response. -# -# For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. 
-# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb) - -# ## Setup -# -# ### Install dependencies -# Let's install some of the dependencies for this notebook if we don't have them already - - - -# pip install trulens_eval llama_index openai - - -# ### Add API keys -# For this quickstart, you will need Open AI and Huggingface keys. The OpenAI key is used for embeddings and GPT, and the Huggingface key is used for evaluation. - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." - - -# ### Import from TruLens - - - -from trulens_eval import Tru -tru = Tru() - - -# ### Download data -# -# This example uses the text of Paul Grahamโs essay, [โWhat I Worked Onโ](https://paulgraham.com/worked.html), and is the canonical llama-index example. -# -# The easiest way to get it is to [download it via this link](https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt) and save it in a folder called data. You can do so with the following command: - - - -get_ipython().system('wget https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt -P data/') - - -# ### Create Simple LLM Application -# -# This example uses LlamaIndex which internally uses an OpenAI LLM. - - - -from llama_index.core import VectorStoreIndex, SimpleDirectoryReader - -documents = SimpleDirectoryReader("data").load_data() -index = VectorStoreIndex.from_documents(documents) - -query_engine = index.as_query_engine() - - -# ### Send your first request - - - -response = query_engine.query("What did the author do growing up?") -print(response) - - -# ## Initialize Feedback Function(s) - - - -import numpy as np - -# Initialize provider class -from trulens_eval.feedback.provider.openai import OpenAI -openai = OpenAI() - -# select context to be used in feedback. the location of context is app specific. -from trulens_eval.app import App -context = App.select_context(query_engine) - -# imports for feedback -from trulens_eval import Feedback - -# Define a groundedness feedback function -from trulens_eval.feedback import Groundedness -grounded = Groundedness(groundedness_provider=OpenAI()) -f_groundedness = ( - Feedback(grounded.groundedness_measure_with_cot_reasons) - .on(context.collect()) # collect context chunks into a list - .on_output() - .aggregate(grounded.grounded_statements_aggregator) -) - -# Question/answer relevance between overall question and answer. -f_qa_relevance = Feedback(openai.relevance).on_input_output() - -# Question/statement relevance between question and each context chunk. 
-f_qs_relevance = ( - Feedback(openai.qs_relevance) - .on_input() - .on(context) - .aggregate(np.mean) -) - - -# ## Instrument app for logging with TruLens - - - -from trulens_eval import TruLlama -tru_query_engine_recorder = TruLlama(query_engine, - app_id='LlamaIndex_App1', - feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance]) - - - - -# or as context manager -with tru_query_engine_recorder as recording: - query_engine.query("What did the author do growing up?") - - -# ## Retrieve records and feedback - - - -# The record of the app invocation can be retrieved from the `recording`: - -rec = recording.get() # use .get if only one record -# recs = recording.records # use .records if multiple - -print(rec) - - - - -tru.run_dashboard() - - - - -# The results of the feedback functions can be rertireved from -# `Record.feedback_results` or using the `wait_for_feedback_result` method. The -# results if retrieved directly are `Future` instances (see -# `concurrent.futures`). You can use `as_completed` to wait until they have -# finished evaluating or use the utility method: - -for feedback, feedback_result in rec.wait_for_feedback_results().items(): - print(feedback.name, feedback_result.result) - -# See more about wait_for_feedback_results: -# help(rec.wait_for_feedback_results) - - - - -records, feedback = tru.get_records_and_feedback(app_ids=["LlamaIndex_App1"]) - -records.head() - - - - -tru.get_leaderboard(app_ids=["LlamaIndex_App1"]) - - -# ## Explore in a Dashboard - - - -tru.run_dashboard() # open a local streamlit app to explore - -# tru.stop_dashboard() # stop if needed - - -# Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. - -# # ๐ TruLens Quickstart -# -# In this quickstart you will create a RAG from scratch and learn how to log it and get feedback on an LLM response. -# -# For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/quickstart.ipynb) - - - -# ! pip install trulens_eval chromadb openai - - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." - - -# ## Get Data -# -# In this case, we'll just initialize some simple text in the notebook. - - - -university_info = """ -The University of Washington, founded in 1861 in Seattle, is a public research university -with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell. -As the flagship institution of the six public universities in Washington state, -UW encompasses over 500 buildings and 20 million square feet of space, -including one of the largest library systems in the world. -""" - - -# ## Create Vector Store -# -# Create a chromadb vector store in memory. - - - -from openai import OpenAI -oai_client = OpenAI() - -oai_client.embeddings.create( - model="text-embedding-ada-002", - input=university_info - ) - - - - -import chromadb -from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction - -embedding_function = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'), - model_name="text-embedding-ada-002") - - -chroma_client = chromadb.Client() -vector_store = chroma_client.get_or_create_collection(name="Universities", - embedding_function=embedding_function) - - -# Add the university_info to the embedding database. 
- - - -vector_store.add("uni_info", documents=university_info) - - -# ## Build RAG from scratch -# -# Build a custom RAG from scratch, and add TruLens custom instrumentation. - - - -from trulens_eval import Tru -from trulens_eval.tru_custom_app import instrument -tru = Tru() - - - - -class RAG_from_scratch: - @instrument - def retrieve(self, query: str) -> list: - """ - Retrieve relevant text from vector store. - """ - results = vector_store.query( - query_texts=query, - n_results=2 - ) - return results['documents'][0] - - @instrument - def generate_completion(self, query: str, context_str: list) -> str: - """ - Generate answer from context. - """ - completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages= - [ - {"role": "user", - "content": - f"We have provided context information below. \n" - f"---------------------\n" - f"{context_str}" - f"\n---------------------\n" - f"Given this information, please answer the question: {query}" - } - ] - ).choices[0].message.content - return completion - - @instrument - def query(self, query: str) -> str: - context_str = self.retrieve(query) - completion = self.generate_completion(query, context_str) - return completion - -rag = RAG_from_scratch() - - -# ## Set up feedback functions. -# -# Here we'll use groundedness, answer relevance and context relevance to detect hallucination. - - - -from trulens_eval import Feedback, Select -from trulens_eval.feedback import Groundedness -from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI - -import numpy as np - -# Initialize provider class -fopenai = fOpenAI() - -grounded = Groundedness(groundedness_provider=fopenai) - -# Define a groundedness feedback function -f_groundedness = ( - Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness") - .on(Select.RecordCalls.retrieve.rets.collect()) - .on_output() - .aggregate(grounded.grounded_statements_aggregator) -) - -# Question/answer relevance between overall question and answer. -f_qa_relevance = ( - Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance") - .on(Select.RecordCalls.retrieve.args.query) - .on_output() -) - -# Question/statement relevance between question and each context chunk. -f_context_relevance = ( - Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance") - .on(Select.RecordCalls.retrieve.args.query) - .on(Select.RecordCalls.retrieve.rets.collect()) - .aggregate(np.mean) -) - - -# ## Construct the app -# Wrap the custom RAG with TruCustomApp, add list of feedbacks for eval - - - -from trulens_eval import TruCustomApp -tru_rag = TruCustomApp(rag, - app_id = 'RAG v1', - feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance]) - - -# ## Run the app -# Use `tru_rag` as a context manager for the custom RAG-from-scratch app. - - - -with tru_rag as recording: - rag.query("When was the University of Washington founded?") - - - - -tru.get_leaderboard(app_ids=["RAG v1"]) - - - - -tru.run_dashboard() - - -# # Prototype Evals -# This notebook shows the use of the dummy feedback function provider which -# behaves like the huggingface provider except it does not actually perform any -# network calls and just produces constant results. It can be used to prototype -# feedback function wiring for your apps before invoking potentially slow (to -# run/to load) feedback functions. 
-# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/prototype_evals.ipynb) - -# ## Import libraries - - - -# ! pip install trulens_eval - - - - -from trulens_eval import Feedback -from trulens_eval import Tru - -tru = Tru() - -tru.run_dashboard() - - -# ## Set keys - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." - - -# ## Build the app - - - -from openai import OpenAI -oai_client = OpenAI() - -from trulens_eval.tru_custom_app import instrument - -class APP: - @instrument - def completion(self, prompt): - completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages= - [ - {"role": "user", - "content": - f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content - return completion - -llm_app = APP() - - -# ## Create dummy feedback -# -# By setting the provider as `Dummy()`, you can erect your evaluation suite and then easily substitute in a real model provider (e.g. OpenAI) later. - - - -from trulens_eval.feedback.provider.hugs import Dummy - -# hugs = Huggingface() -hugs = Dummy() - -f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output() - - -# ## Create the app - - - -# add trulens as a context manager for llm_app with dummy feedback -from trulens_eval import TruCustomApp -tru_app = TruCustomApp(llm_app, - app_id = 'LLM App v1', - feedbacks = [f_positive_sentiment]) - - -# ## Run the app - - - -with tru_app as recording: - llm_app.completion('give me a good name for a colorful sock company') - - - - -tru.get_leaderboard(app_ids=[tru_app.app_id]) - - -# # ๐ Logging Human Feedback -# -# In many situations, it can be useful to log human feedback from your users about your LLM app's performance. Combining human feedback along with automated feedback can help you drill down on subsets of your app that underperform, and uncover new failure modes. This example will walk you through a simple example of recording human feedback with TruLens. -# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/human_feedback.ipynb) - - - -# ! pip install trulens_eval openai - - - - -import os - -from trulens_eval import Tru -from trulens_eval import TruCustomApp - -tru = Tru() - - -# ## Set Keys -# -# For this example, you need an OpenAI key. - - - -os.environ["OPENAI_API_KEY"] = "sk-..." - - -# ## Set up your app -# -# Here we set up a custom application using just an OpenAI chat completion. The process for logging human feedback is the same however you choose to set up your app. - - - -from openai import OpenAI -oai_client = OpenAI() - -from trulens_eval.tru_custom_app import instrument - -class APP: - @instrument - def completion(self, prompt): - completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages= - [ - {"role": "user", - "content": - f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content - return completion - -llm_app = APP() - -# add trulens as a context manager for llm_app -tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1') - - -# ## Run the app - - - -with tru_app as recording: - llm_app.completion("Give me 10 names for a colorful sock company") - - - - -# Get the record to add the feedback to. -record = recording.get() - - -# ## Create a mechamism for recording human feedback. 
-# -# Be sure to click an emoji in the record to record `human_feedback` to log. - - - -from ipywidgets import Button, HBox, VBox - -thumbs_up_button = Button(description='๐') -thumbs_down_button = Button(description='๐') - -human_feedback = None - -def on_thumbs_up_button_clicked(b): - global human_feedback - human_feedback = 1 - -def on_thumbs_down_button_clicked(b): - global human_feedback - human_feedback = 0 - -thumbs_up_button.on_click(on_thumbs_up_button_clicked) -thumbs_down_button.on_click(on_thumbs_down_button_clicked) - -HBox([thumbs_up_button, thumbs_down_button]) - - - - -# add the human feedback to a particular app and record -tru.add_feedback( - name="Human Feedack", - record_id=record.record_id, - app_id=tru_app.app_id, - result=human_feedback -) - - -# ## See the result logged with your app. - - - -tru.get_leaderboard(app_ids=[tru_app.app_id]) - - -# # ๐ Ground Truth Evaluations -# -# In this quickstart you will create a evaluate a LangChain app using ground truth. Ground truth evaluation can be especially useful during early LLM experiments when you have a small set of example queries that are critical to get right. -# -# Ground truth evaluation works by comparing the similarity of an LLM response compared to its matching verified response. -# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/groundtruth_evals.ipynb) - -# ### Add API keys -# For this quickstart, you will need Open AI keys. - - - -# ! pip install trulens_eval openai - - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." - - - - -from trulens_eval import Tru - -tru = Tru() - - -# ### Create Simple LLM Application - - - -from openai import OpenAI -oai_client = OpenAI() - -from trulens_eval.tru_custom_app import instrument - -class APP: - @instrument - def completion(self, prompt): - completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages= - [ - {"role": "user", - "content": - f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content - return completion - -llm_app = APP() - - -# ## Initialize Feedback Function(s) - - - -from trulens_eval import Feedback -from trulens_eval.feedback import GroundTruthAgreement - -golden_set = [ - {"query": "who invented the lightbulb?", "response": "Thomas Edison"}, - {"query": "ยฟquien invento la bombilla?", "response": "Thomas Edison"} -] - -f_groundtruth = Feedback(GroundTruthAgreement(golden_set).agreement_measure, name = "Ground Truth").on_input_output() - - -# ## Instrument chain for logging with TruLens - - - -# add trulens as a context manager for llm_app -from trulens_eval import TruCustomApp -tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1', feedbacks = [f_groundtruth]) - - - - -# Instrumented query engine can operate as a context manager: -with tru_app as recording: - llm_app.completion("ยฟquien invento la bombilla?") - llm_app.completion("who invented the lightbulb?") - - -# ## See results - - - -tru.get_leaderboard(app_ids=[tru_app.app_id]) - - -# # Logging Methods -# -# ## Automatic Logging -# -# The simplest method for logging with TruLens is by wrapping with TruChain and -# including the tru argument, as shown in the quickstart. 
-# -# This is done like so: - - - -# Imports main tools: -from trulens_eval import Feedback -from trulens_eval import Huggingface -from trulens_eval import Tru -from trulens_eval import TruChain - -tru = Tru() - -Tru().migrate_database() - -from langchain.chains import LLMChain -from langchain_community.llms import OpenAI -from langchain.prompts import ChatPromptTemplate -from langchain.prompts import HumanMessagePromptTemplate -from langchain.prompts import PromptTemplate - -full_prompt = HumanMessagePromptTemplate( - prompt=PromptTemplate( - template= - "Provide a helpful response with relevant background information for the following: {prompt}", - input_variables=["prompt"], - ) -) - -chat_prompt_template = ChatPromptTemplate.from_messages([full_prompt]) - -llm = OpenAI(temperature=0.9, max_tokens=128) - -chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True) - -truchain = TruChain( - chain, - app_id='Chain1_ChatApplication', - tru=tru -) -with truchain: - chain("This will be automatically logged.") - - -# Feedback functions can also be logged automatically by providing them in a list -# to the feedbacks arg. - - - -# Initialize Huggingface-based feedback function collection class: -hugs = Huggingface() - -# Define a language match feedback function using HuggingFace. -f_lang_match = Feedback(hugs.language_match).on_input_output() -# By default this will check language match on the main app input and main app -# output. - - - - -truchain = TruChain( - chain, - app_id='Chain1_ChatApplication', - feedbacks=[f_lang_match], # feedback functions - tru=tru -) -with truchain: - chain("This will be automatically logged.") - - -# ## Manual Logging -# -# ### Wrap with TruChain to instrument your chain - - - -tc = TruChain(chain, app_id='Chain1_ChatApplication') - - -# ### Set up logging and instrumentation -# -# Making the first call to your wrapped LLM Application will now also produce a log or "record" of the chain execution. -# - - - -prompt_input = 'que hora es?' -gpt3_response, record = tc.with_record(chain.__call__, prompt_input) - - -# We can log the records but first we need to log the chain itself. - - - -tru.add_app(app=truchain) - - -# Then we can log the record: - - - -tru.add_record(record) - - -# ### Log App Feedback -# Capturing app feedback such as user feedback of the responses can be added with -# one call. - - - -thumb_result = True -tru.add_feedback( - name="๐ (1) or ๐ (0)", - record_id=record.record_id, - result=thumb_result -) - - -# ### Evaluate Quality -# -# Following the request to your app, you can then evaluate LLM quality using -# feedback functions. This is completed in a sequential call to minimize latency -# for your application, and evaluations will also be logged to your local machine. -# -# To get feedback on the quality of your LLM, you can use any of the provided -# feedback functions or add your own. -# -# To assess your LLM quality, you can provide the feedback functions to -# `tru.run_feedback()` in a list provided to `feedback_functions`. -# - - - -feedback_results = tru.run_feedback_functions( - record=record, - feedback_functions=[f_lang_match] -) -for result in feedback_results: - print(result) - - -# After capturing feedback, you can then log it to your local database. - - - -tru.add_feedbacks(feedback_results) - - -# ### Out-of-band Feedback evaluation -# -# In the above example, the feedback function evaluation is done in the same -# process as the chain evaluation. 
The alternative approach is the use the -# provided persistent evaluator started via -# `tru.start_deferred_feedback_evaluator`. Then specify the `feedback_mode` for -# `TruChain` as `deferred` to let the evaluator handle the feedback functions. -# -# For demonstration purposes, we start the evaluator here but it can be started in -# another process. - - - -truchain: TruChain = TruChain( - chain, - app_id='Chain1_ChatApplication', - feedbacks=[f_lang_match], - tru=tru, - feedback_mode="deferred" -) - -with truchain: - chain("This will be logged by deferred evaluator.") - -tru.start_evaluator() -# tru.stop_evaluator() - - -# # ๐ Custom Feedback Functions -# -# Feedback functions are an extensible framework for evaluating LLMs. You can add your own feedback functions to evaluate the qualities required by your application by updating `trulens_eval/feedback.py`, or simply creating a new provider class and feedback function in youre notebook. If your contributions would be useful for others, we encourage you to contribute to TruLens! -# -# Feedback functions are organized by model provider into Provider classes. -# -# The process for adding new feedback functions is: -# 1. Create a new Provider class or locate an existing one that applies to your feedback function. If your feedback function does not rely on a model provider, you can create a standalone class. Add the new feedback function method to your selected class. Your new method can either take a single text (str) as a parameter or both prompt (str) and response (str). It should return a float between 0 (worst) and 1 (best). - - - -from trulens_eval import Provider, Feedback, Select, Tru - -class StandAlone(Provider): - def custom_feedback(self, my_text_field: str) -> float: - """ - A dummy function of text inputs to float outputs. - - Parameters: - my_text_field (str): Text to evaluate. - - Returns: - float: square length of the text - """ - return 1.0 / (1.0 + len(my_text_field) * len(my_text_field)) - - -# 2. Instantiate your provider and feedback functions. The feedback function is wrapped by the trulens-eval Feedback class which helps specify what will get sent to your function parameters (For example: Select.RecordInput or Select.RecordOutput) - - - -standalone = StandAlone() -f_custom_function = Feedback(standalone.custom_feedback).on( - my_text_field=Select.RecordOutput -) - - -# 3. Your feedback function is now ready to use just like the out of the box feedback functions. Below is an example of it being used. - - - -tru = Tru() -feedback_results = tru.run_feedback_functions( - record=record, - feedback_functions=[f_custom_function] -) -tru.add_feedbacks(feedback_results) - - -# ## Extending existing providers. -# -# In addition to calling your own methods, you can also extend stock feedback providers (such as `OpenAI`, `AzureOpenAI`, `Bedrock`) to custom feedback implementations. This can be especially useful for tweaking stock feedback functions, or running custom feedback function prompts while letting TruLens handle the backend LLM provider. -# -# This is done by subclassing the provider you wish to extend, and using the `generate_score` method that runs the provided prompt with your specified provider, and extracts a float score from 0-1. Your prompt should request the LLM respond on the scale from 0 to 10, then the `generate_score` method will normalize to 0-1. 
-# -# See below for example usage: - - - -from trulens_eval.feedback.provider import AzureOpenAI - -class Custom_AzureOpenAI(AzureOpenAI): - def style_check_professional(self, response: str) -> float: - """ - Custom feedback function to grade the professional style of the resposne, extending AzureOpenAI provider. - - Args: - response (str): text to be graded for professional style. - - Returns: - float: A value between 0 and 1. 0 being "not professional" and 1 being "professional". - """ - professional_prompt = str.format("Please rate the professionalism of the following text on a scale from 0 to 10, where 0 is not at all professional and 10 is extremely professional: \n\n{}", response) - return self.generate_score(system_prompt=professional_prompt) - - -# Running "chain of thought evaluations" is another use case for extending providers. Doing so follows a similar process as above, where the base provider (such as `AzureOpenAI`) is subclassed. -# -# For this case, the method `generate_score_and_reasons` can be used to extract both the score and chain of thought reasons from the LLM response. -# -# To use this method, the prompt used should include the `COT_REASONS_TEMPLATE` available from the TruLens prompts library (`trulens_eval.feedback.prompts`). -# -# See below for example usage: - - - -from typing import Tuple, Dict -from trulens_eval.feedback import prompts - -class Custom_AzureOpenAI(AzureOpenAI): - def qs_relevance_with_cot_reasons_extreme(self, question: str, statement: str) -> Tuple[float, Dict]: - """ - Tweaked version of question statement relevance, extending AzureOpenAI provider. - A function that completes a template to check the relevance of the statement to the question. - Scoring guidelines for scores 5-8 are removed to push the LLM to more extreme scores. - Also uses chain of thought methodology and emits the reasons. - - Args: - question (str): A question being asked. - statement (str): A statement to the question. - - Returns: - float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant". - """ - - system_prompt = str.format(prompts.QS_RELEVANCE, question = question, statement = statement) - - # remove scoring guidelines around middle scores - system_prompt = system_prompt.replace( - "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n", "") - - system_prompt = system_prompt.replace( - "RELEVANCE:", prompts.COT_REASONS_TEMPLATE - ) - - return self.generate_score_and_reasons(system_prompt) - - -# ## Multi-Output Feedback functions -# Trulens also supports multi-output feedback functions. As a typical feedback function will output a float between 0 and 1, multi-output should output a dictionary of `output_key` to a float between 0 and 1. The feedbacks table will print the feedback with column `feedback_name:::outputkey` - - - -multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi").on( - input_param=Select.RecordOutput -) -feedback_results = tru.run_feedback_functions( - record=record, - feedback_functions=[multi_output_feedback] -) -tru.add_feedbacks(feedback_results) - - - - -# Aggregators will run on the same dict keys. 
-import numpy as np -multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi-agg").on( - input_param=Select.RecordOutput -).aggregate(np.mean) -feedback_results = tru.run_feedback_functions( - record=record, - feedback_functions=[multi_output_feedback] -) -tru.add_feedbacks(feedback_results) - - - - -# For multi-context chunking, an aggregator can operate on a list of multi output dictionaries. -def dict_aggregator(list_dict_input): - agg = 0 - for dict_input in list_dict_input: - agg += dict_input['output_key1'] - return agg -multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi-agg-dict").on( - input_param=Select.RecordOutput -).aggregate(dict_aggregator) -feedback_results = tru.run_feedback_functions( - record=record, - feedback_functions=[multi_output_feedback] -) -tru.add_feedbacks(feedback_results) - diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/langchain_quickstart.py b/trulens_eval/examples/quickstart/py_script_quickstarts/langchain_quickstart.py deleted file mode 100644 index 942d2d186..000000000 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/langchain_quickstart.py +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # ๐ Langchain Quickstart -# -# In this quickstart you will create a simple LLM Chain and learn how to log it and get feedback on an LLM response. -# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/langchain_quickstart.ipynb) - -# ## Setup -# ### Add API keys -# For this quickstart you will need Open AI and Huggingface keys - - - -# ! pip install trulens_eval openai langchain chromadb langchainhub bs4 - - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." 
- - -# ### Import from LangChain and TruLens - - - -# Imports main tools: -from trulens_eval import TruChain, Feedback, Tru -tru = Tru() -tru.reset_database() - -# Imports from langchain to build app -import bs4 -from langchain import hub -from langchain.chat_models import ChatOpenAI -from langchain.document_loaders import WebBaseLoader -from langchain.embeddings import OpenAIEmbeddings -from langchain.schema import StrOutputParser -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.vectorstores import Chroma -from langchain_core.runnables import RunnablePassthrough - - -# ### Load documents - - - -loader = WebBaseLoader( - web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",), - bs_kwargs=dict( - parse_only=bs4.SoupStrainer( - class_=("post-content", "post-title", "post-header") - ) - ), -) -docs = loader.load() - - -# ### Create Vector Store - - - -text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, - chunk_overlap=200 -) - -splits = text_splitter.split_documents(docs) - -vectorstore = Chroma.from_documents( - documents=splits, - embedding=OpenAIEmbeddings() -) - - -# ### Create RAG - - - -retriever = vectorstore.as_retriever() - -prompt = hub.pull("rlm/rag-prompt") -llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) - -def format_docs(docs): - return "\n\n".join(doc.page_content for doc in docs) - -rag_chain = ( - {"context": retriever | format_docs, "question": RunnablePassthrough()} - | prompt - | llm - | StrOutputParser() -) - - -# ### Send your first request - - - -rag_chain.invoke("What is Task Decomposition?") - - -# ## Initialize Feedback Function(s) - - - -from trulens_eval.feedback.provider import OpenAI -import numpy as np - -# Initialize provider class -openai = OpenAI() - -# select context to be used in feedback. the location of context is app specific. -from trulens_eval.app import App -context = App.select_context(rag_chain) - -from trulens_eval.feedback import Groundedness -grounded = Groundedness(groundedness_provider=OpenAI()) -# Define a groundedness feedback function -f_groundedness = ( - Feedback(grounded.groundedness_measure_with_cot_reasons) - .on(context.collect()) # collect context chunks into a list - .on_output() - .aggregate(grounded.grounded_statements_aggregator) -) - -# Question/answer relevance between overall question and answer. -f_qa_relevance = Feedback(openai.relevance).on_input_output() -# Question/statement relevance between question and each context chunk. 
-f_context_relevance = ( - Feedback(openai.qs_relevance) - .on_input() - .on(context) - .aggregate(np.mean) -) - - -# ## Instrument chain for logging with TruLens - - - -tru_recorder = TruChain(rag_chain, - app_id='Chain1_ChatApplication', - feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness]) - - - - -response, tru_record = tru_recorder.with_record(rag_chain.invoke, "What is Task Decomposition?") - - - - -json_like = tru_record.layout_calls_as_app() - - - - -json_like - - - - -from ipytree import Tree, Node - -def print_call_stack(data): - tree = Tree() - tree.add_node(Node('Record ID: {}'.format(data['record_id']))) - tree.add_node(Node('App ID: {}'.format(data['app_id']))) - tree.add_node(Node('Cost: {}'.format(data['cost']))) - tree.add_node(Node('Performance: {}'.format(data['perf']))) - tree.add_node(Node('Timestamp: {}'.format(data['ts']))) - tree.add_node(Node('Tags: {}'.format(data['tags']))) - tree.add_node(Node('Main Input: {}'.format(data['main_input']))) - tree.add_node(Node('Main Output: {}'.format(data['main_output']))) - tree.add_node(Node('Main Error: {}'.format(data['main_error']))) - - calls_node = Node('Calls') - tree.add_node(calls_node) - - for call in data['calls']: - call_node = Node('Call') - calls_node.add_node(call_node) - - for step in call['stack']: - step_node = Node('Step: {}'.format(step['path'])) - call_node.add_node(step_node) - if 'expanded' in step: - expanded_node = Node('Expanded') - step_node.add_node(expanded_node) - for expanded_step in step['expanded']: - expanded_step_node = Node('Step: {}'.format(expanded_step['path'])) - expanded_node.add_node(expanded_step_node) - - return tree - -# Usage -tree = print_call_stack(json_like) -tree - - - - -tree - - - - -with tru_recorder as recording: - llm_response = rag_chain.invoke("What is Task Decomposition?") - -print(llm_response) - - -# ## Retrieve records and feedback - - - -# The record of the app invocation can be retrieved from the `recording`: - -rec = recording.get() # use .get if only one record -# recs = recording.records # use .records if multiple - -print(rec) - - - - -# The results of the feedback functions can be rertireved from -# `Record.feedback_results` or using the `wait_for_feedback_result` method. The -# results if retrieved directly are `Future` instances (see -# `concurrent.futures`). You can use `as_completed` to wait until they have -# finished evaluating or use the utility method: - -for feedback, feedback_result in rec.wait_for_feedback_results().items(): - print(feedback.name, feedback_result.result) - -# See more about wait_for_feedback_results: -# help(rec.wait_for_feedback_results) - - - - -records, feedback = tru.get_records_and_feedback(app_ids=["Chain1_ChatApplication"]) - -records.head() - - - - -tru.get_leaderboard(app_ids=["Chain1_ChatApplication"]) - - -# ## Explore in a Dashboard - - - -tru.run_dashboard() # open a local streamlit app to explore - -# tru.stop_dashboard() # stop if needed - - -# Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. - -# Note: Feedback functions evaluated in the deferred manner can be seen in the "Progress" page of the TruLens dashboard. 
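The closing note above mentions deferred feedback evaluation without showing it here. A minimal sketch of that wiring — reusing the rag_chain, tru, and feedback functions defined in this quickstart, with an illustrative recorder name — follows the same feedback_mode="deferred" pattern used in the logging notebook:

from trulens_eval import TruChain

# Illustrative recorder: with feedback_mode="deferred", feedback runs are
# queued rather than evaluated inline with the app call.
deferred_recorder = TruChain(
    rag_chain,
    app_id='Chain1_ChatApplication',
    feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness],
    feedback_mode="deferred",
)

with deferred_recorder:
    rag_chain.invoke("What is Task Decomposition?")

# The evaluator (started here, or in a separate process) works through the
# queued feedback; results then appear on the dashboard's "Progress" page.
tru.start_evaluator()
# tru.stop_evaluator()  # stop when done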
diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/llama_index_quickstart.py b/trulens_eval/examples/quickstart/py_script_quickstarts/llama_index_quickstart.py deleted file mode 100644 index 0eec29fe3..000000000 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/llama_index_quickstart.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # ๐ Llama-Index Quickstart -# -# In this quickstart you will create a simple Llama Index app and learn how to log it and get feedback on an LLM response. -# -# For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb) - -# ## Setup -# -# ### Install dependencies -# Let's install some of the dependencies for this notebook if we don't have them already - - - -# pip install trulens_eval llama_index openai - - -# ### Add API keys -# For this quickstart, you will need Open AI and Huggingface keys. The OpenAI key is used for embeddings and GPT, and the Huggingface key is used for evaluation. - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." - - -# ### Import from TruLens - - - -from trulens_eval import Tru -tru = Tru() - - -# ### Download data -# -# This example uses the text of Paul Grahamโs essay, [โWhat I Worked Onโ](https://paulgraham.com/worked.html), and is the canonical llama-index example. -# -# The easiest way to get it is to [download it via this link](https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt) and save it in a folder called data. You can do so with the following command: - - - -get_ipython().system('wget https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt -P data/') - - -# ### Create Simple LLM Application -# -# This example uses LlamaIndex which internally uses an OpenAI LLM. - - - -from llama_index.core import VectorStoreIndex, SimpleDirectoryReader - -documents = SimpleDirectoryReader("data").load_data() -index = VectorStoreIndex.from_documents(documents) - -query_engine = index.as_query_engine() - - -# ### Send your first request - - - -response = query_engine.query("What did the author do growing up?") -print(response) - - -# ## Initialize Feedback Function(s) - - - -import numpy as np - -# Initialize provider class -from trulens_eval.feedback.provider.openai import OpenAI -openai = OpenAI() - -# select context to be used in feedback. the location of context is app specific. -from trulens_eval.app import App -context = App.select_context(query_engine) - -# imports for feedback -from trulens_eval import Feedback - -# Define a groundedness feedback function -from trulens_eval.feedback import Groundedness -grounded = Groundedness(groundedness_provider=OpenAI()) -f_groundedness = ( - Feedback(grounded.groundedness_measure_with_cot_reasons) - .on(context.collect()) # collect context chunks into a list - .on_output() - .aggregate(grounded.grounded_statements_aggregator) -) - -# Question/answer relevance between overall question and answer. -f_qa_relevance = Feedback(openai.relevance).on_input_output() - -# Question/statement relevance between question and each context chunk. 
-f_qs_relevance = ( - Feedback(openai.qs_relevance) - .on_input() - .on(context) - .aggregate(np.mean) -) - - -# ## Instrument app for logging with TruLens - - - -from trulens_eval import TruLlama -tru_query_engine_recorder = TruLlama(query_engine, - app_id='LlamaIndex_App1', - feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance]) - - - - -# or as context manager -with tru_query_engine_recorder as recording: - query_engine.query("What did the author do growing up?") - - -# ## Retrieve records and feedback - - - -# The record of the app invocation can be retrieved from the `recording`: - -rec = recording.get() # use .get if only one record -# recs = recording.records # use .records if multiple - -print(rec) - - - - -tru.run_dashboard() - - - - -# The results of the feedback functions can be rertireved from -# `Record.feedback_results` or using the `wait_for_feedback_result` method. The -# results if retrieved directly are `Future` instances (see -# `concurrent.futures`). You can use `as_completed` to wait until they have -# finished evaluating or use the utility method: - -for feedback, feedback_result in rec.wait_for_feedback_results().items(): - print(feedback.name, feedback_result.result) - -# See more about wait_for_feedback_results: -# help(rec.wait_for_feedback_results) - - - - -records, feedback = tru.get_records_and_feedback(app_ids=["LlamaIndex_App1"]) - -records.head() - - - - -tru.get_leaderboard(app_ids=["LlamaIndex_App1"]) - - -# ## Explore in a Dashboard - - - -tru.run_dashboard() # open a local streamlit app to explore - -# tru.stop_dashboard() # stop if needed - - -# Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/quickstart.py b/trulens_eval/examples/quickstart/py_script_quickstarts/quickstart.py deleted file mode 100644 index c542e5cdf..000000000 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/quickstart.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # ๐ TruLens Quickstart -# -# In this quickstart you will create a RAG from scratch and learn how to log it and get feedback on an LLM response. -# -# For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/quickstart.ipynb) - - - -# ! pip install trulens_eval chromadb openai - - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." - - -# ## Get Data -# -# In this case, we'll just initialize some simple text in the notebook. - - - -university_info = """ -The University of Washington, founded in 1861 in Seattle, is a public research university -with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell. -As the flagship institution of the six public universities in Washington state, -UW encompasses over 500 buildings and 20 million square feet of space, -including one of the largest library systems in the world. -""" - - -# ## Create Vector Store -# -# Create a chromadb vector store in memory. 
- - - -from openai import OpenAI -oai_client = OpenAI() - -oai_client.embeddings.create( - model="text-embedding-ada-002", - input=university_info - ) - - - - -import chromadb -from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction - -embedding_function = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'), - model_name="text-embedding-ada-002") - - -chroma_client = chromadb.Client() -vector_store = chroma_client.get_or_create_collection(name="Universities", - embedding_function=embedding_function) - - -# Add the university_info to the embedding database. - - - -vector_store.add("uni_info", documents=university_info) - - -# ## Build RAG from scratch -# -# Build a custom RAG from scratch, and add TruLens custom instrumentation. - - - -from trulens_eval import Tru -from trulens_eval.tru_custom_app import instrument -tru = Tru() - - - - -class RAG_from_scratch: - @instrument - def retrieve(self, query: str) -> list: - """ - Retrieve relevant text from vector store. - """ - results = vector_store.query( - query_texts=query, - n_results=2 - ) - return results['documents'][0] - - @instrument - def generate_completion(self, query: str, context_str: list) -> str: - """ - Generate answer from context. - """ - completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages= - [ - {"role": "user", - "content": - f"We have provided context information below. \n" - f"---------------------\n" - f"{context_str}" - f"\n---------------------\n" - f"Given this information, please answer the question: {query}" - } - ] - ).choices[0].message.content - return completion - - @instrument - def query(self, query: str) -> str: - context_str = self.retrieve(query) - completion = self.generate_completion(query, context_str) - return completion - -rag = RAG_from_scratch() - - -# ## Set up feedback functions. -# -# Here we'll use groundedness, answer relevance and context relevance to detect hallucination. - - - -from trulens_eval import Feedback, Select -from trulens_eval.feedback import Groundedness -from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI - -import numpy as np - -# Initialize provider class -fopenai = fOpenAI() - -grounded = Groundedness(groundedness_provider=fopenai) - -# Define a groundedness feedback function -f_groundedness = ( - Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness") - .on(Select.RecordCalls.retrieve.rets.collect()) - .on_output() - .aggregate(grounded.grounded_statements_aggregator) -) - -# Question/answer relevance between overall question and answer. -f_qa_relevance = ( - Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance") - .on(Select.RecordCalls.retrieve.args.query) - .on_output() -) - -# Question/statement relevance between question and each context chunk. -f_context_relevance = ( - Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance") - .on(Select.RecordCalls.retrieve.args.query) - .on(Select.RecordCalls.retrieve.rets.collect()) - .aggregate(np.mean) -) - - -# ## Construct the app -# Wrap the custom RAG with TruCustomApp, add list of feedbacks for eval - - - -from trulens_eval import TruCustomApp -tru_rag = TruCustomApp(rag, - app_id = 'RAG v1', - feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance]) - - -# ## Run the app -# Use `tru_rag` as a context manager for the custom RAG-from-scratch app. 
- - - -with tru_rag as recording: - rag.query("When was the University of Washington founded?") - - - - -tru.get_leaderboard(app_ids=["RAG v1"]) - - - - -tru.run_dashboard() - diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/text2text_quickstart.py b/trulens_eval/examples/quickstart/py_script_quickstarts/text2text_quickstart.py deleted file mode 100644 index fde8df134..000000000 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/text2text_quickstart.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # ๐ Text to Text Quickstart -# -# In this quickstart you will create a simple text to text application and learn how to log it and get feedback. -# -# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/text2text_quickstart.ipynb) - -# ## Setup -# ### Add API keys -# For this quickstart you will need an OpenAI Key. - - - -# ! pip install trulens_eval openai - - - - -import os -os.environ["OPENAI_API_KEY"] = "sk-..." - - -# ### Import from TruLens - - - -# Create openai client -from openai import OpenAI -client = OpenAI() - -# Imports main tools: -from trulens_eval import Feedback, OpenAI as fOpenAI, Tru -tru = Tru() -tru.reset_database() - - -# ### Create Simple Text to Text Application -# -# This example uses a bare bones OpenAI LLM, and a non-LLM just for demonstration purposes. - - - -def llm_standalone(prompt): - return client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": "You are a question and answer bot, and you answer super upbeat."}, - {"role": "user", "content": prompt} - ] - ).choices[0].message.content - - -# ### Send your first request - - - -prompt_input="How good is language AI?" -prompt_output = llm_standalone(prompt_input) -prompt_output - - -# ## Initialize Feedback Function(s) - - - -# Initialize OpenAI-based feedback function collection class: -fopenai = fOpenAI() - -# Define a relevance function from openai -f_relevance = Feedback(fopenai.relevance).on_input_output() - - -# ## Instrument the callable for logging with TruLens - - - -from trulens_eval import TruBasicApp -tru_llm_standalone_recorder = TruBasicApp(llm_standalone, app_id="Happy Bot", feedbacks=[f_relevance]) - - - - -with tru_llm_standalone_recorder as recording: - tru_llm_standalone_recorder.app(prompt_input) - - -# ## Explore in a Dashboard - - - -tru.run_dashboard() # open a local streamlit app to explore - -# tru.stop_dashboard() # stop if needed - - -# Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. 
- -# ## Or view results directly in your notebook - - - -tru.get_records_and_feedback(app_ids=[])[0] # pass an empty list of app_ids to get all - diff --git a/trulens_eval/generated_files/all_tools.ipynb b/trulens_eval/generated_files/all_tools.ipynb deleted file mode 100644 index c68bd16f2..000000000 --- a/trulens_eval/generated_files/all_tools.ipynb +++ /dev/null @@ -1,2070 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ๐ Langchain Quickstart\n", - "\n", - "In this quickstart you will create a simple LLM Chain and learn how to log it and get feedback on an LLM response.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/langchain_quickstart.ipynb)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup\n", - "### Add API keys\n", - "For this quickstart you will need Open AI and Huggingface keys" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ! pip install trulens_eval==0.23.0 openai==1.3.7 langchain chromadb langchainhub bs4" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import from LangChain and TruLens" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Imports main tools:\n", - "from trulens_eval import TruChain, Feedback, Tru\n", - "tru = Tru()\n", - "tru.reset_database()\n", - "\n", - "# Imports from langchain to build app\n", - "import bs4\n", - "from langchain import hub\n", - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.document_loaders import WebBaseLoader\n", - "from langchain.embeddings import OpenAIEmbeddings\n", - "from langchain.schema import StrOutputParser\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "from langchain.vectorstores import Chroma\n", - "from langchain_core.runnables import RunnablePassthrough" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load documents" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "loader = WebBaseLoader(\n", - " web_paths=(\"https://lilianweng.github.io/posts/2023-06-23-agent/\",),\n", - " bs_kwargs=dict(\n", - " parse_only=bs4.SoupStrainer(\n", - " class_=(\"post-content\", \"post-title\", \"post-header\")\n", - " )\n", - " ),\n", - ")\n", - "docs = loader.load()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Vector Store" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "text_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size=1000,\n", - " chunk_overlap=200\n", - ")\n", - "\n", - "splits = text_splitter.split_documents(docs)\n", - "\n", - "vectorstore = Chroma.from_documents(\n", - " documents=splits,\n", - " embedding=OpenAIEmbeddings()\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create RAG" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "retriever = vectorstore.as_retriever()\n", - "\n", - "prompt = hub.pull(\"rlm/rag-prompt\")\n", - "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n", - "\n", - "def format_docs(docs):\n", - " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", - "\n", - "rag_chain = (\n", - " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", - " | prompt\n", - " | llm\n", - " | StrOutputParser()\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Send your first request" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rag_chain.invoke(\"What is Task Decomposition?\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize Feedback Function(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval.feedback.provider import OpenAI\n", - "import numpy as np\n", - "\n", - "# Initialize provider class\n", - "openai = OpenAI()\n", - "\n", - "# select context to be used in feedback. the location of context is app specific.\n", - "from trulens_eval.app import App\n", - "context = App.select_context(rag_chain)\n", - "\n", - "from trulens_eval.feedback import Groundedness\n", - "grounded = Groundedness(groundedness_provider=OpenAI())\n", - "# Define a groundedness feedback function\n", - "f_groundedness = (\n", - " Feedback(grounded.groundedness_measure_with_cot_reasons)\n", - " .on(context.collect()) # collect context chunks into a list\n", - " .on_output()\n", - " .aggregate(grounded.grounded_statements_aggregator)\n", - ")\n", - "\n", - "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = Feedback(openai.relevance).on_input_output()\n", - "# Question/statement relevance between question and each context chunk.\n", - "f_context_relevance = (\n", - " Feedback(openai.qs_relevance)\n", - " .on_input()\n", - " .on(context)\n", - " .aggregate(np.mean)\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instrument chain for logging with TruLens" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru_recorder = TruChain(rag_chain,\n", - " app_id='Chain1_ChatApplication',\n", - " feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with tru_recorder as recording:\n", - " llm_response = rag_chain.invoke(\"What is Task Decomposition?\")\n", - "\n", - "display(llm_response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Retrieve records and feedback" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The record of the app invocation can be retrieved from the `recording`:\n", - "\n", - "rec = recording.get() # use .get if only one record\n", - "# recs = recording.records # use .records if multiple\n", - "\n", - "display(rec)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The results of the feedback functions can be rertireved from\n", - "# `Record.feedback_results` or using the `wait_for_feedback_result` method. 
The\n", - "# results if retrieved directly are `Future` instances (see\n", - "# `concurrent.futures`). You can use `as_completed` to wait until they have\n", - "# finished evaluating or use the utility method:\n", - "\n", - "for feedback, feedback_result in rec.wait_for_feedback_results().items():\n", - " print(feedback.name, feedback_result.result)\n", - "\n", - "# See more about wait_for_feedback_results:\n", - "# help(rec.wait_for_feedback_results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "records, feedback = tru.get_records_and_feedback(app_ids=[\"Chain1_ChatApplication\"])\n", - "\n", - "records.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.get_leaderboard(app_ids=[\"Chain1_ChatApplication\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Explore in a Dashboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.run_dashboard() # open a local streamlit app to explore\n", - "\n", - "# tru.stop_dashboard() # stop if needed" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: Feedback functions evaluated in the deferred manner can be seen in the \"Progress\" page of the TruLens dashboard." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ๐ Llama-Index Quickstart\n", - "\n", - "In this quickstart you will create a simple Llama Index app and learn how to log it and get feedback on an LLM response.\n", - "\n", - "For evaluation, we will leverage the \"hallucination triad\" of groundedness, context relevance and answer relevance.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup\n", - "\n", - "### Install dependencies\n", - "Let's install some of the dependencies for this notebook if we don't have them already" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# pip install trulens_eval==0.24.0 llama_index" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Add API keys\n", - "For this quickstart, you will need Open AI and Huggingface keys. The OpenAI key is used for embeddings and GPT, and the Huggingface key is used for evaluation." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import from TruLens" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval import Tru\n", - "tru = Tru()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Download data\n", - "\n", - "This example uses the text of Paul Grahamโs essay, [โWhat I Worked Onโ](https://paulgraham.com/worked.html), and is the canonical llama-index example.\n", - "\n", - "The easiest way to get it is to [download it via this link](https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt) and save it in a folder called data. You can do so with the following command:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt -P data/" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Simple LLM Application\n", - "\n", - "This example uses LlamaIndex which internally uses an OpenAI LLM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", - "\n", - "documents = SimpleDirectoryReader(\"data\").load_data()\n", - "index = VectorStoreIndex.from_documents(documents)\n", - "\n", - "query_engine = index.as_query_engine()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Send your first request" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "response = query_engine.query(\"What did the author do growing up?\")\n", - "print(response)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize Feedback Function(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "# Initialize provider class\n", - "from trulens_eval.feedback.provider.openai import OpenAI\n", - "openai = OpenAI()\n", - "\n", - "# select context to be used in feedback. 
the location of context is app specific.\n", - "from trulens_eval.app import App\n", - "context = App.select_context(query_engine)\n", - "\n", - "# imports for feedback\n", - "from trulens_eval import Feedback\n", - "\n", - "# Define a groundedness feedback function\n", - "from trulens_eval.feedback import Groundedness\n", - "grounded = Groundedness(groundedness_provider=OpenAI())\n", - "f_groundedness = (\n", - " Feedback(grounded.groundedness_measure_with_cot_reasons)\n", - " .on(context.collect()) # collect context chunks into a list\n", - " .on_output()\n", - " .aggregate(grounded.grounded_statements_aggregator)\n", - ")\n", - "\n", - "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = Feedback(openai.relevance).on_input_output()\n", - "\n", - "# Question/statement relevance between question and each context chunk.\n", - "f_qs_relevance = (\n", - " Feedback(openai.qs_relevance)\n", - " .on_input()\n", - " .on(context)\n", - " .aggregate(np.mean)\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instrument app for logging with TruLens" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval import TruLlama\n", - "tru_query_engine_recorder = TruLlama(query_engine,\n", - " app_id='LlamaIndex_App1',\n", - " feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# or as context manager\n", - "with tru_query_engine_recorder as recording:\n", - " query_engine.query(\"What did the author do growing up?\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Retrieve records and feedback" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The record of the app invocation can be retrieved from the `recording`:\n", - "\n", - "rec = recording.get() # use .get if only one record\n", - "# recs = recording.records # use .records if multiple\n", - "\n", - "display(rec)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.run_dashboard()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The results of the feedback functions can be rertireved from\n", - "# `Record.feedback_results` or using the `wait_for_feedback_result` method. The\n", - "# results if retrieved directly are `Future` instances (see\n", - "# `concurrent.futures`). 
You can use `as_completed` to wait until they have\n", - "# finished evaluating or use the utility method:\n", - "\n", - "for feedback, feedback_result in rec.wait_for_feedback_results().items():\n", - " print(feedback.name, feedback_result.result)\n", - "\n", - "# See more about wait_for_feedback_results:\n", - "# help(rec.wait_for_feedback_results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "records, feedback = tru.get_records_and_feedback(app_ids=[\"LlamaIndex_App1\"])\n", - "\n", - "records.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.get_leaderboard(app_ids=[\"LlamaIndex_App1\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Explore in a Dashboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.run_dashboard() # open a local streamlit app to explore\n", - "\n", - "# tru.stop_dashboard() # stop if needed" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ๐ TruLens Quickstart\n", - "\n", - "In this quickstart you will create a RAG from scratch and learn how to log it and get feedback on an LLM response.\n", - "\n", - "For evaluation, we will leverage the \"hallucination triad\" of groundedness, context relevance and answer relevance.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/quickstart.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ! pip install trulens_eval==0.23.0 chromadb==0.4.18 openai==1.3.7" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get Data\n", - "\n", - "In this case, we'll just initialize some simple text in the notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "university_info = \"\"\"\n", - "The University of Washington, founded in 1861 in Seattle, is a public research university\n", - "with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell.\n", - "As the flagship institution of the six public universities in Washington state,\n", - "UW encompasses over 500 buildings and 20 million square feet of space,\n", - "including one of the largest library systems in the world.\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Vector Store\n", - "\n", - "Create a chromadb vector store in memory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "oai_client = OpenAI()\n", - "\n", - "oai_client.embeddings.create(\n", - " model=\"text-embedding-ada-002\",\n", - " input=university_info\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import chromadb\n", - "from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction\n", - "\n", - "embedding_function = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'),\n", - " model_name=\"text-embedding-ada-002\")\n", - "\n", - "\n", - "chroma_client = chromadb.Client()\n", - "vector_store = chroma_client.get_or_create_collection(name=\"Universities\",\n", - " embedding_function=embedding_function)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Add the university_info to the embedding database." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "vector_store.add(\"uni_info\", documents=university_info)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build RAG from scratch\n", - "\n", - "Build a custom RAG from scratch, and add TruLens custom instrumentation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval import Tru\n", - "from trulens_eval.tru_custom_app import instrument\n", - "tru = Tru()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class RAG_from_scratch:\n", - " @instrument\n", - " def retrieve(self, query: str) -> list:\n", - " \"\"\"\n", - " Retrieve relevant text from vector store.\n", - " \"\"\"\n", - " results = vector_store.query(\n", - " query_texts=query,\n", - " n_results=2\n", - " )\n", - " return results['documents'][0]\n", - "\n", - " @instrument\n", - " def generate_completion(self, query: str, context_str: list) -> str:\n", - " \"\"\"\n", - " Generate answer from context.\n", - " \"\"\"\n", - " completion = oai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " temperature=0,\n", - " messages=\n", - " [\n", - " {\"role\": \"user\",\n", - " \"content\": \n", - " f\"We have provided context information below. \\n\"\n", - " f\"---------------------\\n\"\n", - " f\"{context_str}\"\n", - " f\"\\n---------------------\\n\"\n", - " f\"Given this information, please answer the question: {query}\"\n", - " }\n", - " ]\n", - " ).choices[0].message.content\n", - " return completion\n", - "\n", - " @instrument\n", - " def query(self, query: str) -> str:\n", - " context_str = self.retrieve(query)\n", - " completion = self.generate_completion(query, context_str)\n", - " return completion\n", - "\n", - "rag = RAG_from_scratch()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set up feedback functions.\n", - "\n", - "Here we'll use groundedness, answer relevance and context relevance to detect hallucination." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval import Feedback, Select\n", - "from trulens_eval.feedback import Groundedness\n", - "from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI\n", - "\n", - "import numpy as np\n", - "\n", - "# Initialize provider class\n", - "fopenai = fOpenAI()\n", - "\n", - "grounded = Groundedness(groundedness_provider=fopenai)\n", - "\n", - "# Define a groundedness feedback function\n", - "f_groundedness = (\n", - " Feedback(grounded.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n", - " .on(Select.RecordCalls.retrieve.rets.collect())\n", - " .on_output()\n", - " .aggregate(grounded.grounded_statements_aggregator)\n", - ")\n", - "\n", - "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = (\n", - " Feedback(fopenai.relevance_with_cot_reasons, name = \"Answer Relevance\")\n", - " .on(Select.RecordCalls.retrieve.args.query)\n", - " .on_output()\n", - ")\n", - "\n", - "# Question/statement relevance between question and each context chunk.\n", - "f_context_relevance = (\n", - " Feedback(fopenai.qs_relevance_with_cot_reasons, name = \"Context Relevance\")\n", - " .on(Select.RecordCalls.retrieve.args.query)\n", - " .on(Select.RecordCalls.retrieve.rets.collect())\n", - " .aggregate(np.mean)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Construct the app\n", - "Wrap the custom RAG with TruCustomApp, add list of feedbacks for eval" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval import TruCustomApp\n", - "tru_rag = TruCustomApp(rag,\n", - " app_id = 'RAG v1',\n", - " feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run the app\n", - "Use `tru_rag` as a context manager for the custom RAG-from-scratch app." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with tru_rag as recording:\n", - " rag.query(\"When was the University of Washington founded?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.get_leaderboard(app_ids=[\"RAG v1\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.run_dashboard()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prototype Evals\n", - "This notebook shows the use of the dummy feedback function provider which\n", - "behaves like the huggingface provider except it does not actually perform any\n", - "network calls and just produces constant results. It can be used to prototype\n", - "feedback function wiring for your apps before invoking potentially slow (to\n", - "run/to load) feedback functions.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/prototype_evals.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ! 
pip install trulens_eval==0.23.0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval import Feedback\n", - "from trulens_eval import Tru\n", - "\n", - "tru = Tru()\n", - "\n", - "tru.run_dashboard()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set keys" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build the app" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "oai_client = OpenAI()\n", - "\n", - "from trulens_eval.tru_custom_app import instrument\n", - "\n", - "class APP:\n", - " @instrument\n", - " def completion(self, prompt):\n", - " completion = oai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " temperature=0,\n", - " messages=\n", - " [\n", - " {\"role\": \"user\",\n", - " \"content\": \n", - " f\"Please answer the question: {prompt}\"\n", - " }\n", - " ]\n", - " ).choices[0].message.content\n", - " return completion\n", - " \n", - "llm_app = APP()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create dummy feedback\n", - "\n", - "By setting the provider as `Dummy()`, you can erect your evaluation suite and then easily substitute in a real model provider (e.g. OpenAI) later." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval.feedback.provider.hugs import Dummy\n", - "\n", - "# hugs = Huggingface()\n", - "hugs = Dummy()\n", - "\n", - "f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create the app" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add trulens as a context manager for llm_app with dummy feedback\n", - "from trulens_eval import TruCustomApp\n", - "tru_app = TruCustomApp(llm_app,\n", - " app_id = 'LLM App v1',\n", - " feedbacks = [f_positive_sentiment])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run the app" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with tru_app as recording:\n", - " llm_app.completion('give me a good name for a colorful sock company')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.get_leaderboard(app_ids=[tru_app.app_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ๐ Logging Human Feedback\n", - "\n", - "In many situations, it can be useful to log human feedback from your users about your LLM app's performance. Combining human feedback along with automated feedback can help you drill down on subsets of your app that underperform, and uncover new failure modes. 
This example will walk you through a simple example of recording human feedback with TruLens.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/human_feedback.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ! pip install trulens_eval==0.23.0 openai==1.3.7" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from trulens_eval import Tru\n", - "from trulens_eval import TruCustomApp\n", - "\n", - "tru = Tru()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set Keys\n", - "\n", - "For this example, you need an OpenAI key." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set up your app\n", - "\n", - "Here we set up a custom application using just an OpenAI chat completion. The process for logging human feedback is the same however you choose to set up your app." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "oai_client = OpenAI()\n", - "\n", - "from trulens_eval.tru_custom_app import instrument\n", - "\n", - "class APP:\n", - " @instrument\n", - " def completion(self, prompt):\n", - " completion = oai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " temperature=0,\n", - " messages=\n", - " [\n", - " {\"role\": \"user\",\n", - " \"content\": \n", - " f\"Please answer the question: {prompt}\"\n", - " }\n", - " ]\n", - " ).choices[0].message.content\n", - " return completion\n", - " \n", - "llm_app = APP()\n", - "\n", - "# add trulens as a context manager for llm_app\n", - "tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run the app" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with tru_app as recording:\n", - " llm_app.completion(\"Give me 10 names for a colorful sock company\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get the record to add the feedback to.\n", - "record = recording.get()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create a mechamism for recording human feedback.\n", - "\n", - "Be sure to click an emoji in the record to record `human_feedback` to log." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from ipywidgets import Button, HBox, VBox\n", - "\n", - "thumbs_up_button = Button(description='๐')\n", - "thumbs_down_button = Button(description='๐')\n", - "\n", - "human_feedback = None\n", - "\n", - "def on_thumbs_up_button_clicked(b):\n", - " global human_feedback\n", - " human_feedback = 1\n", - "\n", - "def on_thumbs_down_button_clicked(b):\n", - " global human_feedback\n", - " human_feedback = 0\n", - "\n", - "thumbs_up_button.on_click(on_thumbs_up_button_clicked)\n", - "thumbs_down_button.on_click(on_thumbs_down_button_clicked)\n", - "\n", - "HBox([thumbs_up_button, thumbs_down_button])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add the human feedback to a particular app and record\n", - "tru.add_feedback(\n", - " name=\"Human Feedack\",\n", - " record_id=record.record_id,\n", - " app_id=tru_app.app_id,\n", - " result=human_feedback\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## See the result logged with your app." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tru.get_leaderboard(app_ids=[tru_app.app_id])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ๐ Ground Truth Evaluations\n", - "\n", - "In this quickstart you will create a evaluate a LangChain app using ground truth. Ground truth evaluation can be especially useful during early LLM experiments when you have a small set of example queries that are critical to get right.\n", - "\n", - "Ground truth evaluation works by comparing the similarity of an LLM response compared to its matching verified response.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/groundtruth_evals.ipynb)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Add API keys\n", - "For this quickstart, you will need Open AI keys." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ! 
pip install trulens_eval==0.23.0 openai==1.3.7" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from trulens_eval import Tru\n", - "\n", - "tru = Tru()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Simple LLM Application" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "oai_client = OpenAI()\n", - "\n", - "from trulens_eval.tru_custom_app import instrument\n", - "\n", - "class APP:\n", - " @instrument\n", - " def completion(self, prompt):\n", - " completion = oai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " temperature=0,\n", - " messages=\n", - " [\n", - " {\"role\": \"user\",\n", - " \"content\": \n", - " f\"Please answer the question: {prompt}\"\n", - " }\n", - " ]\n", - " ).choices[0].message.content\n", - " return completion\n", - " \n", - "llm_app = APP()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize Feedback Function(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โ In Ground Truth, input prompt will be set to __record__.main_input or `Select.RecordInput` .\n", - "โ In Ground Truth, input response will be set to __record__.main_output or `Select.RecordOutput` .\n" - ] - } - ], - "source": [ - "from trulens_eval import Feedback\n", - "from trulens_eval.feedback import GroundTruthAgreement\n", - "\n", - "golden_set = [\n", - " {\"query\": \"who invented the lightbulb?\", \"response\": \"Thomas Edison\"},\n", - " {\"query\": \"ยฟquien invento la bombilla?\", \"response\": \"Thomas Edison\"}\n", - "]\n", - "\n", - "f_groundtruth = Feedback(GroundTruthAgreement(golden_set).agreement_measure, name = \"Ground Truth\").on_input_output()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instrument chain for logging with TruLens" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# add trulens as a context manager for llm_app\n", - "from trulens_eval import TruCustomApp\n", - "tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1', feedbacks = [f_groundtruth])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# Instrumented query engine can operate as a context manager:\n", - "with tru_app as recording:\n", - " llm_app.completion(\"ยฟquien invento la bombilla?\")\n", - " llm_app.completion(\"who invented the lightbulb?\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## See results" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | Ground Truth | \n", - "positive_sentiment | \n", - "Human Feedack | \n", - "latency | \n", - "total_cost | \n", - "
---|---|---|---|---|---|
app_id | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
LLM App v1 | \n", - "1.0 | \n", - "0.38994 | \n", - "1.0 | \n", - "1.75 | \n", - "0.000076 | \n", - "