
Commit 0e93396

docs: updated testset generation (#510)
Co-authored-by: Jithin James <[email protected]>
1 parent fc6ef22 commit 0e93396

File tree

4 files changed (+40 -60 lines)


docs/_static/imgs/question_types.png

-5.43 KB (binary file changed)

docs/concepts/testset_generation.md

+12 -37 lines changed
@@ -57,40 +57,25 @@ Checkout [llama-index](https://gpt-index.readthedocs.io/en/stable/core_modules/d


```{code-block} python
-:caption: Customising test set generation
-from ragas.testset import TestsetGenerator
-from langchain.embeddings import OpenAIEmbeddings
-from langchain.chat_models import ChatOpenAI
-from ragas.llms import LangchainLLM
+:caption: Customising test data distribution
+from ragas.testset.generator import TestsetGenerator
+from ragas.testset.evolutions import simple, reasoning, multi_context

# documents = load your documents

-# Add custom llms and embeddings
-generator_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo"))
-critic_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-4"))
-embeddings_model = OpenAIEmbeddings()
+# generator with openai models
+generator = TestsetGenerator.with_openai()

# Change resulting question type distribution
-testset_distribution = {
-    "simple": 0.25,
-    "reasoning": 0.5,
-    "multi_context": 0.0,
-    "conditional": 0.25,
+distributions = {
+    simple: 0.5,
+    multi_context: 0.4,
+    reasoning: 0.1
}

-# percentage of conversational question
-chat_qa = 0.2
-
-test_generator = TestsetGenerator(
-    generator_llm=generator_llm,
-    critic_llm=critic_llm,
-    embeddings_model=embeddings_model,
-    testset_distribution=testset_distribution,
-    chat_qa=chat_qa,
-)
-
-testset = test_generator.generate(documents, test_size=5)
+# use generator.generate_with_llamaindex_docs if you use llama-index as document loader
+testset = generator.generate_with_langchain_docs(documents, 10, distributions)
+testset.to_pandas()

```

@@ -109,16 +94,6 @@ test_df.head()

Analyze the frequency of different question types in the created dataset

-```{code-block} python
-:caption: bar graph of question types
-import seaborn as sns
-sns.set(rc={'figure.figsize':(9,6)})
-
-test_data_dist = test_df.question_type.value_counts().to_frame().reset_index()
-sns.set_theme(style="whitegrid")
-g = sns.barplot(y='count',x='question_type', data=test_data_dist)
-g.set_title("Question type distribution",fontdict = { 'fontsize': 20})
-```

<p align="left">
<img src="../_static/imgs/question_types.png" alt="test-outputs" width="450" height="400" />
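
Note: the new `distributions` mapping in this doc is just a dict of evolution weights, so skewing generation toward a particular question type only means changing the weights. A minimal sketch under the same assumptions as the snippet above (`OPENAI_API_KEY` set and `documents` loaded with a langchain loader; the weights here are illustrative, not library defaults):

```python
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# documents = load your documents with a langchain document loader

generator = TestsetGenerator.with_openai()

# illustrative weights that favour reasoning-style questions
heavy_reasoning = {
    simple: 0.2,
    reasoning: 0.6,
    multi_context: 0.2,
}

# same call shape as in the docs above: (documents, test_size, distributions)
testset = generator.generate_with_langchain_docs(documents, 10, heavy_reasoning)
testset.to_pandas()
```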

docs/getstarted/testset_generation.md

+16 -19 lines changed
@@ -11,30 +11,23 @@ os.environ["OPENAI_API_KEY"] = "your-openai-key"

## Documents

-To begin, we require a collection of documents to generate synthetic Question/Context/Answer samples. Here, we will employ the llama-index document loaders to retrieve documents.
+To begin, we require a collection of documents to generate synthetic Question/Context/Answer samples. Here, we will employ the langchain document loader to load documents.

```{code-block} python
-:caption: Load documents from Semantic Scholar
-from llama_index import download_loader
-
-SemanticScholarReader = download_loader("SemanticScholarReader")
-loader = SemanticScholarReader()
-# Narrow down the search space
-query_space = "large language models"
-# Increase the limit to obtain more documents
-documents = loader.load_data(query=query_space, limit=10)
+:caption: Load documents from directory
+from langchain.document_loaders import DirectoryLoader
+loader = DirectoryLoader("your-directory")
+documents = loader.load()
```

:::{note}
Each Document object contains a metadata dictionary, which can be used to store additional information about the document and can be accessed with `Document.metadata`. Please ensure that the metadata dictionary contains a key called `file_name`, as this will be used in the generation process. The `file_name` attribute in metadata is used to identify chunks belonging to the same document. For example, pages belonging to the same research publication can be identified using the filename.

-An example of how to do this for `SemanticScholarReader` is shown below.
+An example of how to do this is shown below.

```{code-block} python
-for d in documents:
-    d.metadata["file_name"] = d.metadata["title"]
-
-documents[0].metadata
+for document in documents:
+    document.metadata['file_name'] = document.metadata['source']
```
:::

@@ -46,11 +39,15 @@ We will now import and use Ragas' `Testsetgenerator` to promptly generate a synt

```{code-block} python
:caption: Create 10 samples using default configuration
-from ragas.testset import TestsetGenerator
+from ragas.testset.generator import TestsetGenerator
+from ragas.testset.evolutions import simple, reasoning, multi_context
+
+# generator with openai models
+generator = TestsetGenerator.with_openai()

-testsetgenerator = TestsetGenerator.from_default()
-test_size = 10
-testset = testsetgenerator.generate(documents, test_size=test_size)
+# generate testset
+testset = generator.generate_with_langchain_docs(documents, test_size=10)
+testset.to_pandas()
```

Subsequently, we can export the results into a Pandas DataFrame.
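
Note: read end to end, the updated getting-started page amounts to the flow below. This is a sketch that assumes a valid OpenAI key and that "your-directory" stands in for a real folder of documents; every call used here appears in the diffs above.

```python
import os

from langchain.document_loaders import DirectoryLoader
from ragas.testset.generator import TestsetGenerator

os.environ["OPENAI_API_KEY"] = "your-openai-key"

# load documents with the langchain directory loader
loader = DirectoryLoader("your-directory")
documents = loader.load()

# the generator expects a `file_name` key in each document's metadata;
# langchain loaders provide `source`, which is reused here
for document in documents:
    document.metadata["file_name"] = document.metadata["source"]

# default OpenAI-backed generator and a 10-sample testset
generator = TestsetGenerator.with_openai()
testset = generator.generate_with_langchain_docs(documents, test_size=10)
testset.to_pandas()
```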

src/ragas/testset/generator.py

+12 -4 lines changed
@@ -13,17 +13,25 @@
from ragas.executor import Executor
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
from ragas.testset.docstore import Document, DocumentStore, InMemoryDocumentStore
-from ragas.testset.evolutions import ComplexEvolution, CurrentNodes, DataRow
+from ragas.testset.evolutions import (
+    ComplexEvolution,
+    CurrentNodes,
+    DataRow,
+    multi_context,
+    reasoning,
+    simple,
+)
from ragas.testset.filters import EvolutionFilter, NodeFilter, QuestionFilter

if t.TYPE_CHECKING:
    from llama_index.readers.schema import Document as LlamaindexDocument
    from langchain_core.documents import Document as LCDocument

-Distributions = t.Dict[t.Any, float]
-
logger = logging.getLogger(__name__)

+Distributions = t.Dict[t.Any, float]
+DEFAULT_DISTRIBUTION = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}
+

@dataclass
class TestDataset:

@@ -126,7 +134,7 @@ def generate_with_langchain_docs(
    def generate(
        self,
        test_size: int,
-        distributions: Distributions = {},
+        distributions: Distributions = DEFAULT_DISTRIBUTION,
        with_debugging_logs=False,
    ):
        # init filters and evolutions
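
Note: the practical effect of the new default is that callers omitting `distributions` get the 50/25/25 split instead of an empty mapping. A minimal sketch against the `generate` signature above, assuming the generator's document store has already been populated (for example via one of the `generate_with_*_docs` helpers shown in the docs diffs):

```python
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import multi_context, reasoning, simple

generator = TestsetGenerator.with_openai()
# assumption: documents were already added to the generator's docstore

# no distributions passed: falls back to
# DEFAULT_DISTRIBUTION = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}
testset = generator.generate(test_size=10)

# an explicit mapping still overrides the default
testset = generator.generate(
    test_size=10,
    distributions={reasoning: 0.5, multi_context: 0.5},
)
```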
