Skip to content

Commit 1cda1c7

Browse files
authored
Bug fixes and improvements (#89)
## What — Improvements and bug fixes for metrics. ## Why — These bugs appeared to cause drift between results from metrics assessments and the ragas metrics. Improvements include: (1) a fixed temperature for self-check, (2) improved prompts for context_relevancy, and (3) post-processing changes for answer_relevancy.
1 parent 03081f7 commit 1cda1c7

File tree

4 files changed

+25
-10
lines changed

4 files changed

+25
-10
lines changed

src/ragas/metrics/answer_relevance.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ class AnswerRelevancy(MetricWithLLM):
5252
batch_size: int = 15
5353
strictness: int = 3
5454

55+
def __post_init__(self: t.Self):
56+
self.temperature = 0.2 if self.strictness > 0 else 0
57+
5558
def init_model(self: t.Self):
5659
self.embedding = OpenAIEmbeddings() # type: ignore
5760

@@ -80,14 +83,18 @@ def _score_batch(
8083
prompts.append(ChatPromptTemplate.from_messages([human_prompt]))
8184

8285
results = generate(
83-
prompts, self.llm, n=self.strictness, callbacks=batch_group
86+
prompts,
87+
self.llm,
88+
n=self.strictness,
89+
temperature=self.temperature,
90+
callbacks=batch_group,
8491
)
8592
results = [[i.text for i in r] for r in results.generations]
8693

8794
scores = []
8895
for question, gen_questions in zip(questions, results):
8996
cosine_sim = self.calculate_similarity(question, gen_questions)
90-
scores.append(cosine_sim.max())
97+
scores.append(cosine_sim.mean())
9198

9299
return scores
93100

src/ragas/metrics/context_relevance.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,12 @@
2727
2828
question: Were Scott Derrickson and Ed Wood of the same nationality?
2929
context :\nScott Derrickson (born July 16, 1966) is an American director, screenwriter and producer He lives in Los Angeles, California He is best known for directing horror films such as "Sinister", "The Exorcism of Emily Rose", and "Deliver Us From Evil", as well as the 2016 Marvel Cinematic Universe installment, "Doctor Strange"Tyler Bates is an American musician, music producer, and composer for films, television, and video games. Adam Collis is an American filmmaker and actor.Conrad Brooks is an American actor.Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.
30-
Now given a question and context, extract the minimum number of sentences from the given context required to answer the question completely.
3130
sentences:Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer. Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.
3231
32+
question: How many were killed in the Tiananmen Square incident?
33+
context:\nTiananmen Square incident, also called June Fourth incident or 6/4, series of protests and demonstrations in China in the spring of 1989 that culminated on the night of June 3–4 with a government crackdown on the demonstrators in Tiananmen Square in Beijing.
34+
sentences: No candidate sentences found.
35+
3336
question:{question}
3437
context:\n{context}
3538
sentences:""" # noqa: E501
@@ -117,6 +120,7 @@ def __post_init__(self: t.Self):
117120
raise ValueError(
118121
"model_name must be provided when agreement_metric is bert_score"
119122
)
123+
self.temperature = 0.2 if self.strictness > 0 else 0
120124

121125
def init_model(self: t.Self):
122126
self.sent_agreement = SentenceAgreement(
@@ -164,7 +168,11 @@ def _score_batch(
164168

165169
responses: list[list[str]] = []
166170
results = generate(
167-
prompts, self.llm, n=self.strictness, callbacks=batch_group
171+
prompts,
172+
self.llm,
173+
n=self.strictness,
174+
temperature=self.temperature,
175+
callbacks=batch_group,
168176
)
169177
responses = [[i.text for i in r] for r in results.generations]
170178

@@ -174,11 +182,7 @@ def _score_batch(
174182
overlap_scores = []
175183
context_sents = sent_tokenize(context)
176184
for output in n_response:
177-
indices = [
178-
context.find(sent)
179-
for sent in sent_tokenize(output)
180-
if context.find(sent) != -1
181-
]
185+
indices = sent_tokenize(output)
182186
overlap_scores.append(len(indices) / len(context_sents))
183187
if self.strictness > 1:
184188
agr_score = self.sent_agreement.evaluate(n_response)

src/ragas/metrics/faithfulnes.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,9 @@ def _score_batch(
135135
)
136136
score = score / len(list_statements[i])
137137
else:
138-
score = max(0, output.count("verdict: no")) / len(list_statements[i])
138+
score = max(0, output.count("verdict: no")) / len(
139+
list_statements[i]
140+
)
139141

140142
scores.append(1 - score)
141143

src/ragas/metrics/llms.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@ def generate(
2121
prompts: list[ChatPromptTemplate],
2222
llm: BaseLLM | BaseChatModel,
2323
n: t.Optional[int] = None,
24+
temperature: float = 0,
2425
callbacks: t.Optional[Callbacks] = None,
2526
) -> LLMResult:
2627
old_n = None
2728
n_swapped = False
29+
llm.temperature = temperature
2830
if n is not None:
2931
if isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI):
3032
old_n = llm.n

0 commit comments

Comments
 (0)