Skip to content

Commit 1cda1c7

Browse files
authored
Bug fixes and improvements (#89)
## What — Improvements and bug fixes for metrics. ## Why — These bugs appeared to cause drift between results from metrics assessments and the ragas metrics. Improvements include: (1) a fixed temperature for self-check, (2) improved prompts for context_relevancy, and (3) post-processing changes for answer_relevancy.
1 parent 03081f7 commit 1cda1c7

File tree

4 files changed

+25
-10
lines changed

4 files changed

+25
-10
lines changed

src/ragas/metrics/answer_relevance.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ class AnswerRelevancy(MetricWithLLM):
5252
batch_size: int = 15
5353
strictness: int = 3
5454

55+
def __post_init__(self: t.Self):
56+
self.temperature = 0.2 if self.strictness > 0 else 0
57+
5558
def init_model(self: t.Self):
5659
self.embedding = OpenAIEmbeddings() # type: ignore
5760

@@ -80,14 +83,18 @@ def _score_batch(
8083
prompts.append(ChatPromptTemplate.from_messages([human_prompt]))
8184

8285
results = generate(
83-
prompts, self.llm, n=self.strictness, callbacks=batch_group
86+
prompts,
87+
self.llm,
88+
n=self.strictness,
89+
temperature=self.temperature,
90+
callbacks=batch_group,
8491
)
8592
results = [[i.text for i in r] for r in results.generations]
8693

8794
scores = []
8895
for question, gen_questions in zip(questions, results):
8996
cosine_sim = self.calculate_similarity(question, gen_questions)
90-
scores.append(cosine_sim.max())
97+
scores.append(cosine_sim.mean())
9198

9299
return scores
93100

src/ragas/metrics/context_relevance.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,12 @@
2727
2828
question: Were Scott Derrickson and Ed Wood of the same nationality?
2929
context :\nScott Derrickson (born July 16, 1966) is an American director, screenwriter and producer He lives in Los Angeles, California He is best known for directing horror films such as "Sinister", "The Exorcism of Emily Rose", and "Deliver Us From Evil", as well as the 2016 Marvel Cinematic Universe installment, "Doctor Strange"Tyler Bates is an American musician, music producer, and composer for films, television, and video games. Adam Collis is an American filmmaker and actor.Conrad Brooks is an American actor.Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.
30-
Now given a question and context, extract the minimum number of sentences from the given context required to answer the question completely.
3130
sentences:Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer. Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.
3231
32+
question: How many were killed in the Tiananmen Square incident?
33+
context:\nTiananmen Square incident, also called June Fourth incident or 6/4, series of protests and demonstrations in China in the spring of 1989 that culminated on the night of June 3–4 with a government crackdown on the demonstrators in Tiananmen Square in Beijing.
34+
sentences: No candidate sentences found.
35+
3336
question:{question}
3437
context:\n{context}
3538
sentences:""" # noqa: E501
@@ -117,6 +120,7 @@ def __post_init__(self: t.Self):
117120
raise ValueError(
118121
"model_name must be provided when agreement_metric is bert_score"
119122
)
123+
self.temperature = 0.2 if self.strictness > 0 else 0
120124

121125
def init_model(self: t.Self):
122126
self.sent_agreement = SentenceAgreement(
@@ -164,7 +168,11 @@ def _score_batch(
164168

165169
responses: list[list[str]] = []
166170
results = generate(
167-
prompts, self.llm, n=self.strictness, callbacks=batch_group
171+
prompts,
172+
self.llm,
173+
n=self.strictness,
174+
temperature=self.temperature,
175+
callbacks=batch_group,
168176
)
169177
responses = [[i.text for i in r] for r in results.generations]
170178

@@ -174,11 +182,7 @@ def _score_batch(
174182
overlap_scores = []
175183
context_sents = sent_tokenize(context)
176184
for output in n_response:
177-
indices = [
178-
context.find(sent)
179-
for sent in sent_tokenize(output)
180-
if context.find(sent) != -1
181-
]
185+
indices = sent_tokenize(output)
182186
overlap_scores.append(len(indices) / len(context_sents))
183187
if self.strictness > 1:
184188
agr_score = self.sent_agreement.evaluate(n_response)

src/ragas/metrics/faithfulnes.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,9 @@ def _score_batch(
135135
)
136136
score = score / len(list_statements[i])
137137
else:
138-
score = max(0, output.count("verdict: no")) / len(list_statements[i])
138+
score = max(0, output.count("verdict: no")) / len(
139+
list_statements[i]
140+
)
139141

140142
scores.append(1 - score)
141143

src/ragas/metrics/llms.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@ def generate(
2121
prompts: list[ChatPromptTemplate],
2222
llm: BaseLLM | BaseChatModel,
2323
n: t.Optional[int] = None,
24+
temperature: float = 0,
2425
callbacks: t.Optional[Callbacks] = None,
2526
) -> LLMResult:
2627
old_n = None
2728
n_swapped = False
29+
llm.temperature = temperature
2830
if n is not None:
2931
if isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI):
3032
old_n = llm.n

0 commit comments

Comments
 (0)