Skip to content

Commit

Permalink
[TLM] Add similarity measure option (#351)
Browse files Browse the repository at this point in the history
  • Loading branch information
huiwengoh authored Jan 27, 2025
1 parent 5477a71 commit 9446669
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 1 deletion.
1 change: 1 addition & 0 deletions cleanlab_studio/internal/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
}
TLM_NUM_CANDIDATE_RESPONSES_RANGE: Tuple[int, int] = (1, 20) # (min, max)
TLM_NUM_CONSISTENCY_SAMPLES_RANGE: Tuple[int, int] = (0, 20) # (min, max)
TLM_SIMILARITY_MEASURES: Set[str] = {"semantic", "string"}
TLM_REASONING_EFFORT_VALUES: Set[str] = {"none", "low", "medium", "high"}
TLM_VALID_LOG_OPTIONS: Set[str] = {"perplexity", "explanation"}
TLM_VALID_GET_TRUSTWORTHINESS_SCORE_KWARGS: Set[str] = {"perplexity"}
Expand Down
7 changes: 7 additions & 0 deletions cleanlab_studio/internal/tlm/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
_VALID_TLM_MODELS,
TLM_NUM_CANDIDATE_RESPONSES_RANGE,
TLM_NUM_CONSISTENCY_SAMPLES_RANGE,
TLM_SIMILARITY_MEASURES,
TLM_REASONING_EFFORT_VALUES,
TLM_VALID_GET_TRUSTWORTHINESS_SCORE_KWARGS,
TLM_VALID_KWARGS,
Expand Down Expand Up @@ -232,6 +233,12 @@ def validate_tlm_options(options: Any) -> None:
f"Invalid type {type(val)}, use_self_reflection must be a boolean"
)

elif option == "similarity_measure":
if val not in TLM_SIMILARITY_MEASURES:
raise ValidationError(
f"Invalid value for similarity_measure: {val}, valid measures include: {TLM_SIMILARITY_MEASURES}"
)

elif option == "reasoning_effort":
if val not in TLM_REASONING_EFFORT_VALUES:
raise ValidationError(
Expand Down
5 changes: 5 additions & 0 deletions cleanlab_studio/studio/trustworthy_language_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,10 @@ class TLMOptions(TypedDict):
and helping catch answers that are obviously incorrect/bad for a prompt asking for a well-defined answer that LLMs should be able to handle.
Setting this to False disables the use of self-reflection and may produce worse TLM trustworthiness scores, but will reduce costs/runtimes.
similarity_measure (str, default = "semantic"): Controls how the trustworthiness scoring algorithm measures similarity between possible
responses/outputs considered by the model. Set this to "string" to get faster results.
Supported measures include "semantic" and "string".
reasoning_effort (str, default = "high"): Controls how much the LLM reasons when considering alternative possible responses and double-checking responses.
Higher efforts here produce better TLM trustworthiness scores, but at higher costs/runtimes; reduce this value to get faster results.
Supported efforts include "none", "low", "medium", "high".
Expand All @@ -841,6 +845,7 @@ class TLMOptions(TypedDict):
num_candidate_responses: NotRequired[int]
num_consistency_samples: NotRequired[int]
use_self_reflection: NotRequired[bool]
similarity_measure: NotRequired[str]
reasoning_effort: NotRequired[str]
log: NotRequired[List[str]]
custom_eval_criteria: NotRequired[List[Dict[str, Any]]]
Expand Down
2 changes: 1 addition & 1 deletion cleanlab_studio/version.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Note to developers:
# Consider if backend's MIN_CLI_VERSION needs updating when pushing any changes to this file.

__version__ = "2.5.15"
__version__ = "2.5.16"

SCHEMA_VERSION = "0.2.0"
MIN_SCHEMA_VERSION = "0.1.0"
Expand Down
4 changes: 4 additions & 0 deletions tests/tlm/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
_TLM_MAX_TOKEN_RANGE,
_VALID_TLM_MODELS,
_VALID_TLM_QUALITY_PRESETS,
TLM_SIMILARITY_MEASURES,
TLM_REASONING_EFFORT_VALUES,
)
from cleanlab_studio.internal.tlm.concurrency import TlmRateHandler
Expand Down Expand Up @@ -86,6 +87,7 @@ def _get_options_dictionary(model: Optional[str]) -> dict:
add_num_candidate_responses = np.random.choice([True, False])
add_num_consistency_samples = np.random.choice([True, False])
add_use_self_reflection = np.random.choice([True, False])
add_similarity_measure = np.random.choice([True, False])
add_reasoning_effort = np.random.choice([True, False])
add_log_explanation = np.random.choice([True, False])
add_log_perplexity_score = np.random.choice([True, False])
Expand All @@ -101,6 +103,8 @@ def _get_options_dictionary(model: Optional[str]) -> dict:
options["num_candidate_responses"] = int(np.random.randint(1, 5))
if add_num_consistency_samples:
options["num_consistency_samples"] = int(np.random.randint(0, 10))
if add_similarity_measure:
options["similarity_measure"] = random.choice(list(TLM_SIMILARITY_MEASURES))
if add_reasoning_effort:
options["reasoning_effort"] = random.choice(list(TLM_REASONING_EFFORT_VALUES))

Expand Down

0 comments on commit 9446669

Please sign in to comment.