Skip to content

Commit

Permalink
[TLM] Add similarity measure option (#351)
Browse files Browse the repository at this point in the history
  • Loading branch information
huiwengoh authored Jan 27, 2025
1 parent 5477a71 commit 9446669
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 1 deletion.
1 change: 1 addition & 0 deletions cleanlab_studio/internal/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
}
TLM_NUM_CANDIDATE_RESPONSES_RANGE: Tuple[int, int] = (1, 20) # (min, max)
TLM_NUM_CONSISTENCY_SAMPLES_RANGE: Tuple[int, int] = (0, 20) # (min, max)
TLM_SIMILARITY_MEASURES: Set[str] = {"semantic", "string"}
TLM_REASONING_EFFORT_VALUES: Set[str] = {"none", "low", "medium", "high"}
TLM_VALID_LOG_OPTIONS: Set[str] = {"perplexity", "explanation"}
TLM_VALID_GET_TRUSTWORTHINESS_SCORE_KWARGS: Set[str] = {"perplexity"}
Expand Down
7 changes: 7 additions & 0 deletions cleanlab_studio/internal/tlm/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
_VALID_TLM_MODELS,
TLM_NUM_CANDIDATE_RESPONSES_RANGE,
TLM_NUM_CONSISTENCY_SAMPLES_RANGE,
TLM_SIMILARITY_MEASURES,
TLM_REASONING_EFFORT_VALUES,
TLM_VALID_GET_TRUSTWORTHINESS_SCORE_KWARGS,
TLM_VALID_KWARGS,
Expand Down Expand Up @@ -232,6 +233,12 @@ def validate_tlm_options(options: Any) -> None:
f"Invalid type {type(val)}, use_self_reflection must be a boolean"
)

elif option == "similarity_measure":
if val not in TLM_SIMILARITY_MEASURES:
raise ValidationError(
f"Invalid value for similarity_measure: {val}, valid measures include: {TLM_SIMILARITY_MEASURES}"
)

elif option == "reasoning_effort":
if val not in TLM_REASONING_EFFORT_VALUES:
raise ValidationError(
Expand Down
5 changes: 5 additions & 0 deletions cleanlab_studio/studio/trustworthy_language_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,10 @@ class TLMOptions(TypedDict):
and helping catch answers that are obviously incorrect/bad for a prompt asking for a well-defined answer that LLMs should be able to handle.
Setting this to False disables the use of self-reflection and may produce worse TLM trustworthiness scores, but will reduce costs/runtimes.
similarity_measure (str, default = "semantic"): Controls how the trustworthiness scoring algorithm measures similarity between possible
responses/outputs considered by the model. Set this to "string" to get faster results.
Supported measures include "semantic" and "string".
reasoning_effort (str, default = "high"): Controls how much the LLM reasons when considering alternative possible responses and double-checking responses.
Higher efforts here produce better TLM trustworthiness scores, but at higher costs/runtimes; reduce this value to get faster results.
Supported efforts include "none", "low", "medium", "high".
Expand All @@ -841,6 +845,7 @@ class TLMOptions(TypedDict):
num_candidate_responses: NotRequired[int]
num_consistency_samples: NotRequired[int]
use_self_reflection: NotRequired[bool]
similarity_measure: NotRequired[str]
reasoning_effort: NotRequired[str]
log: NotRequired[List[str]]
custom_eval_criteria: NotRequired[List[Dict[str, Any]]]
Expand Down
2 changes: 1 addition & 1 deletion cleanlab_studio/version.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Note to developers:
# Consider if backend's MIN_CLI_VERSION needs updating when pushing any changes to this file.

__version__ = "2.5.15"
__version__ = "2.5.16"

SCHEMA_VERSION = "0.2.0"
MIN_SCHEMA_VERSION = "0.1.0"
Expand Down
4 changes: 4 additions & 0 deletions tests/tlm/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
_TLM_MAX_TOKEN_RANGE,
_VALID_TLM_MODELS,
_VALID_TLM_QUALITY_PRESETS,
TLM_SIMILARITY_MEASURES,
TLM_REASONING_EFFORT_VALUES,
)
from cleanlab_studio.internal.tlm.concurrency import TlmRateHandler
Expand Down Expand Up @@ -86,6 +87,7 @@ def _get_options_dictionary(model: Optional[str]) -> dict:
add_num_candidate_responses = np.random.choice([True, False])
add_num_consistency_samples = np.random.choice([True, False])
add_use_self_reflection = np.random.choice([True, False])
add_similarity_measure = np.random.choice([True, False])
add_reasoning_effort = np.random.choice([True, False])
add_log_explanation = np.random.choice([True, False])
add_log_perplexity_score = np.random.choice([True, False])
Expand All @@ -101,6 +103,8 @@ def _get_options_dictionary(model: Optional[str]) -> dict:
options["num_candidate_responses"] = int(np.random.randint(1, 5))
if add_num_consistency_samples:
options["num_consistency_samples"] = int(np.random.randint(0, 10))
if add_similarity_measure:
options["similarity_measure"] = random.choice(list(TLM_SIMILARITY_MEASURES))
if add_reasoning_effort:
options["reasoning_effort"] = random.choice(list(TLM_REASONING_EFFORT_VALUES))

Expand Down

0 comments on commit 9446669

Please sign in to comment.