Skip to content
13 changes: 13 additions & 0 deletions pyrit/score/scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)

from pyrit.exceptions import (
BadRequestException,
InvalidJsonException,
PyritException,
pyrit_json_retry,
Expand Down Expand Up @@ -729,6 +730,7 @@ async def _score_value_with_llm_async(
score_value still needs to be normalized and validated.

Raises:
BadRequestException: If the scorer's LLM response is blocked by content filtering.
ValueError: If required keys are missing from the response or if the response format is invalid.
InvalidJsonException: If the response is not valid JSON.
Exception: For other unexpected errors during scoring.
Expand Down Expand Up @@ -781,6 +783,17 @@ async def _score_value_with_llm_async(

response_json: str = ""
try:
# A content-filter block yields a single error piece with no parseable text piece,
# so raise a clear error here instead of failing on the missing text piece below.
if all(piece.is_blocked() for piece in response[0].message_pieces):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When a piece is blocked, every piece is marked blocked. The piece's converted_value has the error JSON. There is partial content in the metadata, but no non-blocked text piece.

So this comment should be updated, because this code path will never get partial content.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, you're right. A content-filter block always comes back as a single fully-blocked error piece (partial content lives in prompt_metadata, not in a separate text piece), so the old comment's reasoning about a salvageable text piece was wrong. Fixed the comment in 0aa44b9 to just say that a block yields a single error piece with no parseable text. The all() check is functionally equivalent to checking the first piece here; I kept it as a harmless defensive check rather than changing behavior.

raise BadRequestException(
message=(
f"The scorer's LLM response was blocked by content filtering while scoring "
f"prompt ID: {scored_prompt_id}. Consider using a scorer endpoint with "
f"content filtering disabled for red-teaming workflows."
)
)

# Get the text piece which contains the JSON response containing the score_value and rationale from the LLM
text_piece = next(
piece for piece in response[0].message_pieces if piece.converted_value_data_type == "text"
Expand Down
33 changes: 33 additions & 0 deletions tests/unit/score/test_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1548,6 +1548,39 @@ async def test_score_value_with_llm_skips_reasoning_piece(good_json):
assert result.score_rationale == "Valid response"


async def test_score_value_with_llm_raises_when_scorer_response_blocked():
    """A blocked scorer-LLM response must surface as BadRequestException.

    The mocked target returns one message whose only piece is an error piece
    with ``response_error="blocked"``; the scoring call is expected to raise
    with a message mentioning that it was blocked by content filtering.
    """
    from pyrit.exceptions import BadRequestException

    # The only piece in the reply is an error piece flagged as blocked;
    # there is no text piece for the scorer to parse.
    filtered_piece = MessagePiece(
        role="assistant",
        original_value="",
        original_value_data_type="error",
        converted_value="",
        converted_value_data_type="error",
        conversation_id="test-convo",
        response_error="blocked",
    )

    # Mock target whose async send returns that single blocked message.
    mock_target = MagicMock(PromptTarget)
    mock_target.get_identifier.return_value = get_mock_target_identifier("MockChatTarget")
    mock_target.send_prompt_async = AsyncMock(
        return_value=[Message(message_pieces=[filtered_piece])],
    )

    scorer = MockScorer()
    scoring_kwargs = {
        "prompt_target": mock_target,
        "system_prompt": "system_prompt",
        "message_value": "message_value",
        "message_data_type": "text",
        "scored_prompt_id": "test-prompt-id",
        "category": "category",
        "objective": "task",
    }

    with pytest.raises(BadRequestException, match="blocked by content filtering"):
        await scorer._score_value_with_llm_async(**scoring_kwargs)


# ── Helpers for score_blocked_content tests ──────────────────────────────────


Expand Down
Loading