Skip to content

Commit bc611a8

Browse files
committed
Improve overlap detection
1 parent 695b0d4 commit bc611a8

File tree

1 file changed

+24
-2
lines changed

1 file changed

+24
-2
lines changed

src/guardrails/_base_client.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -316,9 +316,31 @@ def _mask_text(text: str) -> str:
316316
detected_value = decoded_text_for_masking[result.start : result.end]
317317
entity_type = result.entity_type
318318

319-
# Find candidate that contains this PII
319+
# Find candidate that overlaps with this PII
320+
# Use comprehensive overlap logic matching pii.py implementation
320321
for candidate in candidates_for_masking:
321-
if candidate.decoded_text and detected_value.lower() in candidate.decoded_text.lower():
322+
if not candidate.decoded_text:
323+
continue
324+
325+
candidate_lower = candidate.decoded_text.lower()
326+
detected_lower = detected_value.lower()
327+
328+
# Check if candidate's decoded text overlaps with the detection
329+
# Handle partial encodings where encoded span may include extra characters
330+
# e.g., %3A%6a%6f%65%40 → ":joe@" but only "joe@" is in email "joe@example.com"
331+
has_overlap = (
332+
candidate_lower in detected_lower # Candidate is substring of detection
333+
or detected_lower in candidate_lower # Detection is substring of candidate
334+
or (
335+
len(candidate_lower) >= 3
336+
and any( # Any 3-char chunk overlaps
337+
candidate_lower[i : i + 3] in detected_lower
338+
for i in range(0, len(candidate_lower) - 2, 2) # Step by 2 for efficiency
339+
)
340+
)
341+
)
342+
343+
if has_overlap:
322344
candidates_to_mask.append((candidate, entity_type))
323345
break
324346

0 commit comments

Comments
 (0)