File tree Expand file tree Collapse file tree 1 file changed +24
-2
lines changed Expand file tree Collapse file tree 1 file changed +24
-2
lines changed Original file line number Diff line number Diff line change @@ -316,9 +316,31 @@ def _mask_text(text: str) -> str:
316316 detected_value = decoded_text_for_masking [result .start : result .end ]
317317 entity_type = result .entity_type
318318
319- # Find candidate that contains this PII
319+ # Find candidate that overlaps with this PII
320+ # Use comprehensive overlap logic matching pii.py implementation
320321 for candidate in candidates_for_masking :
321- if candidate .decoded_text and detected_value .lower () in candidate .decoded_text .lower ():
322+ if not candidate .decoded_text :
323+ continue
324+
325+ candidate_lower = candidate .decoded_text .lower ()
326+ detected_lower = detected_value .lower ()
327+
328+ # Check if candidate's decoded text overlaps with the detection
329+ # Handle partial encodings where encoded span may include extra characters
330+ # e.g., %3A%6a%6f%65%40 → ":joe@" but only "joe@" is in email "joe@example.com"
331+ has_overlap = (
332+ candidate_lower in detected_lower # Candidate is substring of detection
333+ or detected_lower in candidate_lower # Detection is substring of candidate
334+ or (
335+ len (candidate_lower ) >= 3
336+ and any ( # Any 3-char chunk overlaps
337+ candidate_lower [i : i + 3 ] in detected_lower
338+ for i in range (0 , len (candidate_lower ) - 2 , 2 ) # Step by 2 for efficiency
339+ )
340+ )
341+ )
342+
343+ if has_overlap :
322344 candidates_to_mask .append ((candidate , entity_type ))
323345 break
324346
You can’t perform that action at this time.
0 commit comments