Skip to content

Commit

Permalink
EM version without middleware interaction
Browse files Browse the repository at this point in the history
  • Loading branch information
enoch3712 committed Nov 19, 2024
1 parent 8f14fdb commit 780f9d3
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
2 changes: 1 addition & 1 deletion extract_thinker/masking/llm_masking_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ async def mask_content(self, content: str) -> MaskContract:
messages_step1 = [
{
"role": "system",
"content": "You are an AI assistant that masks sensitive information in text."
"content": "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values or non-PII data."
},
{
"role": "user",
Expand Down
30 changes: 26 additions & 4 deletions tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ def test_mask():

# Act
test_text = "Mr. George Collins lives at 123 Main St, Anytown, USA 12345.\n His phone number is 555-1234.\nJane Smith resides at 456 Elm Avenue, Othercity, State 67890, and can be reached at (987) 654-3210.\nThe company's CEO, Robert Johnson, has an office at 789 Corporate Blvd, Suite 500, Bigcity, State 13579. \nFor customer service, call 1-800-555-9876 or email [email protected]. \nSarah Lee, our HR manager, can be contacted at 444-333-2222 or [email protected].\nThe project budget is $250,000, with an additional $50,000 allocated for contingencies. \nMonthly maintenance costs are estimated at $3,500. \nFor international clients, please use +1-555-987-6543. \nOur tax ID number is 12-3456789."

# Act

result = asyncio.run(process.mask_content(test_text))

# Assert
Expand Down Expand Up @@ -84,8 +83,18 @@ def test_mask():
# Test unmasking
unmasked_content = process.unmask_content(result.masked_text, result.mapping)

# Optionally, verify the entire unmasked content matches the original
assert unmasked_content == test_text, "Unmasked content does not match the original content"
# Normalize strings by standardizing whitespace and newlines
def normalize_string(s: str) -> str:
    """Return *s* with every run of whitespace collapsed to a single space.

    Newlines and tabs count as whitespace, and leading/trailing whitespace is
    removed, so two texts that differ only in layout compare equal.
    """
    # str.split() with no argument splits on any whitespace run and yields no
    # empty leading/trailing fields, so no separate strip() is needed.
    return ' '.join(s.split())

# Test unmasking with normalized strings
normalized_unmasked = normalize_string(unmasked_content)
normalized_original = normalize_string(test_text)

# Compare normalized strings
assert normalized_unmasked == normalized_original, "Unmasked content does not match the original content"

def test_simple_use_case():
# Arrange
Expand Down Expand Up @@ -133,6 +142,19 @@ def test_deterministic_hashing():

test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01."

# Normalize strings by standardizing whitespace and newlines
def normalize_string(s: str) -> str:
    """Collapse all whitespace in *s* so comparisons ignore layout.

    Each sequence of whitespace characters (spaces, tabs, newlines) becomes
    one space; whitespace at either end of the string is dropped.
    """
    # Splitting on the default separator discards every whitespace run,
    # including those at the edges; joining re-inserts exactly one space.
    words = s.split()
    return ' '.join(words)

# Act
# The masking call has to run before anything reads `result`; previously the
# comparison below executed first and failed on an unassigned name.
result = asyncio.run(process.mask_content(test_text))

# NOTE(review): comparing the masked text itself against the original would
# fail as soon as masking replaces anything. The assertion message refers to
# unmasked content, so the text is round-tripped through unmask_content
# first — confirm this is the intended check.
unmasked_content = process.unmask_content(result.masked_text, result.mapping)

# Compare with whitespace normalized so layout differences do not matter.
normalized_unmasked = normalize_string(unmasked_content)
normalized_original = normalize_string(test_text)

assert normalized_unmasked == normalized_original, "Unmasked content does not match the original content"

Expand Down

0 comments on commit 780f9d3

Please sign in to comment.