Skip to content

Commit cb99c80

Browse files
authored
add support for JSONPath to entity type field extraction (Netflix#2994)
* add support for JSONPath to entity type field extraction * delete old function * add custom type definition for entity type pair * add custom type definition for entity type pair * improve custom type * compile dependencies and remove unused import * Update description of field on entitytype edit sheet component * remove duplicate entities, parse valid jsonpath, and fix uuid bug * Update src/dispatch/signal/service.py
1 parent 4a2ea9f commit cb99c80

File tree

9 files changed

+386
-102
lines changed

9 files changed

+386
-102
lines changed

requirements-base.in

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ jinja2
1818
jira==2.0.0
1919
atlassian-python-api==3.32.0
2020
joblib
21+
jsonpath_ng
2122
numpy
2223
oauth2client
2324
pandas

requirements-base.txt

+16-2
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,9 @@ cymem==2.0.7
9090
# spacy
9191
# thinc
9292
decorator==5.1.1
93-
# via validators
93+
# via
94+
# jsonpath-ng
95+
# validators
9496
defusedxml==0.7.1
9597
# via jira
9698
deprecated==1.2.13
@@ -103,6 +105,10 @@ email-validator==1.3.1
103105
# via -r requirements-base.in
104106
emails==0.6
105107
# via -r requirements-base.in
108+
exceptiongroup==1.1.0
109+
# via
110+
# hypothesis
111+
# pytest
106112
fastapi==0.90.1
107113
# via -r requirements-base.in
108114
frozenlist==1.3.3
@@ -169,6 +175,8 @@ jira==2.0.0
169175
# via -r requirements-base.in
170176
joblib==1.2.0
171177
# via -r requirements-base.in
178+
jsonpath-ng==1.5.3
179+
# via -r requirements-base.in
172180
jsonschema==4.17.3
173181
# via
174182
# hypothesis-jsonschema
@@ -233,6 +241,8 @@ pdpyras==4.5.2
233241
# via -r requirements-base.in
234242
pluggy==1.0.0
235243
# via pytest
244+
ply==3.11
245+
# via jsonpath-ng
236246
premailer==3.10.0
237247
# via emails
238248
preshed==3.0.8
@@ -347,6 +357,7 @@ six==1.16.0
347357
# google-auth
348358
# google-auth-httplib2
349359
# jira
360+
# jsonpath-ng
350361
# junit-xml
351362
# oauth2client
352363
# patsy
@@ -410,7 +421,9 @@ text-unidecode==1.3
410421
thinc==8.1.5
411422
# via spacy
412423
tomli==2.0.1
413-
# via schemathesis
424+
# via
425+
# pytest
426+
# schemathesis
414427
tomli-w==1.0.0
415428
# via schemathesis
416429
tqdm==4.64.1
@@ -423,6 +436,7 @@ typing-extensions==4.4.0
423436
# via
424437
# pydantic
425438
# schemathesis
439+
# starlette
426440
uritemplate==4.1.1
427441
# via google-api-python-client
428442
urllib3==1.26.13

src/dispatch/entity/models.py

+3
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ class EntityBase(DispatchBase):
5151

5252

5353
class EntityCreate(EntityBase):
54+
def __hash__(self):
55+
return hash((self.id, self.value))
56+
5457
id: Optional[PrimaryKey]
5558
entity_type: EntityTypeCreate
5659
project: ProjectRead

src/dispatch/entity/service.py

+141-77
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from datetime import datetime, timedelta
2-
from typing import Optional, Sequence
2+
from typing import Generator, Optional, Sequence, Union, NewType, NamedTuple
33
import re
44

5+
import jsonpath_ng
56
from pydantic.error_wrappers import ErrorWrapper, ValidationError
67
from sqlalchemy.orm import Session, joinedload
78

@@ -163,114 +164,177 @@ def get_signal_instances_with_entity(
163164
return signal_instances
164165

165166

167+
# One search rule used by find_entities: the entity type, its compiled regular
# expression (None when the type has no regular expression) and its parsed
# JSONPath expression (None when the type has no field).
EntityTypePair = NewType(
    "EntityTypePair",
    NamedTuple(
        "EntityTypePairTuple",
        [
            ("entity_type", EntityType),
            ("regex", Optional[re.Pattern[str]]),
            ("json_path", Optional[jsonpath_ng.JSONPath]),
        ],
    ),
)
178+
179+
166180
def find_entities(
    db_session: Session, signal_instance: SignalInstance, entity_types: Sequence[EntityType]
) -> list[Entity]:
    """Find entities in a SignalInstance based on a list of EntityTypes.

    Entity types that define a ``field`` have that field parsed as a JSONPath
    expression and evaluated against ``signal_instance.raw``; if they also define
    a regular expression, it is applied to every string value the path selects.
    Entity types that only define a regular expression are matched against every
    string nested anywhere inside ``signal_instance.raw``.

    Args:
        db_session (Session): The database session handed to ``get_by_value_or_create``.
        signal_instance (SignalInstance): The SignalInstance to extract entities from.
        entity_types (Sequence[EntityType]): The EntityTypes to search for in the SignalInstance.

    Returns:
        list[Entity]: One result of ``get_by_value_or_create`` per distinct entity found,
        in the order the entities were first discovered.
    """

    def _find_entities_by_regex(
        val: Union[dict, str, list],
        signal_instance: SignalInstance,
        entity_type_pairs: list[EntityTypePair],
    ) -> Generator[EntityCreate, None, None]:
        """Yield an entity for each regex that matches a string nested inside ``val``.

        Args:
            val: The value to search; dicts and lists are walked recursively,
                strings are searched, anything else is ignored.
            signal_instance (SignalInstance): The SignalInstance being processed.
            entity_type_pairs (list): (entity_type, regex, json_path) tuples to search for.

        Yields:
            EntityCreate: An entity holding the first match of a regex in a string.
        """
        if isinstance(val, dict):
            # Keys are irrelevant for a regex-only search; only the values are walked.
            for subval in val.values():
                yield from _find_entities_by_regex(subval, signal_instance, entity_type_pairs)

        elif isinstance(val, list):
            for item in val:
                yield from _find_entities_by_regex(item, signal_instance, entity_type_pairs)

        elif isinstance(val, str):
            for entity_type, entity_regex, _ in entity_type_pairs:
                # An entity type with an empty regular expression and no usable field
                # produces a pair without a compiled regex; there is nothing to search with.
                if entity_regex is None:
                    continue
                if regex_match := entity_regex.search(val):
                    yield EntityCreate(
                        value=regex_match.group(0),
                        entity_type=entity_type,
                        project=signal_instance.project,
                    )

    def _find_entities_by_regex_and_jsonpath_expression(
        signal_instance: SignalInstance,
        entity_type_pairs: list[EntityTypePair],
    ) -> Generator[EntityCreate, None, None]:
        """Yield entities selected by each pair's JSONPath, optionally narrowed by its regex.

        Args:
            signal_instance: The SignalInstance to extract entities from.
            entity_type_pairs: (entity_type, regex, json_path) tuples to search for;
                pairs without a JSONPath are skipped.

        Yields:
            EntityCreate: An entity found in the SignalInstance.
        """
        for entity_type, entity_regex, json_path in entity_type_pairs:
            if json_path is None:
                continue

            try:
                # Materialize inside the try so any lookup error surfaces here.
                path_matches = list(json_path.find(signal_instance.raw))
            except jsonpath_ng.PathNotFound:
                # Field not found in signal_instance.raw.
                # NOTE(review): confirm jsonpath_ng actually exports PathNotFound; if it
                # does not, this clause itself raises AttributeError when find() fails.
                continue

            for path_match in path_matches:
                # Only string values can become entity values.
                if not isinstance(path_match.value, str):
                    continue

                if entity_regex is None:
                    # No regular expression: the selected value is the entity.
                    value = path_match.value
                elif regex_match := entity_regex.search(path_match.value):
                    value = regex_match.group(0)
                else:
                    continue

                yield EntityCreate(
                    value=value,
                    entity_type=entity_type,
                    project=signal_instance.project,
                )

    # Build (entity type, compiled regular expression, parsed JSONPath) tuples
    entity_type_pairs = [
        (
            entity_type,
            re.compile(entity_type.regular_expression) if entity_type.regular_expression else None,
            jsonpath_ng.parse(entity_type.field) if entity_type.field else None,
        )
        for entity_type in entity_types
        if isinstance(entity_type.regular_expression, str) or entity_type.field is not None
    ]

    # Pairs without a JSONPath are searched across the whole raw payload
    regex_only_pairs = [pair for pair in entity_type_pairs if pair[2] is None]

    # Use the recursive search function to find entities in the raw data
    entities = [
        entity
        for val in signal_instance.raw.values()
        for entity in _find_entities_by_regex(val, signal_instance, regex_only_pairs)
    ]

    entities.extend(
        _find_entities_by_regex_and_jsonpath_expression(signal_instance, entity_type_pairs)
    )

    # Drop duplicates while keeping first-seen order (relies on EntityCreate being hashable)
    entities = list(dict.fromkeys(entities))

    entities_out = [
        get_by_value_or_create(db_session=db_session, entity_in=entity_in) for entity_in in entities
    ]

    return entities_out

src/dispatch/signal/service.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import uuid
23
import hashlib
34
from typing import Optional
45

0 commit comments

Comments
 (0)