@@ -23,6 +23,7 @@ def __init__(self):
23
23
self .money_pattern = None
24
24
self .email_pattern = None
25
25
self .email_domain_pattern = None
26
+ self .email_prefix_pattern = None
26
27
self .url_pattern = None
27
28
self .phone_number_pattern = None
28
29
self .ip_address_pattern = None
@@ -32,6 +33,7 @@ def __init__(self):
32
33
self .strict_qq_pattern = None
33
34
self .cell_phone_pattern = None
34
35
self .landline_phone_pattern = None
36
+ self .phone_prefix_pattern = None
35
37
self .extract_parentheses_pattern = None
36
38
self .remove_parentheses_pattern = None
37
39
self .parentheses_pattern = PARENTHESES_PATTERN
@@ -47,6 +49,7 @@ def _extract_base(pattern, text, with_offset=False):
47
49
""" 正则抽取器的基础函数
48
50
49
51
Args:
52
+ pattern(re.compile): 正则表达式对象
50
53
text(str): 字符串文本
51
54
with_offset(bool): 是否携带 offset (抽取内容字段在文本中的位置信息)
52
55
@@ -55,15 +58,9 @@ def _extract_base(pattern, text, with_offset=False):
55
58
56
59
"""
57
60
if with_offset :
58
- '''
59
- if pattern == self.strict_qq_pattern:
60
- for item in pattern.finditer(text):
61
- pdb.set_trace()
62
- pdb.set_trace()
63
- #'''
64
61
results = [{'text' : item .group (1 ),
65
62
'offset' : (item .span ()[0 ] - 1 , item .span ()[1 ] - 1 )}
66
- for item in pattern .finditer (text )]
63
+ for item in pattern .finditer (text )]
67
64
else :
68
65
results = [item .group (1 ) for item in pattern .finditer (text )]
69
66
@@ -95,8 +92,9 @@ def clean_text(self, text, remove_html_tag=True,
95
92
convert_full2half = True ,
96
93
remove_exception_char = True , remove_url = True ,
97
94
remove_redundant_char = True , remove_parentheses = True ,
98
- remove_email = True , remove_phone_number = True ):
99
- """ 清洗文本
95
+ remove_email = True , remove_phone_number = True ,
96
+ delete_prefix = False ):
97
+ """ 清洗文本,关键字参数均默认为 True
100
98
101
99
Args:
102
100
text(str): 待清理文本
@@ -108,6 +106,7 @@ def clean_text(self, text, remove_html_tag=True,
108
106
remove_url(bool): 是否删除 url 链接
109
107
remove_email(bool): 是否删除 email
110
108
remove_phone_number(bool): 是否删除电话号码
109
+ delete_prefix(bool): 是否删除 email 和 电话号码的前缀,如 `E-mail: [email protected] `
111
110
112
111
Returns:
113
112
str: 清理后的文本
@@ -127,9 +126,9 @@ def clean_text(self, text, remove_html_tag=True,
127
126
if remove_url :
128
127
text = self .remove_url (text )
129
128
if remove_email :
130
- text = self .remove_email (text )
129
+ text = self .remove_email (text , delete_prefix = delete_prefix )
131
130
if remove_phone_number :
132
- text = self .remove_phone_number (text )
131
+ text = self .remove_phone_number (text , delete_prefix = delete_prefix )
133
132
134
133
return text
135
134
@@ -167,12 +166,11 @@ def extract_email(self, text, detail=False):
167
166
168
167
detail_results = list ()
169
168
for item in results :
170
- domain_name = self .email_domain_pattern .search (
171
- item ['text' ]).group (1 )
169
+ domain_name = self .email_domain_pattern .search (item ['text' ]).group (1 )
172
170
item .update ({'domain_name' : domain_name })
173
171
detail_results .append (item )
174
172
return detail_results
175
-
173
+
176
174
def extract_id_card (self , text , detail = False ):
177
175
""" 提取文本中的 ID 身份证号
178
176
@@ -218,7 +216,7 @@ def extract_money(self, text, detail=False):
218
216
detail(bool): 返回字符串的详细信息 offset,默认为 False
219
217
220
218
Returns:
221
- list: email列表
219
+ list: 货币金额列表
222
220
223
221
Examples:
224
222
>>> import jionlp as jio
@@ -285,7 +283,7 @@ def extract_qq(self, text, detail=False, strict=True):
285
283
strict(bool): QQ号很容易和其他数字混淆,因此选择采用严格或宽松规则匹配
286
284
287
285
Returns:
288
- list: email列表
286
+ list: QQ 号列表
289
287
290
288
"""
291
289
if self .qq_pattern is None :
@@ -400,21 +398,50 @@ def extract_parentheses(self, text, parentheses=PARENTHESES_PATTERN, detail=Fals
400
398
401
399
return content_list
402
400
403
def remove_email(self, text, delete_prefix=False):
    """Remove every e-mail address from the text.

    Args:
        text(str): the text to clean.
        delete_prefix(bool): when True, a label matched by
            ``EMAIL_PREFIX_PATTERN`` that ends exactly where an e-mail
            address starts (e.g. the ``E-mail: `` written in front of an
            address) is removed together with the address. This needs
            extra matching passes over the text, so it is slower.
            Defaults to False.

    Returns:
        str: the text with e-mail addresses (and, when requested, their
            prefixes) removed. Text that contains no e-mail address or
            no prefix is returned otherwise untouched.

    """
    # Guard each pattern on its own: `email_pattern` may already have been
    # compiled elsewhere while the prefix pattern is still None.
    if self.email_pattern is None:
        self.email_pattern = re.compile(EMAIL_PATTERN)
    if self.email_prefix_pattern is None:
        self.email_prefix_pattern = re.compile(EMAIL_PREFIX_PATTERN)

    # Pad both ends with a sentinel character (stripped again on return);
    # presumably the patterns need a neighbouring character to match at
    # the very start or end of the text.
    text = ''.join(['#', text, '#'])

    if delete_prefix:
        results = self._extract_base(
            self.email_pattern, text, with_offset=True)
        prefix_results = self._extract_base(
            self.email_prefix_pattern, text, with_offset=True)

        # Keep only prefixes that end exactly where an e-mail begins.
        email_starts = {item['offset'][0] for item in results}
        clean_prefix_offsets = [
            item['offset'] for item in prefix_results
            if item['offset'][1] in email_starts]

        # `_extract_base` reports offsets shifted by -1, hence the `+ 1`
        # when slicing the padded text. Walking with a cursor keeps the
        # whole text when there is no prefix to cut out.
        pieces = []
        cursor = 0
        for start, end in clean_prefix_offsets:
            pieces.append(text[cursor:start + 1])
            cursor = end + 1
        pieces.append(text[cursor:])
        text = ''.join(pieces)

    text = self.email_pattern.sub('', text)
    return text[1:-1]
418
445
419
446
def remove_exception_char (self , text ):
420
447
""" 删除文本中的异常字符
@@ -514,26 +541,57 @@ def remove_parentheses(self, text, parentheses=PARENTHESES_PATTERN):
514
541
return text
515
542
length = len (text )
516
543
517
def remove_phone_number(self, text, delete_prefix=False):
    """Remove every cell-phone and landline number from the text.

    Args:
        text(str): the text to clean.
        delete_prefix(bool): when True, a label matched by
            ``PHONE_PREFIX_PATTERN`` that ends exactly where a phone
            number starts (e.g. a "telephone:" label written in front of
            ``198xxxxxxxx``) is removed together with the number.
            Defaults to False.

    Returns:
        str: the text with phone numbers (and, when requested, their
            prefixes) removed. Text that contains no phone number or no
            prefix is returned otherwise untouched.

    """
    # Guard each pattern on its own: the phone patterns may already have
    # been compiled elsewhere while the prefix pattern is still None.
    if self.cell_phone_pattern is None:
        self.cell_phone_pattern = re.compile(CELL_PHONE_PATTERN)
    if self.landline_phone_pattern is None:
        self.landline_phone_pattern = re.compile(LANDLINE_PHONE_PATTERN)
    if self.phone_prefix_pattern is None:
        self.phone_prefix_pattern = re.compile(PHONE_PREFIX_PATTERN)

    # Pad both ends with a sentinel character (stripped again on return);
    # presumably the patterns need a neighbouring character to match at
    # the very start or end of the text.
    text = ''.join(['#', text, '#'])

    if delete_prefix:
        cell_results = self._extract_base(
            self.cell_phone_pattern, text, with_offset=True)
        landline_results = self._extract_base(
            self.landline_phone_pattern, text, with_offset=True)
        prefix_results = self._extract_base(
            self.phone_prefix_pattern, text, with_offset=True)

        # Keep only prefixes that end exactly where a number begins.
        number_starts = {
            item['offset'][0] for item in cell_results + landline_results}
        clean_prefix_offsets = [
            item['offset'] for item in prefix_results
            if item['offset'][1] in number_starts]

        # `_extract_base` reports offsets shifted by -1, hence the `+ 1`
        # when slicing the padded text. Walking with a cursor keeps the
        # whole text when there is no prefix to cut out.
        pieces = []
        cursor = 0
        for start, end in clean_prefix_offsets:
            pieces.append(text[cursor:start + 1])
            cursor = end + 1
        pieces.append(text[cursor:])
        text = ''.join(pieces)

    text = self.cell_phone_pattern.sub('', text)
    text = self.landline_phone_pattern.sub('', text)
    return text[1:-1]
538
596
539
597
def remove_qq (self , text , strict = True ):
0 commit comments