Skip to content

Commit c13d630

Browse files
committed
update remove_* rules
1 parent e07d84c commit c13d630

8 files changed

+168
-29
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<a alt="Downloads">
1212
<img src="https://img.shields.io/badge/downloads-6k-yellow" /></a>
1313
<a alt="Version">
14-
<img src="https://img.shields.io/badge/version-1.3.50-green" /></a>
14+
<img src="https://img.shields.io/badge/version-1.3.51-green" /></a>
1515
<a href="https://github.com/dongrixinyu/JioNLP/pulse" alt="Activity">
1616
<img src="https://img.shields.io/github/commit-activity/m/dongrixinyu/JioNLP?color=blue" /></a>
1717
</p>

jionlp/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# description: Preprocessing tool for Chinese NLP
99
"""
1010

11-
__version__ = '1.3.50'
11+
__version__ = '1.3.51'
1212

1313

1414
import os
@@ -78,6 +78,7 @@
7878
│ | 2021-10-25 | update extract money and parse money | │
7979
│ | 2021-11-10 | add logger tuner | │
8080
│ | 2021-12-04 | add chinese word segmentor tools | │
81+
│ | 2022-03-02 | update email & tel rules | │
8182
│ │
8283
╰──────────────────────────────────────────────────────────────────────────╯
8384
"""

jionlp/rule/extractor.py

+82-24
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def __init__(self):
2323
self.money_pattern = None
2424
self.email_pattern = None
2525
self.email_domain_pattern = None
26+
self.email_prefix_pattern = None
2627
self.url_pattern = None
2728
self.phone_number_pattern = None
2829
self.ip_address_pattern = None
@@ -32,6 +33,7 @@ def __init__(self):
3233
self.strict_qq_pattern = None
3334
self.cell_phone_pattern = None
3435
self.landline_phone_pattern = None
36+
self.phone_prefix_pattern = None
3537
self.extract_parentheses_pattern = None
3638
self.remove_parentheses_pattern = None
3739
self.parentheses_pattern = PARENTHESES_PATTERN
@@ -47,6 +49,7 @@ def _extract_base(pattern, text, with_offset=False):
4749
""" 正则抽取器的基础函数
4850
4951
Args:
52+
pattern(re.compile): 正则表达式对象
5053
text(str): 字符串文本
5154
with_offset(bool): 是否携带 offset (抽取内容字段在文本中的位置信息)
5255
@@ -55,15 +58,9 @@ def _extract_base(pattern, text, with_offset=False):
5558
5659
"""
5760
if with_offset:
58-
'''
59-
if pattern == self.strict_qq_pattern:
60-
for item in pattern.finditer(text):
61-
pdb.set_trace()
62-
pdb.set_trace()
63-
#'''
6461
results = [{'text': item.group(1),
6562
'offset': (item.span()[0] - 1, item.span()[1] - 1)}
66-
for item in pattern.finditer(text)]
63+
for item in pattern.finditer(text)]
6764
else:
6865
results = [item.group(1) for item in pattern.finditer(text)]
6966

@@ -95,8 +92,9 @@ def clean_text(self, text, remove_html_tag=True,
9592
convert_full2half=True,
9693
remove_exception_char=True, remove_url=True,
9794
remove_redundant_char=True, remove_parentheses=True,
98-
remove_email=True, remove_phone_number=True):
99-
""" 清洗文本
95+
remove_email=True, remove_phone_number=True,
96+
delete_prefix=False):
97+
""" 清洗文本,除 delete_prefix 默认为 False 外,其余关键字参数均默认为 True
10098
10199
Args:
102100
text(str): 待清理文本
@@ -108,6 +106,7 @@ def clean_text(self, text, remove_html_tag=True,
108106
remove_url(bool): 是否删除 url 链接
109107
remove_email(bool): 是否删除 email
110108
remove_phone_number(bool): 是否删除电话号码
109+
delete_prefix(bool): 是否删除 email 和电话号码的前缀,如 `E-mail: xxx@example.com`
111110
112111
Returns:
113112
str: 清理后的文本
@@ -127,9 +126,9 @@ def clean_text(self, text, remove_html_tag=True,
127126
if remove_url:
128127
text = self.remove_url(text)
129128
if remove_email:
130-
text = self.remove_email(text)
129+
text = self.remove_email(text, delete_prefix=delete_prefix)
131130
if remove_phone_number:
132-
text = self.remove_phone_number(text)
131+
text = self.remove_phone_number(text, delete_prefix=delete_prefix)
133132

134133
return text
135134

@@ -167,12 +166,11 @@ def extract_email(self, text, detail=False):
167166

168167
detail_results = list()
169168
for item in results:
170-
domain_name = self.email_domain_pattern.search(
171-
item['text']).group(1)
169+
domain_name = self.email_domain_pattern.search(item['text']).group(1)
172170
item.update({'domain_name': domain_name})
173171
detail_results.append(item)
174172
return detail_results
175-
173+
176174
def extract_id_card(self, text, detail=False):
177175
""" 提取文本中的 ID 身份证号
178176
@@ -218,7 +216,7 @@ def extract_money(self, text, detail=False):
218216
detail(bool): 返回字符串的详细信息 offset,默认为 False
219217
220218
Returns:
221-
list: email列表
219+
list: 货币金额列表
222220
223221
Examples:
224222
>>> import jionlp as jio
@@ -285,7 +283,7 @@ def extract_qq(self, text, detail=False, strict=True):
285283
strict(bool): QQ号很容易和其他数字混淆,因此选择采用严格或宽松规则匹配
286284
287285
Returns:
288-
list: email列表
286+
list: QQ 号列表
289287
290288
"""
291289
if self.qq_pattern is None:
@@ -400,21 +398,50 @@ def extract_parentheses(self, text, parentheses=PARENTHESES_PATTERN, detail=Fals
400398

401399
return content_list
402400

403-
def remove_email(self, text):
401+
def remove_email(self, text, delete_prefix=False):
    """Remove every e-mail address from ``text``.

    Args:
        text(str): the text to clean.
        delete_prefix(bool): when True, a label that ends exactly where an
            address starts (whatever ``EMAIL_PREFIX_PATTERN`` matches, such
            as ``E-mail: ``) is removed together with the address. This
            runs extra matching passes, so it is slower than the default.

    Returns:
        str: ``text`` without the e-mail addresses and, when requested,
            without the labels that introduced them.

    """
    # Each pattern is compiled lazily and independently, so the prefix
    # pattern is available even if ``email_pattern`` was already set
    # before this method was first called.
    if self.email_pattern is None:
        self.email_pattern = re.compile(EMAIL_PATTERN)
    if self.email_prefix_pattern is None:
        self.email_prefix_pattern = re.compile(EMAIL_PREFIX_PATTERN)

    # Pad with one sentinel character on each side; it is stripped again
    # on return. Presumably this lets boundary-anchored patterns match at
    # the very start/end of the text -- confirm against EMAIL_PATTERN.
    text = ''.join(['#', text, '#'])

    if delete_prefix:
        results = self._extract_base(
            self.email_pattern, text, with_offset=True)
        prefix_results = self._extract_base(
            self.email_prefix_pattern, text, with_offset=True)

        # Keep only the prefixes that end exactly where an address begins.
        email_starts = {item['offset'][0] for item in results}
        clean_prefix_offsets = [
            item['offset'] for item in prefix_results
            if item['offset'][1] in email_starts]

        # ``_extract_base`` reports offsets shifted by -1 (relative to the
        # unpadded text), hence the ``+ 1`` when slicing the padded text.
        # The trailing piece is appended unconditionally, so a text with
        # no removable prefix is kept intact instead of being emptied.
        pieces = list()
        cursor = 0
        for start, end in clean_prefix_offsets:
            pieces.append(text[cursor: start + 1])
            cursor = end + 1
        pieces.append(text[cursor:])
        text = ''.join(pieces)

    text = self.email_pattern.sub('', text)
    return text[1:-1]
418445

419446
def remove_exception_char(self, text):
420447
""" 删除文本中的异常字符
@@ -514,26 +541,57 @@ def remove_parentheses(self, text, parentheses=PARENTHESES_PATTERN):
514541
return text
515542
length = len(text)
516543

517-
def remove_phone_number(self, text):
544+
def remove_phone_number(self, text, delete_prefix=False):
    """Remove cell-phone and landline numbers from ``text``.

    Args:
        text(str): the text to clean.
        delete_prefix(bool): when True, a label that ends exactly where a
            number starts (whatever ``PHONE_PREFIX_PATTERN`` matches, such
            as ``Tel: ``) is removed together with the number.

    Returns:
        str: ``text`` without the phone numbers and, when requested,
            without the labels that introduced them.

    """
    # Each pattern is compiled lazily and independently, so the prefix
    # pattern is available even if the two number patterns were already
    # set before this method was first called.
    if self.cell_phone_pattern is None:
        self.cell_phone_pattern = re.compile(CELL_PHONE_PATTERN)
    if self.landline_phone_pattern is None:
        self.landline_phone_pattern = re.compile(LANDLINE_PHONE_PATTERN)
    if self.phone_prefix_pattern is None:
        self.phone_prefix_pattern = re.compile(PHONE_PREFIX_PATTERN)

    # Pad with one sentinel character on each side; it is stripped again
    # on return. Presumably this lets boundary-anchored patterns match at
    # the very start/end of the text -- confirm against the patterns.
    text = ''.join(['#', text, '#'])

    if delete_prefix:
        cell_results = self._extract_base(
            self.cell_phone_pattern, text, with_offset=True)
        landline_results = self._extract_base(
            self.landline_phone_pattern, text, with_offset=True)
        prefix_results = self._extract_base(
            self.phone_prefix_pattern, text, with_offset=True)

        # Keep only the prefixes that end exactly where a number begins.
        number_starts = {
            item['offset'][0] for item in cell_results + landline_results}
        clean_prefix_offsets = [
            item['offset'] for item in prefix_results
            if item['offset'][1] in number_starts]

        # ``_extract_base`` reports offsets shifted by -1 (relative to the
        # unpadded text), hence the ``+ 1`` when slicing the padded text.
        # The trailing piece is appended unconditionally, so a text with
        # no removable prefix is kept intact instead of being emptied.
        pieces = list()
        cursor = 0
        for start, end in clean_prefix_offsets:
            pieces.append(text[cursor: start + 1])
            cursor = end + 1
        pieces.append(text[cursor:])
        text = ''.join(pieces)

    text = self.cell_phone_pattern.sub('', text)
    text = self.landline_phone_pattern.sub('', text)
    return text[1:-1]
538596

539597
def remove_qq(self, text, strict=True):

jionlp/rule/rule_pattern.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@
1616
# 该规则用于抽取与判定手机号的归属地,即抽取前三位、中间4位
1717
CELL_PHONE_CHECK_PATTERN = r'((1[3-9][0-9]))([- ])?\d{4}([- ])?\d{4}'
1818

19+
# 手机|电话 号码前缀,例如:`Tel: 18902437922`,用于删除前缀
20+
PHONE_PREFIX_PATTERN = r'(([tT](el(ephone)?|EL(EPHONE)?)|[cC](ell(phone)?|ELL(PHONE)?)|' \
21+
r'((联系)?电[ \t\u3000]*话|手[ \t\u3000]*机)(号(码)?)?)(:|:)?[\t \u3000]*)' \
22+
r'(?=[^:: \t\u3000])'
23+
1924
# ---------------------------------------------------------------------
2025
# 中文字符正则
2126
ANCIENT_CHINESE_CHAR_PATTERN = '[一-龥㐀-䶵]' # 在 gb13000.1 基础上扩展 6582 个古汉字,共 27484 个汉字
@@ -38,6 +43,10 @@
3843
# 抽取邮箱的域名
3944
EMAIL_DOMAIN_PATTERN = r'(?<=@)([0-9a-zA-Z]+)(?=\.)'
4045

46+
# 抽取邮箱的前缀,一般为:`email: [email protected]`,用于删除前缀
47+
EMAIL_PREFIX_PATTERN = r'(([eE](\-|—)?(mail|MAIL)|(电子)?邮箱)(:|:)?[\t \u3000]*)' \
48+
r'(?=[^:: \t\u3000])'
49+
4150
# ---------------------------------------------------------------------
4251
# 转义符号
4352
ESCAPE_CHAR_PATTERN = '\t\n\a\b\f\r\v'
@@ -291,7 +300,7 @@
291300
# URL
292301
URL_PATTERN = r'(?<=[^.])((?:(?:https?|ftp|file)://|(?<![a-zA-Z\-\.])www\.)' \
293302
r'[\-A-Za-z0-9\+&@\(\)#/%\?=\~_|!:\,\.\;]+[\-A-Za-z0-9\+&@#/%=\~_\|])' \
294-
r'(?=[<\u4E00-\u9FA5¥,。;!?、“”‘’>()—《》…● ])'
303+
r'(?=[<\u4E00-\u9FA5¥,。;!?、“”‘’>()—《》…● \t\n])'
295304

296305

297306
#######################################################################

test/test_main.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
from test_money_parser import TestMoneyParser
99
from test_time_extractor import TestTimeExtractor
1010
from test_money_extractor import TestMoneyExtractor
11-
from text_remove_url import TestRemoveUrl
11+
from test_remove_url import TestRemoveUrl
12+
from test_remove_email import TestRemoveEmail
13+
from test_remove_phone_number import TestRemovePhoneNumber
1214

1315

1416
if __name__ == '__main__':
@@ -23,8 +25,11 @@
2325
TestMoneyParser('test_money_parser'), # 测试 金额抽取与规范化
2426
TestTimeExtractor('test_time_extractor'), # 测试 时间实体抽取
2527
TestMoneyExtractor('test_money_extractor'), # 测试 货币金额实体抽取
26-
TestRemoveUrl('test_remove_url') # 测试 清洗文本中的超链接
28+
TestRemoveUrl('test_remove_url'), # 测试 清洗文本中的超链接
29+
TestRemoveEmail('test_remove_email'), # 测试 清洗文本中的 email
30+
TestRemovePhoneNumber('test_remove_phone_number') # 测试 清洗文本中的电话号码
2731
]
32+
2833
suite.addTests(tests)
2934

3035
runner = unittest.TextTestRunner(verbosity=1)

test/test_remove_email.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# -*- coding=utf-8 -*-
2+
3+
import unittest
4+
5+
import jionlp as jio
6+
7+
8+
class TestRemoveEmail(unittest.TestCase):
9+
""" 测试清除 email 工具 """
10+
11+
def test_remove_email(self):
12+
""" test func remove_email """
13+
14+
email_text_list = [
15+
['Beihang University E-mail 给她打电话啊 Email: [email protected] , 中国[email protected]。',
16+
'Beihang University E-mail 给她打电话啊 , 中国。'],
17+
]
18+
19+
for item in email_text_list:
20+
clean_text = jio.remove_email(item[0], delete_prefix=True)
21+
print(item[0])
22+
self.assertEqual(clean_text, item[1])
23+
24+
25+
if __name__ == '__main__':
26+
27+
suite = unittest.TestSuite()
28+
test_remove_email = [TestRemoveEmail('test_remove_email')]
29+
suite.addTests(test_remove_email)
30+
31+
runner = unittest.TextTestRunner(verbosity=1)
32+
runner.run(suite)
33+

test/test_remove_phone_number.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# -*- coding=utf-8 -*-
2+
3+
import unittest
4+
5+
import jionlp as jio
6+
7+
8+
class TestRemovePhoneNumber(unittest.TestCase):
9+
""" 测试清除 phone_number 工具 """
10+
11+
def test_remove_phone_number(self):
12+
""" test func remove_phone_number """
13+
14+
phone_number_text_list = [
15+
[' 电话:(010)37283893 他手机号多少?18702812943. 还有一个是17209374283 [email protected]联系电话: (0351)89082910',
16+
' 他手机号多少?. 还有一个是 [email protected]'],
17+
]
18+
19+
for item in phone_number_text_list:
20+
clean_text = jio.remove_phone_number(item[0], delete_prefix=True)
21+
print(item[0])
22+
self.assertEqual(clean_text, item[1])
23+
24+
25+
if __name__ == '__main__':
26+
27+
suite = unittest.TestSuite()
28+
test_remove_phone_number = [TestRemovePhoneNumber('test_remove_phone_number')]
29+
suite.addTests(test_remove_phone_number)
30+
31+
runner = unittest.TextTestRunner(verbosity=1)
32+
runner.run(suite)
33+
File renamed without changes.

0 commit comments

Comments
 (0)