Skip to content

Commit d0fe809

Browse files
committed
update time_parser and idiom_solitaire
1 parent 0ac7975 commit d0fe809

9 files changed

+592
-195
lines changed

README.md

+4-4
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<a alt="Downloads">
1212
<img src="https://img.shields.io/badge/downloads-5k-yellow" /></a>
1313
<a alt="Version">
14-
<img src="https://img.shields.io/badge/version-1.3.34-green" /></a>
14+
<img src="https://img.shields.io/badge/version-1.3.35-green" /></a>
1515
<a href="https://github.com/dongrixinyu/JioNLP/pulse" alt="Activity">
1616
<img src="https://img.shields.io/github/commit-activity/m/dongrixinyu/JioNLP?color=blue" /></a>
1717
</p>
@@ -29,7 +29,7 @@
2929
#### 功能主要包括:文本清洗,删除HTML标签、删除异常字符、删除冗余字符,转换全角字母、数字、空格为半角,抽取及删除E-mail及域名、抽取及删除(手机号、座机号)电话号码、抽取及删除QQ号、抽取及删除括号内容、抽取及删除身份证号、抽取及删除IP地址、抽取及删除URL超链接、抽取及删除货币金额与单位,金额数字转大写汉字,时间语义解析,解析身份证号信息、解析手机号码归属地、解析座机区号归属地、解析手机号码运营商,按行快速读写文件,(多功能)停用词过滤,(优化的)分句,地址解析,新闻地域识别,繁简体转换,汉字转拼音,汉字偏旁、字形、四角编码、五笔编码拆解,基于词典的情感分析,色情数据过滤,反动数据过滤,关键短语抽取,抽取式文本摘要,成语接龙,成语词典、歇后语词典、新华字典、新华词典、停用词典、中国地名词典、中国县级地名变更词典、世界地名词典,时间实体抽取,基于词典的NER,NER的字、词级别转换,NER的entity和tag格式转换,NER模型的预测阶段加速并行工具集,NER标注和模型预测的结果差异对比,NER标注数据集分割与统计,NER实体收集、文本分类标注数据集的分割与统计、回译数据增强、相邻近汉字换位数据增强、同音词替换数据增强、随机增删字符数据增强、实体替换数据增强、公历转农历日期、农历转公历日期
3030

3131

32-
#### Update 2021-09-09
32+
#### Update 2021-09-20
3333
## 新增 [时间实体抽取](../../wiki/NER-说明文档#user-content-时间实体抽取)
3434

3535
#### jio.ner.extract_time 从文本中抽取时间实体(不依赖模型,纯规则)。
@@ -47,7 +47,7 @@ print(res)
4747

4848
```
4949

50-
#### Update 2021-07-26
50+
#### Update 2021-09-20
5151
## 新增 [时间语义解析](../../wiki/时间语义解析-说明文档#user-content-时间语义解析)
5252

5353
#### jio.parse_time 给定时间字符串,解析其为时间戳、时长等。
@@ -75,7 +75,7 @@ print(res)
7575
```
7676

7777
- 目前支持年月日、时分秒、星期、季节、季度、节日、农历、时间范围、时间段、时间周期、模糊时间代词等解析。
78-
- TODO: 细节与参数仍有待完善
78+
- 支持对未来时间优先选择,参数为`ret_future(bool)`
7979
- [关于**时间语义解析**](../../wiki/时间语义解析-说明文档)
8080
- 目前支持的所有 [测试用例](../../blob/master/test/test_time_parser.py)
8181

image/qr_code_for_collection.png

-971 Bytes
Loading

jionlp/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# description: Preprocessing tool for Chinese NLP
99
"""
1010

11-
__version__ = '1.3.34'
11+
__version__ = '1.3.35'
1212

1313
import os
1414

jionlp/gadget/idiom_solitaire.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ def __call__(self, cur_idiom, same_pinyin=True, check_idiom=False,
6565
if self.idiom_list is None:
6666
self._prepare()
6767

68+
if cur_idiom == '' or type(cur_idiom) is not str:
69+
logging.warning('please insert a Chinese idiom.')
70+
return ''
71+
6872
if restart:
6973
# 重新开始游戏,清空历史记录
7074
self.already_used_idioms = set()
@@ -102,7 +106,7 @@ def __call__(self, cur_idiom, same_pinyin=True, check_idiom=False,
102106
cur_last_char = cur_idiom[-1]
103107
backup_idioms = list()
104108
for idiom_obj in self.idiom_list:
105-
if idiom_obj in self.already_used_idioms:
109+
if idiom_obj['idiom'] in self.already_used_idioms:
106110
continue
107111

108112
if cur_last_char == idiom_obj['idiom'][0]:

jionlp/gadget/time_parser.py

+505-179
Large diffs are not rendered by default.

jionlp/rule/rule_pattern.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -274,12 +274,11 @@
274274
SELF_EVI_LUNAR_MONTH_STRING = r'((闰)?[正冬腊]|闰([一二三四五六七八九十]|十[一二]|[1-9]|1[012]))月'
275275

276276
# 周
277-
# 支持了 minute 中 `刻` 的计数 [123一二两三]
278-
WEEK_NUM_STRING = r'[一二两三四五六七八九1-9]' # 1~9
277+
WEEK_NUM_STRING = r'[一二两三四五六七八九十0-9]{1,3}' # 1~52
279278
WEEK_STRING = r'(周|星期|礼拜)'
280279

281280
# 日
282-
DAY_NUM_STRING = r'(([0]?[1-9]|[12]\d|3[01])|([一二]?十)?[一二三四五六七八九]|(三十)?[一]|[二三]?十)'
281+
DAY_NUM_STRING = r'(([0]?[1-9]|[12]\d|3[01])|([一二]?十)?[一二三四五六七八九]|(三十)?[一]|[二三]?十)' # 1~31
283282
DAY_STRING = DAY_NUM_STRING + r'[日号]'
284283
BLUR_DAY_STRING = r'([上中下]旬|初|中|底|末)'
285284
# 允许 `初8` 阿拉伯数字出现,但不允许 `廿2`、`23` 等作为农历`日`
@@ -321,6 +320,7 @@
321320

322321
# time_delta 数字正则
323322
DELTA_NUM_STRING = r'(([一两二三四五六七八九十百千万零]+点)?[一两二三四五六七八九十百千万零]+|([\d十百千万,]+\.)?[\d十百千万,]+)'
323+
QUARTER_NUM_STRING = r'[一两二三四1-4]'
324324

325325
# 单个数字正则
326326
SINGLE_NUM_STRING = r'[一两二三四五六七八九十\d]'
@@ -335,7 +335,8 @@
335335
WEEK_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?((个(多)?)?(星期|礼拜)|周(?!年))', I, r'俩(星期|礼拜)'])
336336
HOUR_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?(个(多)?)?(小时|钟头)', I,
337337
'半(个(多)?)?(小时|钟头)', I, '俩(小时|钟头)', I, SINGLE_NUM_STRING, '个半(小时|钟头)'])
338-
MINUTE_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?分(钟)?(半)?', I, '一刻钟', I, '半分钟', I,
338+
QUARTER_DELTA_STRING = ''.join([QUARTER_NUM_STRING, '刻钟'])
339+
MINUTE_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?分(钟)?(半)?', I, '半分钟', I,
339340
SINGLE_NUM_STRING, '+分半(钟)?'])
340341
SECOND_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?秒(钟)?'])
341342

test/test_idiom_solitaire.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# -*- coding=utf-8 -*-
2+
3+
import unittest
4+
5+
import jionlp as jio
6+
7+
8+
class TestIdiomSolitaire(unittest.TestCase):
9+
""" test the idiom solitaire tool """
10+
11+
def test_idiom_solitaire(self):
12+
""" test func idiom_solitaire """
13+
14+
idiom = '道阻且长'
15+
idiom = jio.idiom_solitaire(idiom, same_pinyin=False, same_tone=True)
16+
self.assertEqual(idiom[0], '长')
17+
18+
idiom = jio.idiom_solitaire('', same_pinyin=False, same_tone=True)
19+
self.assertEqual(idiom, '')
20+
21+
22+
if __name__ == '__main__':
23+
24+
suite = unittest.TestSuite()
25+
test_idiom_solitaire = [TestIdiomSolitaire('test_idiom_solitaire')]
26+
suite.addTests(test_idiom_solitaire)
27+
28+
runner = unittest.TextTestRunner(verbosity=1)
29+
runner.run(suite)
30+

test/test_main.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,18 @@
44
from test_text_aug import TestTextAug
55
from test_time_parser import TestTimeParser
66
from test_location_parser import TestLocationParser
7+
from test_idiom_solitaire import TestIdiomSolitaire
78

89

910
if __name__ == '__main__':
1011

1112
suite = unittest.TestSuite()
1213

1314
tests = [
14-
TestTimeParser('test_time_parser'), # 测试时间解析
15-
TestLocationParser('test_location_parser'), # 测试地址解析
16-
TestTextAug('test_ReplaceEntity'), # 测试实体替换增强
15+
TestTimeParser('test_time_parser'), # 测试 时间解析
16+
TestLocationParser('test_location_parser'), # 测试 地址解析
17+
TestTextAug('test_ReplaceEntity'), # 测试 实体替换增强
18+
TestIdiomSolitaire('test_idiom_solitaire'), # 测试 成语接龙
1719
]
1820
suite.addTests(tests)
1921

test/test_time_parser.py

+37-3
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ class TestTimeParser(unittest.TestCase):
1313
def test_time_parser(self):
1414
""" test func time_parser """
1515

16-
_ts_1 = 1623604000
17-
_ts_2 = 1630480532
16+
_ts_1 = 1623604000 # 2021-06-14 01:06:40
17+
_ts_2 = 1630480532 # 2021-09-01 15:15:32
1818
print('time stamp for test: ',
1919
time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(_ts_1)))
2020

@@ -31,6 +31,7 @@ def test_time_parser(self):
3131
['2019.05.29 15:20-2020.01.12 12:10', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2019-05-29 15:20:00', '2020-01-12 12:10:59']}],
3232
['6·30', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-30 00:00:00', '2021-06-30 23:59:59']}],
3333
['2018', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2018-01-01 00:00:00', '2018-12-31 23:59:59']}],
34+
['2021-09-0910:09', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-09 10:09:00', '2021-09-09 10:09:59']}],
3435

3536
# 年、月、日(标准)
3637
['2015年8月12日', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2015-08-12 00:00:00', '2015-08-12 23:59:59']}],
@@ -114,7 +115,7 @@ def test_time_parser(self):
114115
['二十几年前', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['1991-01-01 00:00:00', '2001-12-31 23:59:59']}],
115116
['1000多年之后', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['3020-01-01 00:00:00', 'inf']}],
116117
['几十年之后', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['2041-01-01 00:00:00', '2121-12-31 23:59:59']}],
117-
['一刻钟后', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-14 01:21:40', '2021-06-14 01:22:40']}],
118+
['一刻钟后', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-14 01:21:40', '2021-06-14 01:36:40']}],
118119

119120
# time span 式 `从……至……` 年、月、日、时、分、秒
120121
['2017年8月11日至8月22日', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2017-08-11 00:00:00', '2017-08-22 23:59:59']}],
@@ -146,6 +147,12 @@ def test_time_parser(self):
146147
['三年前', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['2018-01-01 00:00:00', '2018-12-31 23:59:59']}],
147148
['二〇三五年前', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2021-06-14 01:06:40', '2035-12-31 23:59:59']}],
148149

150+
# time_span,limit 型
151+
['前天中午到明天晚上', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['2021-06-12 12:00:00', '2021-06-15 23:59:59']}],
152+
['前年11月到去年3月', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2019-11-01 00:00:00', '2020-03-31 23:59:59']}],
153+
['2014年11月到去年3月', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2014-11-01 00:00:00', '2020-03-31 23:59:59']}],
154+
['2014年11月到下个月9号', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2014-11-01 00:00:00', '2021-07-09 23:59:59']}],
155+
149156
# time_span,枚举型
150157
['9月10号,11号,12号,13号', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2021-09-10 00:00:00', '2021-09-13 23:59:59']}],
151158

@@ -193,6 +200,11 @@ def test_time_parser(self):
193200
['6月1日周六早上10点钟', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-01 10:00:00', '2021-06-01 10:59:59']}], # 当设定 strict 时会报错
194201
['上个礼拜天', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-13 00:00:00', '2021-06-13 23:59:59']}],
195202

203+
# 年、周
204+
['20年第52周', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2020-12-28 00:00:00', '2021-01-03 23:59:59']}],
205+
['21年第一个礼拜', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2021-01-04 00:00:00', '2021-01-10 23:59:59']}],
206+
['今年第三十七个星期', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2021-09-13 00:00:00', '2021-09-19 23:59:59']}],
207+
196208
# 年、月、模糊日
197209
['6月上旬', {'year': 2021}, {'type': 'time_span', 'definition': 'blur', 'time': ['2021-06-01 00:00:00', '2021-06-10 23:59:59']}],
198210
['1999年7月下旬', time.time(), {'type': 'time_span', 'definition': 'blur', 'time': ['1999-07-11 00:00:00', '1999-07-31 23:59:59']}],
@@ -203,6 +215,12 @@ def test_time_parser(self):
203215
# 限定年、月、模糊日
204216
['去年6月上旬', {'year': 2021}, {'type': 'time_span', 'definition': 'blur', 'time': ['2020-06-01 00:00:00', '2020-06-10 23:59:59']}],
205217

218+
# 超模糊 2 字
219+
['前两天', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['2021-06-07 00:00:00', '2021-06-12 23:59:59']}],
220+
['前两年', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['2016-01-01 00:00:00', '2019-12-31 23:59:59']}],
221+
['前两个钟头', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['2021-06-13 19:00:00', '2021-06-13 23:59:59']}],
222+
['前两分钟', _ts_1, {'type': 'time_span', 'definition': 'blur', 'time': ['2021-06-14 00:57:00', '2021-06-14 01:04:59']}],
223+
206224
# 限定日
207225
['前天', _ts_2, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-08-30 00:00:00', '2021-08-30 23:59:59']}],
208226
['后天', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-16 00:00:00', '2021-06-16 23:59:59']}],
@@ -284,6 +302,7 @@ def test_time_parser(self):
284302
['两日', None, {'type': 'time_delta', 'definition': 'accurate', 'time': {'day': 2.0}}],
285303
['俩礼拜', None, {'type': 'time_delta', 'definition': 'accurate', 'time': {'day': 14.0}}],
286304
['36天5小时30分', None, {'type': 'time_delta', 'definition': 'accurate', 'time': {'day': 36.0, 'hour': 5.0, 'minute': 30.0}}],
305+
['1刻钟', None, {'type': 'time_delta', 'definition': 'accurate', 'time': {'minute': 15.0}}],
287306

288307
# 法律时间
289308
['3年以上7年以下', None, {'type': 'time_delta', 'definition': 'blur', 'time': [{'year': 3.0}, {'year': 7.0}]}],
@@ -441,6 +460,21 @@ def test_time_parser(self):
441460
print(item[0])
442461
self.assertEqual(time_res, item[2])
443462

463+
time_string_list = [
464+
# 未来时间扩展
465+
['8号晚上9点', _ts_2, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-08 21:00:00', '2021-09-08 21:59:59']}],
466+
['1号晚上9点', _ts_2, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-01 21:00:00', '2021-09-01 21:59:59']}],
467+
['3月8号', _ts_2, {'type': 'time_span', 'definition': 'accurate', 'time': ['2022-03-08 00:00:00', '2022-03-08 23:59:59']}],
468+
['10月8号', _ts_2, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-10-08 00:00:00', '2021-10-08 23:59:59']}],
469+
['周一', _ts_2, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-06 00:00:00', '2021-09-06 23:59:59']}],
470+
['妇女节', _ts_2, {'type': 'time_point', 'definition': 'accurate', 'time': ['2022-03-08 00:00:00', '2022-03-08 23:59:59']}],
471+
]
472+
473+
for item in time_string_list:
474+
time_res = jio.parse_time(item[0], time_base=item[1], ret_future=True) #, strict=True)
475+
print(item[0])
476+
self.assertEqual(time_res, item[2])
477+
444478

445479
if __name__ == '__main__':
446480

0 commit comments

Comments
 (0)