Skip to content

Commit 950a38e

Browse files
committed
fix time_parser bug
1 parent 3ff290c commit 950a38e

File tree

7 files changed

+44
-4
lines changed

7 files changed

+44
-4
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<a alt="Downloads">
1212
<img src="https://pepy.tech/badge/jionlp/month" /></a>
1313
<a alt="Version">
14-
<img src="https://img.shields.io/badge/version-1.4.13-green" /></a>
14+
<img src="https://img.shields.io/badge/version-1.4.14-green" /></a>
1515
<a href="https://github.com/dongrixinyu/JioNLP/pulse" alt="Activity">
1616
<img src="https://img.shields.io/github/commit-activity/m/dongrixinyu/JioNLP?color=blue" /></a>
1717
</p>

README_en.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<a alt="Downloads">
1212
<img src="https://pepy.tech/badge/jionlp/month" /></a>
1313
<a alt="Version">
14-
<img src="https://img.shields.io/badge/version-1.4.10-green" /></a>
14+
<img src="https://img.shields.io/badge/version-1.4.14-green" /></a>
1515
<a href="https://github.com/dongrixinyu/JioNLP/pulse" alt="Activity">
1616
<img src="https://img.shields.io/github/commit-activity/m/dongrixinyu/JioNLP?color=blue" /></a>
1717
</p>

jionlp/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# website: www.jionlp.com/
1010
"""
1111

12-
__version__ = '1.4.13'
12+
__version__ = '1.4.14'
1313

1414

1515
import os

jionlp/algorithm/ner/time_extractor.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def _prepare(self):
8787
self.four_num_year_pattern = re.compile(r'^[\d]{4}$')
8888
self.unit_pattern = re.compile(r'(多)?[万亿元]') # 四数字后接单位,说明非年份
8989

90-
self.single_char_time = ['春', '夏', '秋', '冬']
90+
self.single_char_time = set(['春', '夏', '秋', '冬'])
9191

9292
def __call__(self, text, time_base=time.time(), with_parsing=True, ret_all=False,
9393
ret_type='str', ret_future=False, period_results_num=None):

jionlp/gadget/time_parser.py

+28
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,12 @@ def _preprocess_regular_expression(self):
434434
''.join([bracket(LIMIT_YEAR_STRING), '第', bracket(WEEK_NUM_STRING),
435435
'(个)?', WEEK_STRING]))
436436

437+
# 1月1 此类不全的日期,缺少日
438+
# 注意,此种情况只针对 日 是 阿拉伯数字的情况,若是汉字 日,如 “五月二十”,则按农历进行解析,
439+
# 此时,则不存在日期的 “日” 的缺失。
440+
self.num_month_num_pattern = re.compile(
441+
''.join(['^', MONTH_NUM_STRING, '月', '([12]\d|3[01]|[0]?[1-9])', '$']))
442+
437443
# 公历固定节日
438444
self.year_fixed_solar_festival_pattern = re.compile(
439445
''.join([bracket_absence(YEAR_STRING), FIXED_SOLAR_FESTIVAL]))
@@ -967,6 +973,21 @@ def _adjust_underlying_future_time(self, time_string):
967973

968974
return time_string
969975

976+
def _compensate_num_month_num(self, time_string):
977+
""" 一种特定的日期类型,“1月1”,没指明 “日”。因此需要进行补全,然后再进行处理。
978+
979+
Args:
980+
time_string:
981+
982+
Returns:
983+
984+
"""
985+
matched_res = self.num_month_num_pattern.search(time_string)
986+
if matched_res is not None:
987+
return time_string + '日'
988+
else:
989+
return time_string
990+
970991
def parse_time_span_point(self, time_string):
971992
# 按照 “从 …… 至 ……” 进行解析
972993
first_time_string, second_time_string = self.parse_span_2_2_point(time_string)
@@ -976,6 +997,7 @@ def parse_time_span_point(self, time_string):
976997
old_time_base_handler = self.time_base_handler
977998
try:
978999
if first_time_string is not None and second_time_string is None:
1000+
first_time_string = self._compensate_num_month_num(first_time_string)
9791001

9801002
first_full_time_handler, _, _, blur_time = self.parse_time_point(
9811003
first_time_string, self.time_base_handler)
@@ -990,6 +1012,8 @@ def parse_time_span_point(self, time_string):
9901012
second_full_time_handler = self.time_base_handler
9911013
elif first_time_string is not None and second_time_string is not None:
9921014

1015+
first_time_string = self._compensate_num_month_num(first_time_string)
1016+
second_time_string = self._compensate_num_month_num(second_time_string)
9931017
first_time_string, second_time_string = self._compensate_string(
9941018
time_string, first_time_string, second_time_string)
9951019

@@ -1013,6 +1037,8 @@ def parse_time_span_point(self, time_string):
10131037
second_full_time_handler[4:] = [0, 0]
10141038

10151039
elif first_time_string is None and second_time_string is not None:
1040+
second_time_string = self._compensate_num_month_num(second_time_string)
1041+
10161042
_, second_full_time_handler, _, blur_time = self.parse_time_point(
10171043
second_time_string, self.time_base_handler)
10181044

@@ -1037,6 +1063,8 @@ def parse_time_span_point(self, time_string):
10371063
time_string, self.time_base_handler)
10381064
else:
10391065
# 非 time span,按 time_point 解析
1066+
time_string = self._compensate_num_month_num(time_string)
1067+
10401068
first_full_time_handler, second_full_time_handler, time_type, \
10411069
blur_time = self.parse_time_point(
10421070
time_string, self.time_base_handler)

test/test_time_extractor.py

+2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ def test_time_extractor(self):
2424
['有十分之一的概率,股票赔钱了。', []],
2525
['住在南京网2021-09-21热度 578瞰地', [{'text': '2021-09-21', 'offset': [5, 15], 'type': 'time_point'}]],
2626
['根据财税2016 36号文', [{'text': '2016', 'offset': [4, 8], 'type': 'time_span'}]],
27+
['他在10月22出生', [{'text': '10月22', 'offset': [2, 7], 'type': 'time_point'}]],
28+
['1月3至2月10', [{'text': '1月3至2月10', 'offset': [0, 8], 'type': 'time_span'}]],
2729
]
2830

2931
for item in text_string_list:

test/test_time_parser.py

+10
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,12 @@ def test_time_parser(self):
311311
['上一个月', _ts_1,
312312
{'type': 'time_point', 'definition': 'blur', 'time': ['2021-05-01 00:00:00', '2021-05-31 23:59:59']}],
313313

314+
# 残缺 月、日,
315+
['1月3', _ts_1,
316+
{'type': 'time_point', 'definition': 'accurate', 'time': ['2021-01-03 00:00:00', '2021-01-03 23:59:59']}],
317+
['十月31', _ts_1,
318+
{'type': 'time_point', 'definition': 'accurate', 'time': ['2021-10-31 00:00:00', '2021-10-31 23:59:59']}],
319+
314320
# 限定月、模糊日
315321
['本月初', _ts_1,
316322
{'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-01 00:00:00', '2021-06-05 23:59:59']}],
@@ -460,6 +466,10 @@ def test_time_parser(self):
460466
['今晚八点以后', _ts_1,
461467
{'type': 'time_span', 'definition': 'accurate', 'time': ['2021-06-14 20:00:00', 'inf']}],
462468

469+
# 残缺型 ……至……
470+
['1月3至2月10', _ts_1,
471+
{'type': 'time_span', 'definition': 'accurate', 'time': ['2021-01-03 00:00:00', '2021-02-10 23:59:59']}],
472+
463473
# time_span,limit 型
464474
['前天中午到明天晚上', _ts_1,
465475
{'type': 'time_span', 'definition': 'blur', 'time': ['2021-06-12 12:00:00', '2021-06-15 23:59:59']}],

0 commit comments

Comments
 (0)