Skip to content

Commit 5e4e5ff

Browse files
committed
Fix a bug in the money regex string: certain characters (余, 多) must not appear at the beginning of a matched money string
1 parent e4d1d89 commit 5e4e5ff

File tree

6 files changed

+12
-5
lines changed

6 files changed

+12
-5
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<a alt="Downloads">
1212
<img src="https://pepy.tech/badge/jionlp/month" /></a>
1313
<a alt="Version">
14-
<img src="https://img.shields.io/badge/version-1.4.19-green" /></a>
14+
<img src="https://img.shields.io/badge/version-1.4.20-green" /></a>
1515
<a href="https://github.com/dongrixinyu/JioNLP/pulse" alt="Activity">
1616
<img src="https://img.shields.io/github/commit-activity/m/dongrixinyu/JioNLP?color=blue" /></a>
1717
</p>

jionlp/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# website: www.jionlp.com
1010
"""
1111

12-
__version__ = '1.4.19'
12+
__version__ = '1.4.20'
1313

1414

1515
import os

jionlp/dictionary/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
# license: Apache License 2.0
55
66
# github: https://github.com/dongrixinyu/JioNLP
7-
# description: Preprocessing tool for Chinese NLP
7+
# description: Preprocessing & Parsing tool for Chinese NLP
8+
# website: www.jionlp.com
89

910

1011
from .dictionary_loader import char_distribution_loader

jionlp/gadget/money_parser.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
# license: Apache License 2.0
55
66
# github: https://github.com/dongrixinyu/JioNLP
7-
# description: Preprocessing tool for Chinese NLP
7+
# description: Preprocessing & Parsing tool for Chinese NLP
8+
# website: www.jionlp.com
9+
810

911
"""
1012
TODO:

jionlp/rule/rule_pattern.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,9 @@
438438
# MONEY_SPAN_GAP_MIDDLE = r'(\~+|\-+|~+|-+|至(?!少)|(?<![达不])到)'
439439

440440
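# This money-extraction regex also matches some false positives: for example, strings such as "2019-07-18" are extracted too (the pattern's coverage is too broad) and can even be parsed successfully, so a separate false-positive removal regex is needed to filter them out.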
441-
MONEY_CHAR_STRING = r'((将)?近|只有|仅|(大)?约(莫|合)?|大概|至少(要)?|不(到|足|超过)?|逾|高于|(高)?达(到)?|^上|(超)?过|超|' \
441+
# (?! ) is a negative lookahead: it lists the characters that must not appear at the start of the string
442+
MONEY_CHAR_STRING = r'(?!(余|多))' \
443+
r'((将)?近|只有|仅|(大)?约(莫|合)?|大概|至少(要)?|不(到|足|超过)?|逾|高于|(高)?达(到)?|^上|(超)?过|超|' \
442444
r'以上|以下|左右|上下|港币|人民币|(新)?台币|(分|角|毛|块|元)钱?|(人民|港|日|澳|(新)?台)币|圆(整)?|英镑|' \
443445
r'美(金|分|刀)|马克|法郎|卢布|泰铢|元((人民|港|日|澳|韩|(新)?台)币)?|(美|港|澳门|日|韩|缅|马|新加坡|欧|' \
444446
r'加|加拿大|新西兰|澳|澳大利亚)元|(越(南)?)盾|雷亚尔|' \

test/test_money_extractor.py

+2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ def test_money_extractor(self):
3737
[]],
3838
['到了次日的凌晨2许时就发现车被盗了,该车价值2000余元',
3939
[{'text': '2000余元', 'offset': [22, 28], 'type': 'money'}]],
40+
['年末结转和结余10.56亿元。',
41+
[{'text': '10.56亿元', 'offset': [7, 14], 'type': 'money'}]],
4042
]
4143

4244
for item in money_string_list:

0 commit comments

Comments
 (0)