From 4001b591de3c2cb68d7424543ba512e177bf1021 Mon Sep 17 00:00:00 2001 From: Jiang XueQian Date: Mon, 22 Jan 2018 14:23:45 +0800 Subject: [PATCH] better tools.py --- NLP/tool.py | 49 ++++++++++++++++++++----------------------------- README.md | 9 ++++----- 2 files changed, 24 insertions(+), 34 deletions(-) diff --git a/NLP/tool.py b/NLP/tool.py index f9f69af..08cadc1 100644 --- a/NLP/tool.py +++ b/NLP/tool.py @@ -1,36 +1,27 @@ from .cut import Scissor from .mark import Mark + import json import lzma -class Tool: - def __init__(self): - ''' - 对分词类和词性标注类进行初始化 - ''' - self.M = Mark(json.loads(lzma.open('NLP/pos.json.xz').read().decode('utf-8')), \ - json.loads(lzma.open('NLP/bposf.json.xz').read().decode('utf-8')), \ - json.loads(lzma.open('NLP/iposf.json.xz').read().decode('utf-8'))) - self.S = Scissor(json.loads(lzma.open('NLP/wf.json.xz').read().decode('UTF-8')), \ - json.loads(lzma.open('NLP/iw.json.xz').read().decode('UTF-8'))) - - def Cut_mark(self, s): - ''' - 对一个字符串进行分词和词性标注 - :param s: 待处理的字符串 - :return: list类型的处理结果 - ''' - result = [] - cut = self.S.Cut(s) - mark = self.M.Sentemark(cut) - for i in range(0,len(cut)): - result.append(cut[i]+'/'+mark[i]) - return result - -if __name__ =='__main__': - t = Tool() - print('Enter: ') - s = input() - t.Cut_mark(s) +import pkg_resources as pkgres +rc = lambda n: pkgres.resource_filename(__name__, n) +M = Mark(json.loads(lzma.open(rc('pos.json.xz')).read().decode('utf-8')), json.loads(lzma.open(rc('bposf.json.xz')).read().decode('utf-8')), json.loads(lzma.open(rc('iposf.json.xz')).read().decode('utf-8'))) +S = Scissor(json.loads(lzma.open(rc('wf.json.xz')).read().decode('UTF-8')), json.loads(lzma.open(rc('iw.json.xz')).read().decode('UTF-8'))) +def CutMark(s): + ''' + 对一个字符串进行分词和词性标注 + :param s: 待处理的字符串 + :return: list类型的处理结果 + ''' + result = [] + cut = S.Cut(s) + mark = M.Sentemark(cut) + return (cut, mark) + +if __name__ =='__main__': + print('Enter: ') + s = input() + CutMark(s) diff --git a/README.md b/README.md index 85a1e85..4c65831 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ * 其他源文件用来产生词典 - * *.json.xz 为压缩后的词典 + * \*.json.xz 为压缩后的词典 ## 使用方法 @@ -41,8 +41,7 @@ ``` ``` ->>> from NLP import tool ->>> t = tool.Tool() ->>> t.Cut_mark('自然语言通常是指一种自然地随文化演化的语言。') -['自然/n', '语言/n', '通常/d', '是/vshi', '指/v', '一种/mq', '自然/n', '地/ude2', '随/p', '文化/n', '演化/vn', '的/ude1', '语言/n', '。/w'] +>>> from NLP.tool import CutMark +>>> CutMark('自然语言通常是指一种自然地随文化演化的语言。') +(['自然', '语言', '通常', '是', '指', '一种', '自然', '地', '随', '文化', '演化', '的', '语言', '。'], ['n', 'n', 'd', 'vshi', 'v', 'mq', 'n', 'ude2', 'p', 'n', 'vn', 'ude1', 'n', 'w']) ```