Commit

Add multiprocess cut method in Version-3.3.0
aseaday authored and guoyuankai committed Feb 22, 2021
1 parent 9398ae5 commit ea66493
Showing 14 changed files with 219 additions and 198 deletions.
File renamed without changes.
26 changes: 26 additions & 0 deletions .pep8speaks.yml
@@ -0,0 +1,26 @@
scanner:
    diff_only: False  # If False, the entire file touched by the Pull Request is scanned for errors. If True, only the diff is scanned.
    linter: pycodestyle  # Other option is flake8

pycodestyle:  # Same as scanner.linter value. Other option is flake8
    max-line-length: 120  # Default is 79 in PEP 8
    ignore:  # Errors and warnings to ignore
        - W504  # line break after binary operator
        - E402  # module level import not at top of file
        - E731  # do not assign a lambda expression, use a def
        - C406  # Unnecessary list literal - rewrite as a dict literal.
        - E741  # ambiguous variable name

no_blank_comment: True  # If True, no comment is made on a PR without any errors.
descending_issues_order: False  # If True, PEP 8 issues in the message will be displayed in descending order of line numbers in the file

message:  # Customize the comment made by the bot
    opened:  # Messages when a new PR is submitted
        header: "Hello @{name}! Thanks for opening this PR. "
        # The keyword {name} is converted into the author's username
        footer: "Do see the [Hitchhiker's guide to code style](https://goo.gl/hqbW4r)"
        # The messages can be written as they would over GitHub
    updated:  # Messages when new commits are added to the PR
        header: "Hello @{name}! Thanks for updating this PR. "
        footer: ""  # Why comment the link to the style guide every time? :)
    no_errors: "There are currently no PEP 8 issues detected in this Pull Request. Cheers! :beers: "
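
The pep8speaks bot applies this configuration on GitHub; for reference, roughly the same check can be run locally with pycodestyle. The snippet below is only a sketch under that assumption — the target path is an example, and C406 is omitted because it is not a pycodestyle code.

```python
import pycodestyle

# Mirror the settings from .pep8speaks.yml above.
style = pycodestyle.StyleGuide(
    max_line_length=120,                      # matches max-line-length: 120
    ignore=['W504', 'E402', 'E731', 'E741'],  # matches the ignore list (minus C406)
)
report = style.check_files(['minlp-tokenizer/minlptokenizer'])  # example path
print(f'{report.total_errors} issue(s) found')
```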
70 changes: 59 additions & 11 deletions minlp-tokenizer/README.md
@@ -21,41 +21,89 @@ pip install minlp-tokenizer

## 3. Using the API

- Tokenization (single sentence or list):
```python
from minlptokenizer.tokenizer import MiNLPTokenizer

tokenizer = MiNLPTokenizer(granularity='fine')  # fine: fine-grained, coarse: coarse-grained; default is fine-grained
print(tokenizer.cut('今天天气怎么样?'))  # tokenize a single sentence
# ['今天','天气','怎么样']
print(tokenizer.cut(['今天天气怎么样', '小米的价值观是真诚与热爱']))  # tokenize a list of fewer than 128 sentences
# [['今天','天气','怎么样'],['小米','的','价值观','是','真诚','与','热爱']]
```

- Batch tokenization with an iterator:
```python
from minlptokenizer.tokenizer import MiNLPTokenizer, batch_generator

texts = ['小米的价值观是真诚与热爱'] * 2048
tokenizer = MiNLPTokenizer(granularity='fine')
batch_iter = batch_generator(texts, size=128)  # MiNLP provides this iterator; you can also batch the input yourself
for batch in batch_iter:
    print(tokenizer.cut(batch))
```

- Multiprocess batch tokenization:
```python
from minlptokenizer.tokenizer import MiNLPTokenizer

texts = ['小米的价值观是真诚与热爱'] * 2048
tokenizer = MiNLPTokenizer(granularity='fine')
result = tokenizer.cut_batch_multiprocess(texts, granularity='fine', n_jobs=2)  # n_jobs: number of tokenization processes (default: 2)
```

- File tokenization (suitable for large files):
```python
from minlptokenizer.tokenizer import MiNLPTokenizer

tokenizer = MiNLPTokenizer(granularity='fine')
tokenizer.cut_from_file(file_path='/path/to/your/file',  # file_path: file to tokenize, one sentence per line
                        save_path='path/to/result',      # save_path: where the tokenization results are saved
                        n_jobs=2)                        # n_jobs: number of processes
```

## 4. Custom user lexicon

- Add via a list or a file path:
```python
from minlptokenizer.tokenizer import MiNLPTokenizer

tokenizer = MiNLPTokenizer(file_or_list=['word1', 'word2'], granularity='fine')              # pass a user-defined intervention lexicon as a list
tokenizer = MiNLPTokenizer(file_or_list='/path/to/your/lexicon/file', granularity='coarse')  # or pass the path to a user lexicon file
```

## 5. Notes
Python multiprocessing behaves differently on Windows and Linux: Linux creates worker processes via fork, whereas Windows spawns new processes. Therefore, when using multiprocess tokenization (cut_batch_multiprocess) or file tokenization (cut_from_file) on Windows, make sure the calls sit inside an `if __name__ == '__main__':` guard, for example:
```python
from minlptokenizer.tokenizer import MiNLPTokenizer

# Using multiprocess tokenization on Windows
if __name__ == '__main__':
    texts = ['小米的价值观是真诚与热爱'] * 2048
    tokenizer = MiNLPTokenizer(granularity='fine')
    result = tokenizer.cut_batch_multiprocess(texts, granularity='fine', n_jobs=2)
    tokenizer.cut_from_file(file_path='/path/to/your/file', save_path='path/to/result', n_jobs=2)
```

## 6. Roadmap

MiNLP is the natural language processing platform developed by the NLP team at Xiaomi AI Lab. It already provides dozens of modules covering lexical, syntactic, and semantic analysis, and is widely used in the company's products.
In this first phase we have open-sourced MiNLP's Chinese word segmentation. We will follow up by open-sourcing part-of-speech tagging, named entity recognition, syntactic parsing, and more, and work with the community to build a powerful, state-of-the-art NLP toolkit.

## 7. Contributing

We welcome code contributions to MiNLP-Tokenizer, as well as issues and feedback of all kinds.
See CONTRIBUTING.md for the development workflow.

## 8. Acknowledgments

We thank the many developers in the community for their support, feedback, encouragement, and suggestions for MiNLP-Tokenizer. Special thanks to the following developers for contributing PRs to the MiNLP-Tokenizer tool:
- 2020.12.4 aseaday contributed a speed optimization for batch tokenization; on V100/RTX TITAN GPUs, batch tokenization throughput improved from 40-50 KB/s to 140-150 KB/s.

## 9. Citing MiNLP in academic work

If you use the MiNLP Chinese word segmentation tool in academic work, please cite it as follows:
- Chinese: 郭元凯, 史亮, 陈宇鹏, 孟二利, 王斌. MiNLP-Tokenizer:小米中文分词工具. 2020.
- English: Yuankai Guo, Liang Shi, Yupeng Chen, Erli Meng, Bin Wang. MiNLP-Tokenizer: XiaoMi Chinese Word Segmenter. 2020.

6 changes: 2 additions & 4 deletions minlp-tokenizer/minlptokenizer/config.py
@@ -18,19 +18,17 @@
    'tokenizer_granularity': {
        'fine': {
            'model': 'model/zh/b-fine-cnn-crf-an2cn.pb',
        },
        'coarse': {
            'model': 'model/zh/b-coarse-cnn-crf-an2cn.pb',
        }
    },
    'tokenizer_limit': {
        'max_batch_size': 128,
        'max_string_length': 1024
    },
    'lexicon_files': [
        'lexicon/default.txt',
        'lexicon/chengyu.txt',
    ]
}
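
The commit lowers 'max_batch_size' from 512 to 128, capping how many sentences are fed through the model at once. A minimal sketch (not part of this commit) of how a caller might split a large input to respect that limit — the constant and helper below are illustrative; for library users, the README's batch_generator serves this purpose:

```python
MAX_BATCH_SIZE = 128  # mirrors the 'max_batch_size' value in the config above


def iter_batches(texts, size=MAX_BATCH_SIZE):
    """Yield consecutive slices of at most `size` sentences."""
    for start in range(0, len(texts), size):
        yield texts[start:start + size]


if __name__ == '__main__':
    texts = ['小米的价值观是真诚与热爱'] * 2048
    sizes = [len(batch) for batch in iter_batches(texts)]
    print(max(sizes))  # never exceeds 128
```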
63 changes: 0 additions & 63 deletions minlp-tokenizer/minlptokenizer/crf_viterbi.py

This file was deleted.
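
Judging by its name, the deleted module implemented CRF Viterbi decoding in Python (the updated .pb model files presumably take over that step). For readers unfamiliar with the step, here is a generic, self-contained sketch of Viterbi decoding over emission and transition scores — not the deleted file's code:

```python
import numpy as np


def viterbi_decode(unary_scores, transition_scores):
    """Return the highest-scoring tag sequence for one sentence.

    unary_scores:      [seq_len, num_tags] emission scores
    transition_scores: [num_tags, num_tags] score of moving from tag i to tag j
    """
    seq_len, num_tags = unary_scores.shape
    dp = np.zeros((seq_len, num_tags))                 # best score ending in tag j at step t
    backpointers = np.zeros((seq_len, num_tags), dtype=int)

    dp[0] = unary_scores[0]
    for t in range(1, seq_len):
        # score of every (previous tag -> current tag) move at step t
        scores = dp[t - 1][:, None] + transition_scores + unary_scores[t][None, :]
        backpointers[t] = np.argmax(scores, axis=0)
        dp[t] = np.max(scores, axis=0)

    # follow the back-pointers from the best final tag
    best_path = [int(np.argmax(dp[-1]))]
    for t in range(seq_len - 1, 0, -1):
        best_path.append(int(backpointers[t][best_path[-1]]))
    return best_path[::-1]


if __name__ == '__main__':
    rng = np.random.default_rng(0)
    print(viterbi_decode(rng.random((6, 5)), rng.random((5, 5))))  # 6 tag indices
```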

39 changes: 22 additions & 17 deletions minlp-tokenizer/minlptokenizer/lexicon.py
@@ -14,11 +14,14 @@

import ahocorasick
from collections import Iterable
import numpy as np
from minlptokenizer.tag import Tag

DEFAULT_INTERFERE_FACTOR = 2


class Lexicon:
    lexicon = None

    def __init__(self, file_or_list=None):
        self.ac = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)
@@ -48,27 +51,29 @@ def add_words(self, file_or_list):
        for word in filter(lambda t: t and not t.startswith('#'), file_or_list):
            self.ac.add_word(word)

    def product_factor(self, texts):
        """
        Build the interference weight matrices for the given sentences from the user lexicon.
        :param texts: target sentences
        :return: interference weight matrices
        """
        if self.ac.get_stats()["nodes_count"] == 0:
            return
        if self.ac.kind is not ahocorasick.AHOCORASICK:
            self.ac.make_automaton()
        max_len = max(map(len, texts))
        matrix = []
        for text in texts:
            factor_matrix = np.ones(shape=[max_len, Tag.__len__()])
            for (end_pos, length) in self.ac.iter(text):
                start_pos = end_pos - length + 1
                if length == 1:
                    factor_matrix[start_pos][1] *= self.interfere_factor  # S
                else:
                    factor_matrix[start_pos][2] *= self.interfere_factor  # B
                    factor_matrix[end_pos][4] *= self.interfere_factor    # E
                    for i in range(start_pos + 1, end_pos):
                        factor_matrix[i][3] *= self.interfere_factor      # M
            matrix.append(factor_matrix)
        return matrix

    def max_socre(self, scores):
        return self.interfere_factor * abs(max(scores))
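
The new product_factor builds one per-character, per-tag multiplier matrix per sentence instead of editing emission scores in place, which fits the batched, multiprocess cut path this commit adds. As a rough illustration — the array names, shapes, and the multiplication step below are assumptions about how such factors could be applied, not the tokenizer's actual pipeline:

```python
import numpy as np

# Illustrative only: shapes and names are assumptions, not MiNLP internals.
# Suppose the model emits unary (emission) scores of shape [max_len, num_tags]
# for one sentence, with tag columns indexed as in the diff: 1=S, 2=B, 3=M, 4=E.
num_tags = 5            # assumed size of the Tag set; indices 1-4 appear above
max_len = 10
rng = np.random.default_rng(0)
unary_scores = rng.random((max_len, num_tags))

# product_factor returns, per sentence, a matrix of ones except where a
# user-lexicon word matched; there the preferred tag column is scaled by the
# interference factor (DEFAULT_INTERFERE_FACTOR = 2 in the diff).
factor_matrix = np.ones((max_len, num_tags))
factor_matrix[2, 2] *= 2   # a two-character lexicon word starting at index 2: B
factor_matrix[3, 4] *= 2   # ...and ending at index 3: E

# One plausible way to apply the interference is an elementwise product with
# the emission scores before CRF decoding, nudging the decoder toward
# segmentations that respect the user lexicon.
adjusted_scores = unary_scores * factor_matrix
print(adjusted_scores.shape)  # (10, 5)
```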
Binary file not shown.
Binary file modified minlp-tokenizer/minlptokenizer/model/zh/b-fine-cnn-crf-an2cn.pb
Binary file not shown.
