Skip to content

Commit

Permalink
first packaging
Browse files Browse the repository at this point in the history
  • Loading branch information
Chenghao Mou authored and Chenghao Mou committed Jun 11, 2019
1 parent 9bb5f40 commit 0291ce9
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 8 deletions.
17 changes: 17 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
MIT License
Copyright (c) 2018 Chenghao Mou
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
72 changes: 72 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Dependencies

```bash
unidecode
emoji
*kenlm
fuzzy
scikit-learn
pyxdameraulevenshtein
pygtrie
numpy
```

*Install kenlm wrapper from github:

```bash
pip install https://github.com/kpu/kenlm/archive/master.zip
```

# Usage

```python

# Load english dictionary
english_vocab = load_english_vocab(...)
english_vocab.update(load_english_vocab(...))

# Load bilingual lexicon dictionary
foreign_dict = load_lexicon_norm(...)

# Load target language model
lm = kenlm.Model(...)

# Train a ngram model if needed
# ngram_train(foreign_dict, 'hin-tfidf-ngram_algo')

# Ulf's romanizer
romanizer = partial(romanize,
romanization_path=...,
language_code="hin")

soundex_inst = fuzzy.DMetaphone()
soundex_algo = lambda x: soundex_inst(x)[0].decode('utf-8') if soundex_inst(x)[0] is not None else x
english_encoded_vocab = {e: soundex_algo(e) for e in english_vocab if e}

# load the ngram model
ngram_algo = pickle.loads(open(..., "rb").read())

soundex_model = partial(soundex_similarity,
encoded_english_vocab=english_encoded_vocab,
romanizer=romanizer,
soundex=soundex_algo)

lev_model = partial(lev_similarity, backup=soundex_model)
ngram_model = partial(ngram_similarity, model=ngram_algo, backup=lev_model)
final_model = partial(exact_similarity, backup=ngram_model)

for line in open(...):
source, target = line.strip('\n').split('\t')
oovs = extract_oov(target, source, english_vocab=english_vocab, romanization=True)
best, mods = translate_oov(target, oovs, foreign_dict, final_model, lm.score)

if best != target:

for oov in oovs:
alt = list(mods[oov].keys())[0]
trans = mods[oov][alt]
debug.debug(f"{romanizer(oov)} -> {romanizer(alt)} : {list(trans)}")

debug.debug(best)
debug.debug("*"*100)
```
17 changes: 9 additions & 8 deletions elisa_dnt/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,33 @@
parser.add_argument('--fa_output', type=str,
help="[Post]File path to the output file")

parser.add_argument('--fb_src', nargs=1,
parser.add_argument('--fb_src', type=str,
help='[Pre]File path to the source file')
parser.add_argument('--fb_src_output', nargs=1,
parser.add_argument('--fb_src_output', type=str,
help='[Pre]File path to the source output file')
parser.add_argument('--fb_ini_output', nargs=1,
parser.add_argument('--fb_ini_output', type=str,
help='[Pre]File path to the source ini file')
parser.add_argument('--fb_tgt', nargs=1, required=False,
parser.add_argument('--fb_tgt', type=str, required=False,
help='[Pre]File path to the target file')
parser.add_argument('--pb_cross', dest='pb_cross', default=False, action='store_true',
help='[Pre]Parameter for whether use reference target file for regex extraction')
parser.add_argument('--fb_visual', nargs=1,
parser.add_argument('--fb_visual', type=str,
help="[Pre]File path to visualization html file")

args = parser.parse_args()
print(args)

scheme = args.p_scheme

if args.p_step == "post":
restore(args.fa_dnt_src, args.fa_dnt_ini, args.fa_output, args.p_scheme)
exit(0)

RULES = {key: re.compile(value) for key, value in rules[args.scheme].items()}
RULES["comb"] = re.compile("(" + "|".join(rules[args.scheme].values()) + ")+")
RULES = {key: re.compile(value) for key, value in rules[args.p_scheme].items()}
RULES["comb"] = re.compile("(" + "|".join(rules[args.p_scheme].values()) + ")+")

if args.fb_visual:
with open(args.fb_visual[0], "w") as o:
with open(args.fb_visual, "w") as o:
o.write("""
<link href="https://fonts.googleapis.com/css?family=Source+Sans+Pro&display=swap&subset=cyrillic,cyrillic-ext,greek,greek-ext,latin-ext,vietnamese" rel="stylesheet">
<style>
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[metadata]
description-file = README.md
23 changes: 23 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from setuptools import setup

# Packaging metadata for the elisa-dnt ("Do Not Translate") package.
# The repository ships an MIT LICENSE.txt, so the license field is set
# explicitly to 'MIT' (it was previously empty, which contradicted the
# MIT classifier below).
setup(
    name='elisa-dnt',
    version='0.0.1',
    packages=['elisa_dnt'],
    url='https://github.com/ChenghaoMou/elisa-dnt',
    license='MIT',
    author='chenghaomou',
    author_email='[email protected]',
    description='Do Not Translate for machine translation',
    # Runtime dependencies only; the heavier optional tooling (kenlm,
    # fuzzy, scikit-learn, ...) documented in the README is not required
    # to install the package itself.
    install_requires=[
        'emoji',
        'regex',
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        # Choose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
        'Intended Audience :: Developers',  # Define that your audience are developers
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.7',  # Specify which Python versions you want to support
    ],
)

0 comments on commit 0291ce9

Please sign in to comment.