diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..3754756
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,17 @@
+MIT License
+Copyright (c) 2018 YOUR NAME
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c22dcac
--- /dev/null
+++ b/README.md
@@ -0,0 +1,72 @@
+# Dependencies
+
+```bash
+unidecode
+emoji
+*kenlm
+fuzzy
+scikit-learn
+pyxdameraulevenshtein
+pygtrie
+numpy
+```
+
+*Install the kenlm Python wrapper from GitHub:
+
+```bash
+pip install https://github.com/kpu/kenlm/archive/master.zip
+```
+
+# Usage
+
+```python
+# Standard and third-party imports used below; the remaining helpers
+# (load_english_vocab, load_lexicon_norm, romanize, ngram_train, extract_oov,
+# translate_oov, the *_similarity functions, and the `debug` logger) are
+# assumed to be provided by this repository.
+import pickle
+from functools import partial
+
+import fuzzy
+import kenlm
+
+# Load English dictionary
+english_vocab = load_english_vocab(...)
+english_vocab.update(load_english_vocab(...))
+
+# Load bilingual lexicon dictionary
+foreign_dict = load_lexicon_norm(...)
+
+# Load target language model
+lm = kenlm.Model(...)
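+# Note (assumption): kenlm.Model expects the path to a trained ARPA or binary
+# LM file for the target language; its score() method is what gets passed to
+# translate_oov() below to score candidate sentences. The recipe for training
+# that LM is not shown in this README.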
+
+# Train an n-gram model if needed
+# ngram_train(foreign_dict, 'hin-tfidf-ngram_algo')
+
+# Ulf's romanizer
+romanizer = partial(romanize,
+                    romanization_path=...,
+                    language_code="hin")
+
+soundex_inst = fuzzy.DMetaphone()
+soundex_algo = lambda x: soundex_inst(x)[0].decode('utf-8') if soundex_inst(x)[0] is not None else x
+english_encoded_vocab = {e: soundex_algo(e) for e in english_vocab if e}
+
+# Load the n-gram model
+ngram_algo = pickle.loads(open(..., "rb").read())
+
+soundex_model = partial(soundex_similarity,
+                        encoded_english_vocab=english_encoded_vocab,
+                        romanizer=romanizer,
+                        soundex=soundex_algo)
+
+# Back-off chain: exact match -> n-gram similarity -> edit distance -> soundex
+lev_model = partial(lev_similarity, backup=soundex_model)
+ngram_model = partial(ngram_similarity, model=ngram_algo, backup=lev_model)
+final_model = partial(exact_similarity, backup=ngram_model)
+
+for line in open(...):
+    source, target = line.strip('\n').split('\t')
+    oovs = extract_oov(target, source, english_vocab=english_vocab, romanization=True)
+    best, mods = translate_oov(target, oovs, foreign_dict, final_model, lm.score)
+
+    if best != target:
+        for oov in oovs:
+            alt = list(mods[oov].keys())[0]
+            trans = mods[oov][alt]
+            debug.debug(f"{romanizer(oov)} -> {romanizer(alt)} : {list(trans)}")
+
+        debug.debug(best)
+        debug.debug("*" * 100)
+```
\ No newline at end of file
diff --git a/elisa_dnt/__main__.py b/elisa_dnt/__main__.py
index 591694e..ed07e1a 100644
--- a/elisa_dnt/__main__.py
+++ b/elisa_dnt/__main__.py
@@ -20,20 +20,21 @@
     parser.add_argument('--fa_output', type=str,
                         help="[Post]File path to the output file")
-    parser.add_argument('--fb_src', nargs=1,
+    parser.add_argument('--fb_src', type=str,
                         help='[Pre]File path to the source file')
-    parser.add_argument('--fb_src_output', nargs=1,
+    parser.add_argument('--fb_src_output', type=str,
                         help='[Pre]File path to the source output file')
-    parser.add_argument('--fb_ini_output', nargs=1,
+    parser.add_argument('--fb_ini_output', type=str,
                         help='[Pre]File path to the source ini file')
-    parser.add_argument('--fb_tgt', nargs=1, required=False,
+    parser.add_argument('--fb_tgt', type=str, required=False,
                         help='[Pre]File path to the target file')
     parser.add_argument('--pb_cross', dest='pb_cross', default=False, action='store_true',
                         help='[Pre]Parameter for whether use reference target file for regex extraction')
-    parser.add_argument('--fb_visual', nargs=1,
+    parser.add_argument('--fb_visual', type=str,
                         help="[Pre]File path to visualization html file")

     args = parser.parse_args()
+    print(args)

     scheme = args.p_scheme

@@ -41,11 +42,11 @@
         restore(args.fa_dnt_src, args.fa_dnt_ini, args.fa_output, args.p_scheme)
         exit(0)

-    RULES = {key: re.compile(value) for key, value in rules[args.scheme].items()}
-    RULES["comb"] = re.compile("(" + "|".join(rules[args.scheme].values()) + ")+")
+    RULES = {key: re.compile(value) for key, value in rules[args.p_scheme].items()}
+    RULES["comb"] = re.compile("(" + "|".join(rules[args.p_scheme].values()) + ")+")

     if args.fb_visual:
-        with open(args.fb_visual[0], "w") as o:
+        with open(args.fb_visual, "w") as o:
             o.write("""