Skip to content

Commit

Permalink
first packaging
Browse files Browse the repository at this point in the history
  • Loading branch information
Chenghao Mou authored and Chenghao Mou committed Jun 11, 2019
1 parent 9bb5f40 commit 0291ce9
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 8 deletions.
17 changes: 17 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
MIT License
Copyright (c) 2018 Chenghao Mou
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
72 changes: 72 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Dependencies

```bash
unidecode
emoji
*kenlm
fuzzy
scikit-learn
pyxdameraulevenshtein
pygtrie
numpy
```

*Install kenlm wrapper from github:

```bash
pip install https://github.com/kpu/kenlm/archive/master.zip
```

# Usage

```python

# Load english dictionary
english_vocab = load_english_vocab(...)
english_vocab.update(load_english_vocab(...))

# Load bilingual lexicon dictionary
foreign_dict = load_lexicon_norm(...)

# Load target language model
lm = kenlm.Model(...)

# Train a ngram model if needed
# ngram_train(foreign_dict, 'hin-tfidf-ngram_algo')

# Ulf's romanizer
romanizer = partial(romanize,
romanization_path=...,
language_code="hin")

soundex_inst = fuzzy.DMetaphone()
soundex_algo = lambda x: soundex_inst(x)[0].decode('utf-8') if soundex_inst(x)[0] is not None else x
english_encoded_vocab = {e: soundex_algo(e) for e in english_vocab if e}

# load the ngram model
ngram_algo = pickle.loads(open(..., "rb").read())

soundex_model = partial(soundex_similarity,
encoded_english_vocab=english_encoded_vocab,
romanizer=romanizer,
soundex=soundex_algo)

lev_model = partial(lev_similarity, backup=soundex_model)
ngram_model = partial(ngram_similarity, model=ngram_algo, backup=lev_model)
final_model = partial(exact_similarity, backup=ngram_model)

for line in open(...):
source, target = line.strip('\n').split('\t')
oovs = extract_oov(target, source, english_vocab=english_vocab, romanization=True)
best, mods = translate_oov(target, oovs, foreign_dict, final_model, lm.score)

if best != target:

for oov in oovs:
alt = list(mods[oov].keys())[0]
trans = mods[oov][alt]
debug.debug(f"{romanizer(oov)} -> {romanizer(alt)} : {list(trans)}")

debug.debug(best)
debug.debug("*"*100)
```
17 changes: 9 additions & 8 deletions elisa_dnt/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,33 @@
parser.add_argument('--fa_output', type=str,
help="[Post]File path to the output file")

parser.add_argument('--fb_src', nargs=1,
parser.add_argument('--fb_src', type=str,
help='[Pre]File path to the source file')
parser.add_argument('--fb_src_output', nargs=1,
parser.add_argument('--fb_src_output', type=str,
help='[Pre]File path to the source output file')
parser.add_argument('--fb_ini_output', nargs=1,
parser.add_argument('--fb_ini_output', type=str,
help='[Pre]File path to the source ini file')
parser.add_argument('--fb_tgt', nargs=1, required=False,
parser.add_argument('--fb_tgt', type=str, required=False,
help='[Pre]File path to the target file')
parser.add_argument('--pb_cross', dest='pb_cross', default=False, action='store_true',
help='[Pre]Parameter for whether use reference target file for regex extraction')
parser.add_argument('--fb_visual', nargs=1,
parser.add_argument('--fb_visual', type=str,
help="[Pre]File path to visualization html file")

args = parser.parse_args()
print(args)

scheme = args.p_scheme

if args.p_step == "post":
restore(args.fa_dnt_src, args.fa_dnt_ini, args.fa_output, args.p_scheme)
exit(0)

RULES = {key: re.compile(value) for key, value in rules[args.scheme].items()}
RULES["comb"] = re.compile("(" + "|".join(rules[args.scheme].values()) + ")+")
RULES = {key: re.compile(value) for key, value in rules[args.p_scheme].items()}
RULES["comb"] = re.compile("(" + "|".join(rules[args.p_scheme].values()) + ")+")

if args.fb_visual:
with open(args.fb_visual[0], "w") as o:
with open(args.fb_visual, "w") as o:
o.write("""
<link href="https://fonts.googleapis.com/css?family=Source+Sans+Pro&display=swap&subset=cyrillic,cyrillic-ext,greek,greek-ext,latin-ext,vietnamese" rel="stylesheet">
<style>
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[metadata]
description-file = README.md
23 changes: 23 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from setuptools import setup

# Packaging metadata for the elisa-dnt ("Do Not Translate") package.
# The repository ships an MIT LICENSE.txt, so the license field is set
# explicitly to 'MIT' (it was previously empty, which contradicted the
# MIT classifier below).
setup(
    name='elisa-dnt',
    version='0.0.1',
    packages=['elisa_dnt'],
    url='https://github.com/ChenghaoMou/elisa-dnt',
    license='MIT',
    author='chenghaomou',
    author_email='[email protected]',
    description='Do Not Translate for machine translation',
    # Runtime dependencies only; the heavier optional tooling (kenlm,
    # fuzzy, scikit-learn, ...) documented in the README is not required
    # to install the package itself.
    install_requires=[
        'emoji',
        'regex',
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        # Choose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
        'Intended Audience :: Developers',  # Define that your audience are developers
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.7',  # Specify which Python versions you want to support
    ],
)

0 comments on commit 0291ce9

Please sign in to comment.