diff --git a/.gitignore b/.gitignore index 326de85..2cfd128 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ statistics.py report.md dist/* elisa_dnt.egg-info/* +/elisa.train_1.swa +/visual.html +/elisa.train_1.swa.del.ini diff --git a/elisa_dnt/__main__.py b/elisa_dnt/__main__.py index 02cbccd..0d644ee 100644 --- a/elisa_dnt/__main__.py +++ b/elisa_dnt/__main__.py @@ -8,74 +8,79 @@ parser = argparse.ArgumentParser(description='DNT process script') - parser.add_argument('p_step', type=str, choices=['pre', 'post'], + parser.add_argument('step', type=str, choices=['pre', 'post'], help="Parameter for choosing between preprocess or postprocess") - parser.add_argument('p_scheme', type=str, choices=['del', 'sub'], + parser.add_argument('scheme', type=str, choices=['del', 'sub'], help="Parameter for scheme") - parser.add_argument('--fa_dnt_src', type=str, + parser.add_argument('--dnt_src', type=str, help='[Post]File path to the dnt source file') - parser.add_argument('--fa_dnt_ini', type=str, + parser.add_argument('--dnt_ini', type=str, help="[Post]File path to the dnt conf file") - parser.add_argument('--fa_output', type=str, + parser.add_argument('--output', type=str, help="[Post]File path to the output file") - parser.add_argument('--fb_src', type=str, + parser.add_argument('--src', type=str, help='[Pre]File path to the source file') - parser.add_argument('--fb_src_output', type=str, + parser.add_argument('--src_output', type=str, help='[Pre]File path to the source output file') - parser.add_argument('--fb_ini_output', type=str, + parser.add_argument('--ini_output', type=str, help='[Pre]File path to the source ini file') - parser.add_argument('--fb_tgt', type=str, required=False, + parser.add_argument('--tgt', type=str, required=False, help='[Pre]File path to the target file') - parser.add_argument('--pb_cross', dest='pb_cross', default=False, action='store_true', + parser.add_argument('--cross', dest='pb_cross', default=False, action='store_true', help='[Pre]Parameter for whether use reference target file for regex extraction') - parser.add_argument('--fb_visual', type=str, + parser.add_argument('--visual', type=str, help="[Pre]File path to visualization html file") args = parser.parse_args() print(args) - scheme = args.p_scheme + scheme = args.scheme + + rules = load_rules(scheme=scheme) + options = generate_options() - if args.p_step == "post": - restore(args.fa_dnt_src, args.fa_dnt_ini, args.fa_output, args.p_scheme) + if args.step == "post": + restore(args.dnt_src, args.dnt_ini, args.output, args.scheme) exit(0) - RULES = {key: re.compile(value) for key, value in rules[args.p_scheme].items()} - RULES["comb"] = re.compile("(" + "|".join(rules[args.p_scheme].values()) + ")+") - - if args.fb_visual: - with open(args.fb_visual, "w") as o: + if args.visual: + with open(args.visual, "w") as o: o.write(""" - + + + + + + """) - path = args.fb_src + path = args.src - split(args.fb_src, args.fb_src_output, args.fb_ini_output, scheme=args.p_scheme, - ref=args.fb_tgt if args.p_scheme == "sub" and args.pb_cross else "", RULES=RULES) + split(args.src, args.src_output, args.ini_output, scheme=args.scheme, + ref=args.tgt if args.scheme == "sub" and args.pb_cross else "", rules=rules) - if args.fb_visual: - if args.fb_tgt == "": + if args.visual: + if args.tgt == "": for line in open(path): - matches = find(line, RULES) + matches = find(line, rules) if matches: - res = visual(line, matches, options, RULES) - with open(args.fb_visual, "a+") as o: + res = visual(line, matches, options, rules) + with open(args.visual, "a+") as o: o.write(f"

{res}

" + "\n") else: - src_lines, tgt_lines = open(path).readlines(), open(args.fb_tgt).readlines() + src_lines, tgt_lines = open(path).readlines(), open(args.tgt).readlines() assert len(src_lines) == len(tgt_lines) for src_line, tgt_line in zip(src_lines, tgt_lines): - src_matches = find(src_line, RULES) - tgt_matches = find(tgt_line, RULES) + src_matches = find(src_line, rules) + tgt_matches = find(tgt_line, rules) src_matches_text = [src_line[m.start:m.end] for m in src_matches] tgt_matches_text = [tgt_line[m.start:m.end] for m in tgt_matches] @@ -88,10 +93,13 @@ tgt_line[m.start:m.end] in x_matches] if args.pb_cross else tgt_matches if x_matches: - res = visual(src_line, x_src_matches, options, RULES) - with open(args.fb_visual, "a+") as o: + res = visual(src_line, x_src_matches, options, rules) + with open(args.visual, "a+") as o: o.write(f"

{res}

" + "\n") - res = visual(tgt_line, x_tgt_matches, options, RULES) - with open(args.fb_visual, "a+") as o: + res = visual(tgt_line, x_tgt_matches, options, rules) + with open(args.visual, "a+") as o: o.write(f"

{res}

" + "\n") + + with open(args.visual, "a+") as o: + o.write('') \ No newline at end of file diff --git a/elisa_dnt/emoji.py b/elisa_dnt/emoji.py new file mode 100644 index 0000000..49b25a0 --- /dev/null +++ b/elisa_dnt/emoji.py @@ -0,0 +1,44 @@ +# encoding: utf-8 +# Created by chenghaomou at 2019-06-27 + +from bs4 import BeautifulSoup +import urllib3 +import argparse + + +def get_emojis(url: str = "http://unicode.org/emoji/charts-12.0/full-emoji-list.html", + output: str = "emojis.ini") -> None: + """ + Parse the official website for all emojis and write them to a file. + + :param url: Official unicode website for emoji list. + + :param output: Output file for the list of emojis. + + :return: None. + + """ + req = urllib3.PoolManager() + res = req.request('GET', url) + soup = BeautifulSoup(res.data, "html.parser") + emojis = set('๐Ÿฆฐ๐Ÿฆฑ๐Ÿฆณ๐Ÿฆฒ๐Ÿป๐Ÿผ๐Ÿฝ๐Ÿพ๐Ÿฟ') + for img in soup.findAll('img', alt=True): + if len(img['alt']) == 1: + emojis.add('{}'.format(img['alt'])) + + with open(output, "w") as output: + output.write('\n'.join(emojis)) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Emoji list retriever from unicode website') + + parser.add_argument('--url', type=str, default="http://unicode.org/emoji/charts-12.0/full-emoji-list.html", + help='Unicode website for emoji') + parser.add_argument('--output', type=str, default='emojis.ini', + help='Output file') + + args = parser.parse_args() + + get_emojis(args.url, args.output) \ No newline at end of file diff --git a/elisa_dnt/emojis.ini b/elisa_dnt/emojis.ini new file mode 100644 index 0000000..e9d8f79 --- /dev/null +++ b/elisa_dnt/emojis.ini @@ -0,0 +1,1273 @@ +๐Ÿฒ +๐Ÿ“ฅ +๐Ÿž +๐Ÿ’จ +๐Ÿ†” +๐Ÿ˜ž +๐Ÿค• +๐Ÿ‚ +๐Ÿฅก +๐Ÿ•ด +๐Ÿ‹ +๐Ÿพ +๐Ÿค› +โ™ˆ +ใŠ— +๐Ÿš‚ +โž– +๐Ÿ‘ +๐Ÿ• +๐Ÿ”˜ +๐ŸŽฒ +โ™พ +๐Ÿฉ +โ›” +๐ŸŒŽ +๐Ÿฅณ +๐Ÿ +๐ŸŸฉ +๐Ÿฆข +๐ŸŽ› +โ›ธ +๐Ÿ™Š +โ›“ +๐ŸŒฎ +๐Ÿšƒ +๐Ÿ“ +๐Ÿ“ฝ +๐Ÿฐ +๐ŸŒ +๐Ÿ–ฑ +๐Ÿคธ +๐Ÿงž +โ€ผ +๐Ÿข +๐Ÿคฟ +๐Ÿˆฒ +๐ŸŽ— +๐Ÿš” +๐Ÿง… +๐Ÿ• +๐Ÿš +๐Ÿฆ” +๐Ÿšฅ +๐Ÿ‘ž +๐Ÿ +๐Ÿœ +๐ŸŠ +๐Ÿฅบ +๐Ÿค +โ› +๐Ÿงก +๐Ÿ™‹ +๐ŸŒฆ +๐Ÿ“† +๐Ÿ“š +๐Ÿ—ป +๐Ÿธ +โ“‚ +๐Ÿˆ‚ +๐Ÿถ +โšซ +๐Ÿ›‘ +๐Ÿงฐ +๐Ÿฅ +๐Ÿš“ +๐Ÿ•Œ +โ™Œ +๐Ÿ“ต +๐Ÿ˜— +๐Ÿ” +๐Ÿ‘ฆ +๐Ÿง› +๐Ÿง +๐ŸŒš +๐Ÿ—ฏ +๐Ÿฆต +๐Ÿ˜ด +๐Ÿ‡ +๐ŸŒ† +๐Ÿฆ• +ยฎ +๐Ÿฆ +๐Ÿง +๐Ÿคฅ +๐Ÿฆ +๐Ÿ“ˆ +๐Ÿ•› +๐Ÿ‘‚ +๐Ÿฆฎ +๐Ÿ“ +๐Ÿ˜… +๐Ÿ•ต +๐Ÿš +๐Ÿ–ผ +๐Ÿฅ‚ +๐Ÿช +๐Ÿ˜™ +๐Ÿ›‹ +๐Ÿ—ฝ +๐Ÿ•– +๐Ÿšฃ +๐Ÿ˜ต +๐ŸšŠ +๐Ÿ”• +๐Ÿงฟ +โ˜ฎ +๐Ÿ— +๐Ÿคน +๐ŸŽ +๐Ÿค™ +๐Ÿ’‘ +๐Ÿฅˆ +๐Ÿงด +๐ŸŒ™ +๐Ÿ“˜ +๐Ÿƒ +๐Ÿ‘ +โณ +๐Ÿšง +โฃ +๐Ÿง† +๐Ÿ›‚ +๐Ÿธ +๐Ÿงผ +๐Ÿ”ซ +๐Ÿ›ถ +๐Ÿ•น +๐Ÿ‘’ +๐Ÿ’– +๐Ÿฆ– +๐ŸŒป +๐Ÿ‘ +๐Ÿ•‹ +๐Ÿ‘ +โœˆ +๐Ÿฆž +๐Ÿ”œ +๐Ÿฏ +โฏ +โœ” +๐Ÿ– +๐Ÿงฌ +๐Ÿ‰ +๐Ÿ˜ฉ +โ˜˜ +๐Ÿค‘ +โ›ณ +๐Ÿ™‚ +๐ŸšŸ +๐Ÿ˜ฑ +๐Ÿ‘ฟ +๐Ÿ”‡ +๐ŸฆŸ +๐Ÿš +๐Ÿ˜„ +๐Ÿœ +๐Ÿšท +๐Ÿ˜ +๐Ÿš— +๐Ÿฉฒ +๐Ÿ˜ฎ +๐Ÿง +๐Ÿ’  +๐Ÿ› +๐Ÿ„ +๐Ÿ‘‡ +๐Ÿ›€ +๐Ÿ’” +โฌ† +๐Ÿ’€ +๐ŸŒ +๐Ÿ‘ฎ +๐ŸŽˆ +โ™ฅ +โฌ +๐Ÿ•Š +ใŠ™ +โ˜ฃ +๐Ÿ’‹ +๐Ÿ›ฅ +๐Ÿคผ +๐ŸŒ• +๐Ÿ˜ญ +๐ŸคŽ +๐Ÿ˜ƒ +๐Ÿ +โฌ‡ +๐Ÿ’ซ +๐Ÿ‘ต +๐Ÿ™Œ +๐Ÿ”ž +๐Ÿค +โ” +๐Ÿ—ƒ +๐Ÿ•ณ +๐Ÿ +๐Ÿ™Ž +โ›„ +๐ŸŒ +๐Ÿ‘จ +๐Ÿฅป +๐Ÿ•ฏ +๐Ÿ“ก +๐ŸŒ„ +๐ŸŒ› +๐Ÿฅ€ +๐Ÿ”บ +๐Ÿ‘ +๐Ÿงจ +๐Ÿ˜’ +โ„ +๐Ÿ“– +๐Ÿงฏ +๐Ÿ•œ +๐Ÿ‘ฅ +๐Ÿ“บ +๐Ÿ—ž +๐ŸŽ‰ +๐Ÿ’Ÿ +๐ŸŒŒ +๐ŸŸ +โ›ฐ +๐Ÿค’ +๐Ÿšน +โ˜ฏ +๐Ÿ˜  +๐ŸŒจ +๐Ÿฅฏ +๐Ÿ’ฑ +๐Ÿฅ  +๐Ÿฆผ +๐Ÿด +๐Ÿฆ +๐ŸงŽ +โฒ +๐Ÿฅ™ +๐Ÿณ +๐Ÿงฆ +๐Ÿ˜ธ +๐Ÿ˜˜ +๐Ÿ™‰ +๐Ÿˆต +๐Ÿคท +๐Ÿ˜ฐ +๐Ÿ• +๐Ÿšฒ +๐Ÿ– +๐Ÿ€ +โ–ซ +๐Ÿง“ +โ›Ž +๐Ÿ +๐Ÿ›• +๐Ÿš€ +๐ŸŸก +๐Ÿ‰ +๐Ÿ•  +๐Ÿ’ƒ +๐Ÿ“‹ +๐Ÿ•ง +๐ŸŽช +๐Ÿ‘ก +๐Ÿšต +๐Ÿš˜ +๐Ÿ“ฑ +โค +๐Ÿ˜ +โš• +๐Ÿ‘ฐ +๐Ÿฟ +๐Ÿ”ก +โ˜ฆ +๐Ÿš +โ›… +๐Ÿฅด +๐Ÿ—„ +๐ŸŽƒ +๐Ÿฅ› +๐Ÿ”ฐ +โ–ช +๐Ÿฅฐ +๐Ÿจ +๐Ÿˆน +๐Ÿ““ +๐ŸŠ +โ›ต +โš™ +๐Ÿฝ +๐Ÿ’„ +๐Ÿต +๐Ÿง€ +๐Ÿ’ฃ +โšฑ +๐ŸŒ +๐Ÿงš +๐ŸŒŸ +๐Ÿคซ +๐Ÿ” +๐Ÿบ +๐Ÿ”น +๐ŸŽฝ +โ™ +๐Ÿ’ก +๐Ÿงญ +๐Ÿ›… +๐Ÿ› +๐ŸŒž +๐Ÿ•“ +๐Ÿฝ +๐Ÿฅ +๐Ÿญ +๐Ÿ™ +๐Ÿ• +๐Ÿ“› +๐Ÿ’— +๐Ÿ‘ฏ +๐Ÿฅถ +๐Ÿ +โ†” +๐Ÿ•ž +๐Ÿฃ +๐ŸŽซ +๐Ÿ… +๐Ÿ’† +๐Ÿ’œ +๐Ÿค +๐Ÿงฅ +๐Ÿ˜ฒ +๐Ÿ‘ฒ +๐Ÿ‘ด +๐Ÿ˜‡ +โช +๐Ÿ’ +๐Ÿพ +๐Ÿ”ด +๐Ÿ”” +๐Ÿฅ‡ +๐Ÿค +๐ŸŒ +๐ŸŽผ +๐Ÿšฝ +๐Ÿ› +โŒจ +๐ŸŒผ +๐Ÿ˜Œ +๐Ÿง‡ +๐Ÿ”ท +๐Ÿšด +๐Ÿฆˆ +โซ +๐Ÿš… +๐Ÿ”ผ +๐Ÿป +โšœ +๐Ÿ‘‰ +โ™จ +โ—ป +๐Ÿˆด +๐Ÿ˜œ +๐Ÿšผ +๐ŸŒ‰ +๐Ÿ“ž +๐Ÿฅฑ +โš“ +๐Ÿ”… +๐Ÿ”‚ +๐ŸŒ… +๐Ÿ‘Ž +๐Ÿ‘” +๐Ÿ‘œ +๐Ÿฉธ +๐Ÿšš +๐Ÿ’ช +๐ŸŽŠ +๐Ÿ’ +โ—พ +๐Ÿ”‹ +๐Ÿ’ท +๐Ÿ™ƒ +๐Ÿง’ +๐Ÿงข +๐Ÿšณ +๐Ÿบ +โœ‰ +๐Ÿ‘ป +๐Ÿฉฑ +๐Ÿ’ฅ +๐Ÿšฉ +๐Ÿ‘ง +๐Ÿšฐ +๐ŸŽจ +๐Ÿจ +๐ŸŽถ +๐Ÿšž +๐ŸŽน +๐Ÿ“ผ +๐ŸŒฉ +๐Ÿง˜ +๐Ÿ€ +๐Ÿ‘ฑ +๐Ÿน +๐Ÿ’บ +โ˜€ +๐Ÿค +๐Ÿงท +๐Ÿ•ข +๐Ÿ”ฌ +๐Ÿช +๐ŸŒ’ +๐ŸŽŽ +๐Ÿ›ฌ +๐ŸŸฆ +๐ŸŒฟ +๐Ÿงต +๐Ÿ“ƒ +โฑ +๐Ÿ“ +๐Ÿ +๐Ÿ‘– +๐Ÿ +โ™€ +๐ŸŸจ +๐Ÿ–Š +๐Ÿค“ +โšพ +๐Ÿˆ +๐Ÿ…ฟ +๐Ÿ‘ผ +๐Ÿ“ณ +๐Ÿฏ +๐ŸŽ‡ +๐Ÿค– +๐ŸŒฒ +๐ŸŒ– +โœด +๐Ÿ“ข +๐Ÿ˜† +๐Ÿ’ป +๐Ÿงช +๐Ÿ˜€ +๐Ÿ™… +๐Ÿง +๐Ÿ‘— +๐Ÿ  +๐Ÿš +๐Ÿคฌ +๐ŸฆŠ +๐Ÿ˜› +๐Ÿ˜ +๐Ÿ›’ +๐Ÿงฉ +๐Ÿ“‚ +๐Ÿฆ‘ +๐Ÿ‘† +๐Ÿ‘ญ +๐Ÿต +๐Ÿคถ +โžก +๐Ÿ‘€ +๐Ÿ• +๐Ÿฅค +๐Ÿฆช +๐Ÿšพ +๐Ÿ”Œ +๐Ÿ›ฐ +๐Ÿฎ +๐Ÿ›น +โ†– +โ™’ +๐Ÿฅฌ +๐Ÿผ +๐Ÿ„ +๐Ÿ˜Ÿ +ใ€ฐ +๐Ÿฆ™ +๐Ÿ’ +๐Ÿฎ +๐Ÿ–ค +๐Ÿ•‘ +๐Ÿ”ฉ +โžฟ +๐Ÿงน +โฐ +โœŒ +๐Ÿ  +๐Ÿ‘ข +๐Ÿคญ +๐Ÿšข +๐Ÿ’ง +๐Ÿ‘Œ +๐Ÿฅฟ +๐Ÿ‘ถ +๐ŸŽพ +๐Ÿšจ +โœ… +๐Ÿคฎ +๐Ÿ˜ฅ +๐Ÿ’… +๐Ÿธ +๐Ÿ˜ฌ +๐Ÿจ +๐Ÿคจ +๐Ÿ†“ +๐Ÿ•ฐ +๐Ÿงœ +๐ŸŸช +๐Ÿ’š +๐Ÿฅฎ +๐Ÿง‘ +๐Ÿ“ +๐Ÿ˜ง +๐Ÿ’ต +๐Ÿ›ค +๐Ÿ” +๐Ÿš‹ +๐Ÿ +๐Ÿงค +โ™ป +๐Ÿ’“ +๐Ÿ’Š +๐Ÿฅฝ +๐Ÿ˜‰ +๐Ÿ‘ท +๐Ÿ›Ž +๐Ÿช +๐Ÿ•• +๐Ÿคข +๐Ÿง™ +โ†— +๐Ÿฉ +๐Ÿณ +๐Ÿฅจ +๐Ÿค +โฌ› +๐ŸŽป +๐Ÿ˜ฏ +๐Ÿ’พ +โ˜• +๐Ÿ“ฒ +๐Ÿ‘‹ +๐Ÿฅ +๐Ÿš‘ +๐Ÿ“ฎ +๐ŸŒœ +๐Ÿ’• +๐Ÿฐ +๐Ÿƒ +๐Ÿ›Œ +๐Ÿ‘ +๐Ÿ‡ +๐Ÿ”ฅ +๐ŸŽก +๐Ÿ’› +๐Ÿฅœ +๐ŸŒน +๐Ÿ“ฌ +๐Ÿ•บ +๐Ÿง— +๐Ÿ…ฐ +๐Ÿง  +๐Ÿงƒ +๐Ÿ’˜ +๐Ÿ’‰ +๐Ÿณ +โ›ฝ +๐Ÿ˜น +๐Ÿ•ถ +โ™ +๐Ÿฆณ +๐Ÿ˜ก +๐Ÿคช +๐ŸŽŸ +๐ŸŒพ +๐Ÿ˜ +๐Ÿงฒ +๐Ÿ‹ +๐Ÿ›ซ +๐Ÿž +โ›ˆ +๐Ÿ”  +๐Ÿงณ +๐Ÿซ +๐Ÿ‘™ +๐ŸŽฅ +๐Ÿฉ +๐Ÿ˜บ +โ˜‚ +๐Ÿฅ„ +๐Ÿš +โ™Ž +โšฐ +๐Ÿฆ’ +๐Ÿ—ณ +๐Ÿ†š +๐Ÿฆน +๐Ÿ‘ค +๐ŸŽฑ +๐Ÿ› +๐Ÿ†— +๐Ÿ +๐ŸŽ€ +๐ŸŒค +๐ŸŠ +๐Ÿšค +๐Ÿ… +๐Ÿ–– +๐ŸŸฅ +๐Ÿฆœ +๐Ÿ‡ +โšฝ +๐Ÿฏ +๐Ÿ”“ +๐Ÿฆ‚ +๐ŸŒท +๐Ÿ‘ฌ +๐ŸŒ˜ +โšก +๐Ÿ”ธ +๐Ÿ”š +๐Ÿ›ƒ +๐Ÿšซ +๐Ÿ†• +๐Ÿ›„ +๐ŸฅŒ +๐Ÿ“ฐ +โ—ผ +๐Ÿ˜‹ +๐Ÿ‘ณ +๐Ÿงง +โ„ข +๐Ÿ€„ +๐ŸŽง +๐Ÿš• +๐Ÿฆฐ +๐Ÿ‘ˆ +๐Ÿš† +๐Ÿฆ€ +๐Ÿ‘ธ +๐Ÿ’ณ +๐ŸŒ  +๐Ÿ +๐Ÿ™† +๐Ÿฅ +๐Ÿฌ +๐Ÿคฒ +โ‰ +๐Ÿ“ธ +ยฉ +โš› +๐Ÿ“ฆ +๐Ÿ›ฉ +๐ŸŽบ +๐Ÿ”Ž +๐Ÿ”จ +๐Ÿ–‹ +๐Ÿš‡ +๐Ÿฅƒ +โ‡ +๐Ÿค +๐Ÿง„ +๐Ÿ’ฎ +๐Ÿฌ +๐Ÿ’ฟ +๐Ÿ”ช +๐ŸŸซ +๐Ÿ‚ +๐Ÿ“Š +๐Ÿ‰ +๐Ÿฅš +๐Ÿฅ“ +๐Ÿฅญ +โš  +๐ŸงŠ +โœ’ +๐Ÿ—’ +๐Ÿ˜ +๐Ÿ“ +๐Ÿซ +๐Ÿš„ +๐Ÿฆƒ +๐Ÿฆ +๐Ÿ› +๐ŸŽ‘ +๐Ÿ–‡ +๐Ÿ”ฝ +๐Ÿ…พ +โ˜บ +๐Ÿ”ฏ +๐ŸŽฎ +๐Ÿ˜ฟ +๐Ÿค” +๐Ÿˆฏ +๐ŸŒบ +โ +โŒ +๐ŸŒ +๐Ÿฐ +๐Ÿ—ฟ +๐Ÿ’ถ +โ˜น +๐ŸŸค +๐Ÿ•ฆ +๐ŸŒ +๐Ÿก +โ›ท +โฌ… +๐Ÿฉน +๐Ÿฅž +๐Ÿคบ +๐Ÿ—พ +๐Ÿ’Ž +๐ŸŒˆ +๐Ÿก +๐ŸŽ‹ +๐ŸŽ‚ +๐Ÿ’ญ +๐Ÿšฑ +๐Ÿฅพ +๐ŸงŸ +๐Ÿฒ +๐Ÿšถ +๐Ÿ’Œ +โœ‹ +๐ŸŽฉ +๐Ÿคณ +๐Ÿ— +โ—€ +๐Ÿšบ +๐Ÿฆ +๐Ÿ˜พ +๐Ÿฉณ +๐Ÿ’ฆ +๐Ÿ’ฏ +๐Ÿ„ +๐Ÿ•ฃ +๐ŸŒซ +๐Ÿฃ +๐Ÿš– +โœ +๐Ÿ“ +โฉ +๐Ÿฆ“ +๐Ÿฆฑ +๐Ÿ›ฃ +๐Ÿง– +๐Ÿงˆ +๐Ÿ•ค +๐Ÿฆ  +โ†ช +๐Ÿ”ƒ +โŒš +๐Ÿ–Œ +๐Ÿšฎ +๐Ÿ”ญ +๐Ÿ– +๐Ÿ“ฉ +๐Ÿฆ +๐Ÿ˜ผ +๐Ÿ +๐Ÿ‘บ +๐Ÿ€ +โœ +๐Ÿ”– +๐Ÿคด +๐Ÿช€ +๐Ÿˆ +๐Ÿฆ +๐Ÿฆ„ +๐Ÿซ +๐Ÿฆ† +๐ŸŽ +๐Ÿž +๐Ÿฆพ +๐ŸŒ‡ +๐ŸฆŽ +๐Ÿฅฅ +๐Ÿฅ˜ +๐Ÿ˜ป +๐Ÿ—จ +๐Ÿš  +โ™ +๐Ÿงบ +๐Ÿ“— +๐Ÿ˜ฆ +๐Ÿšช +๐Ÿ•’ +๐Ÿ“ฏ +๐ŸŽฐ +๐Ÿ“ด +๐ŸŽ’ +๐Ÿ—“ +โ™‹ +๐ŸŽ +๐Ÿฆ‰ +๐ŸŒ‹ +๐Ÿ”ง +๐Ÿ”ข +๐Ÿ˜จ +๐Ÿ›  +๐ŸคŸ +๐ŸŒ€ +๐Ÿฝ +โ˜  +๐Ÿ›ก +๐Ÿฑ +๐Ÿบ +๐Ÿ“ถ +๐Ÿ˜ˆ +๐Ÿ’ +๐Ÿฉฐ +๐Ÿงฝ +๐Ÿ’ค +๐Ÿฅ +โ˜ƒ +๐Ÿ˜ +๐Ÿง +๐Ÿ’ž +๐Ÿ‰ +๐Ÿ”ฃ +๐Ÿช +๐Ÿ—ฃ +๐Ÿˆธ +๐Ÿ”™ +๐Ÿ™ +๐Ÿ”€ +๐Ÿ˜ข +๐Ÿ“ฟ +๐Ÿช“ +๐ŸŒ‘ +๐Ÿšธ +๐Ÿšป +๐ŸŒง +๐Ÿ” +๐Ÿšœ +๐Ÿ˜ฃ +๐Ÿ†‘ +โž— +๐Ÿ‘พ +๐Ÿ—‘ +โ™  +๐Ÿ’ +๐Ÿฆจ +๐ŸŽด +๐Ÿˆถ +โ™Š +๐Ÿ˜ฝ +๐ŸŒช +๐Ÿฅ +๐ŸŒฅ +๐Ÿฆก +๐Ÿ”„ +๐Ÿง‰ +๐Ÿคพ +๐ŸšŒ +๐Ÿ•ท +๐Ÿ˜‘ +๐Ÿ•ก +๐Ÿ•˜ +๐Ÿ› +ใ€ฝ +๐Ÿฆป +๐Ÿˆท +๐Ÿ˜ช +โ˜‘ +โคด +๐Ÿ“  +๐Ÿฆง +๐Ÿ”† +๐Ÿฃ +๐Ÿ™ +๐Ÿ”Ÿ +๐Ÿ“จ +โคต +๐ŸฆŒ +โ›ด +โ˜” +๐Ÿ”ฑ +๐Ÿ’ผ +โšช +๐Ÿญ +๐Ÿง‚ +๐Ÿ•‰ +๐Ÿ˜ถ +๐Ÿ˜ซ +๐Ÿค— +๐Ÿ˜“ +๐ŸŽ +๐ŸŽ„ +๐Ÿ˜– +๐Ÿฅ— +๐ŸŽ™ +โ˜ธ +๐Ÿผ +๐Ÿฉบ +๐Ÿ‰‘ +๐Ÿ’ฝ +๐Ÿ›ณ +๐Ÿป +๐Ÿ†’ +๐Ÿง• +๐Ÿน +๐Ÿ—‚ +๐Ÿ†™ +๐Ÿ”ป +โ™‚ +๐Ÿ’ด +๐Ÿ•š +๐Ÿพ +๐ŸŒ” +๐Ÿ˜Ž +๐Ÿฅ‰ +๐Ÿคก +๐Ÿ•ฅ +โžฐ +๐ŸŒธ +โ™ฃ +๐Ÿฅซ +๐Ÿงธ +โญ +๐ŸŒฐ +๐Ÿค˜ +๐ŸŒต +๐ŸŽท +๐ŸŽข +๐Ÿ‘• +๐Ÿ”‘ +๐Ÿ†Ž +๐Ÿ’‡ +๐Ÿฆ˜ +๐Ÿคค +๐Ÿ˜ +๐Ÿฅต +๐Ÿ“ +๐Ÿ“‡ +๐Ÿ”ณ +๐Ÿ” +๐Ÿฅ” +๐Ÿ”ฎ +๐Ÿ“Ÿ +๐Ÿคœ +โœ‚ +๐Ÿ”ฒ +๐Ÿฅ‹ +๐Ÿ‘… +๐Ÿ’ธ +โ†˜ +๐ŸŒถ +๐Ÿ’ˆ +๐Ÿท +๐ŸฅŸ +๐Ÿ™ˆ +๐Ÿš‰ +๐Ÿˆ +๐Ÿœ +๐ŸŒฝ +๐Ÿ•— +๐Ÿฆฟ +โœก +๐ŸŒญ +๐Ÿ“‰ +๐Ÿ… +๐Ÿ‘„ +๐ŸŸฃ +๐Ÿช” +๐Ÿ”ค +๐Ÿ“Ž +๐Ÿ’‚ +๐Ÿšก +๐Ÿ’ฉ +๐Ÿ™‡ +๐ŸŽธ +๐Ÿ‘› +๐Ÿ—ก +๐ŸŽ +โ†™ +๐ŸŽ… +๐Ÿฆฏ +๐Ÿ”‰ +โŽ +๐Ÿฆš +โ›‘ +๐ŸŒ“ +๐ŸšŽ +๐Ÿ˜ +๐Ÿšฏ +๐Ÿ’ฒ +โ— +๐Ÿ“„ +โ• +๐Ÿ“ง +โ™‰ +๐Ÿท +๐Ÿ–ฅ +๐Ÿงฎ +๐Ÿš™ +๐ŸŽš +๐ŸŒ +๐Ÿ˜• +๐Ÿงพ +๐Ÿ— +โ™Ÿ +๐Ÿ˜ณ +๐Ÿฅฆ +๐Ÿง” +๐Ÿช‚ +๐Ÿ“• +๐Ÿงป +โ˜ +๐Ÿ”Š +๐Ÿ’น +โญ• +๐Ÿคฉ +๐Ÿ’ฌ +๐Ÿ“‘ +๐Ÿ‘ช +๐Ÿ•™ +๐Ÿฆ— +โ›ฒ +๐Ÿ–ฒ +โž• +๐Ÿ–จ +๐Ÿฅผ +โš” +๐Ÿผ +๐Ÿˆบ +๐ŸŒฑ +๐Ÿ† +๐Ÿ‘ฃ +๐Ÿ”ˆ +๐Ÿคš +๐Ÿฅ… +๐Ÿ’’ +๐Ÿš’ +๐Ÿช +๐Ÿ”› +๐Ÿ˜Š +๐Ÿ›บ +๐Ÿ“… +๐Ÿ‘น +๐Ÿ“ค +โ“ +๐Ÿ“œ +๐Ÿ‘ƒ +๐Ÿฅข +โ›ฑ +๐ŸŽฃ +๐Ÿ•ธ +๐Ÿ“ญ +๐Ÿ—œ +๐ŸŽž +๐Ÿ”ถ +๐Ÿคฏ +๐Ÿ’ข +โœ +๐Ÿ›ท +๐ŸŒณ +๐Ÿ•Ÿ +๐Ÿ•Ž +๐Ÿท +๐Ÿšฌ +๐Ÿญ +๐Ÿ‘ฉ +โ†• +๐Ÿ’™ +๐Ÿด +๐ŸŒŠ +๐Ÿงถ +๐ŸŽฆ +๐Ÿˆณ +๐Ÿ˜ค +๐Ÿ†˜ +๐Ÿ“™ +๐Ÿฆฅ +๐ŸŽฌ +๐Ÿ”’ +๐Ÿ“ท +๐Ÿ‘˜ +๐ŸฅŽ +โ—ฝ +๐Ÿ˜š +๐Ÿข +๐Ÿฎ +๐Ÿงซ +๐Ÿฆฉ +๐Ÿšฆ +๐ŸŒ‚ +๐Ÿฅ‘ +๐Ÿƒ +๐Ÿ† +๐Ÿช’ +๐Ÿ› +๐ŸŸง +๐ŸŽต +โบ +โŒ› +๐Ÿ“’ +๐Ÿป +๐ŸŽ +๐Ÿ›ธ +๐Ÿง +๐Ÿ”ต +๐Ÿ” +๐Ÿง +๐Ÿข +๐Ÿ—บ +๐Ÿƒ +๐ŸŒก +โœŠ +๐Ÿšˆ +โ–ถ +๐Ÿคต +โฎ +๐Ÿ +๐Ÿฆฆ +๐Ÿฆท +๐ŸŒด +๐Ÿ’ +โš– +๐Ÿš +๐Ÿ™ +๐Ÿ›ด +๐Ÿ–• +๐Ÿฆ‡ +๐ŸŸ  +๐Ÿ’ฐ +๐Ÿ™€ +๐Ÿคฑ +๐Ÿฅ– +๐ŸŽณ +๐Ÿ›ต +๐ŸŽ +๐Ÿ +โ˜ +๐Ÿ– +๐Ÿ“ป +๐Ÿงฃ +๐ŸŽค +๐Ÿ– +๐Ÿ™„ +๐Ÿ“Œ +๐Ÿšญ +โ™ฆ +๐Ÿฆ +๐Ÿฅ +โ›ช +๐ŸŒฏ +๐Ÿ” +โ™‘ +๐Ÿ“” +๐ŸŽŒ +๐Ÿ‚ +๐Ÿฅฃ +๐Ÿšฟ +๐Ÿ“ฃ +๐Ÿ˜‚ +๐Ÿ”ฆ +๐Ÿ“€ +๐Ÿ“ช +๐Ÿช‘ +๐Ÿฑ +๐Ÿก +โ˜ช +๐Ÿฆบ +๐ŸŽ“ +๐ŸŒฌ +๐Ÿคง +๐ŸฅŠ +โ™“ +๐Ÿฆ… +โš’ +โ™ฟ +๐Ÿคฆ +โฌœ +๐Ÿฌ +๐ŸŽ† +๐Ÿคฝ +โ›บ +โ˜Ž +๐Ÿˆ +๐ŸŽญ +๐ŸŽ  +๐Ÿ‘ซ +โ›น +โ†ฉ +๐Ÿน +๐Ÿ‹ +๐Ÿฅ’ +๐Ÿฆ› +๐Ÿ‘ +๐Ÿฟ +๐Ÿ‘Ÿ +๐Ÿคฃ +๐Ÿต +๐Ÿ‘Š +๐Ÿ†– +๐Ÿ…ฑ +๐Ÿช• +๐Ÿฆด +๐Ÿ• +๐Ÿฟ +๐Ÿ’ +๐Ÿฆธ +๐Ÿ† +๐Ÿฅฉ +๐Ÿ‘š +๐Ÿ—ผ +๐Ÿˆš +๐Ÿ“ +โ˜ข +๐Ÿ™ +๐ŸŽฏ +๐ŸŸข +๐Ÿ‘  +โธ +๐ŸŸ +โน +๐Ÿ‘‘ +๐Ÿฅง +๐Ÿคฐ +๐ŸŒƒ +๐Ÿ  +๐Ÿ•” +โญ +๐Ÿฅช +๐Ÿฆฒ +๐Ÿ“น +๐Ÿ˜” +๐Ÿด +โœจ +๐Ÿ˜ท +๐Ÿฆ‹ +๐ŸŽ– +๐Ÿ +๐Ÿ›ข +๐Ÿฆฝ +๐Ÿš› +๐Ÿ’ +๐Ÿง +โ„น +โ›ฉ +๐Ÿถ +๐Ÿ”— +๐Ÿ“ซ +๐Ÿ™ +โœ– +๐Ÿ— +๐ŸŒ— +๐Ÿฅ• +๐Ÿ‘ฝ +๐Ÿคž +๐Ÿฅ +โœณ +๐Ÿค  +โš— +๐Ÿ‘“ +๐Ÿฆถ +๐Ÿ” +๐Ÿ‘ +๐Ÿš +๐ŸŽฟ +๐Ÿงฑ +โ˜„ +๐ŸŸ \ No newline at end of file diff --git a/elisa_dnt/rules.ini b/elisa_dnt/rules.ini new file mode 100644 index 0000000..b8cca94 --- /dev/null +++ b/elisa_dnt/rules.ini @@ -0,0 +1,10 @@ +email=(?:(?:[a-zA-Z0-9_\-\.]+)@(?:(?:\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(?:(?:[a-zA-Z0-9\-]+\.)+))(?:[a-zA-Z]{2,4}|[0-9]{1,3})(?:\]?)) +url=(?:(?:(?i:(?:ht|f)tps?)\://)?(?:(?i:www|[a-zA-Z0-9-])\.)?[a-zA-Z0-9-.]+\.(?:TRAVELERSINSURANCE|NORTHWESTERNMUTUAL|SANDVIKCOROMANT|KERRYPROPERTIES|AMERICANEXPRESS|WEATHERCHANNEL|KERRYLOGISTICS|COOKINGCHANNEL|CANCERRESEARCH|BANANAREPUBLIC|AMERICANFAMILY|AFAMILYCOMPANY|WOLTERSKLUWER|TRAVELCHANNEL|SPREADBETTING|LIFEINSURANCE|INTERNATIONAL|VERSICHERUNG|SCHOLARSHIPS|LPLFINANCIAL|CONSTRUCTION|WILLIAMHILL|RIGHTATHOME|REDUMBRELLA|PROGRESSIVE|PRODUCTIONS|PLAYSTATION|PHOTOGRAPHY|OLAYANGROUP|MOTORCYCLES|LAMBORGHINI|KERRYHOTELS|INVESTMENTS|FOODNETWORK|ENTERPRISES|ENGINEERING|CREDITUNION|CONTRACTORS|CALVINKLEIN|BRIDGESTONE|BLOCKBUSTER|BLACKFRIDAY|BARCLAYCARD|ACCOUNTANTS|VOLKSWAGEN|VLAANDEREN|VISTAPRINT|UNIVERSITY|TELEFONICA|TECHNOLOGY|TATAMOTORS|SWIFTCOVER|SCHAEFFLER|RESTAURANT|REPUBLICAN|REALESTATE|PRUDENTIAL|PROTECTION|PROPERTIES|ONYOURSIDE|NEXTDIRECT|NEWHOLLAND|NATIONWIDE|MITSUBISHI|MANAGEMENT|INDUSTRIES|IMMOBILIEN|HEALTHCARE|FOUNDATION|EXTRASPACE|EUROVISION|CUISINELLA|CREDITCARD|CONSULTING|CAPITALONE|BOEHRINGER|BNPPARIBAS|BASKETBALL|ASSOCIATES|APARTMENTS|ACCOUNTANT|YODOBASHI|VACATIONS|TRAVELERS|STOCKHOLM|STATEFARM|STATEBANK|SOLUTIONS|SHANGRILA|SCJOHNSON|RICHARDLI|PRAMERICA|PASSAGENS|PANASONIC|MICROSOFT|MELBOURNE|MARSHALLS|MARKETING|LIFESTYLE|LANDROVER|LANCASTER|LADBROKES|KUOKGROUP|INSURANCE|INSTITUTE|HONEYWELL|HOMESENSE|HOMEGOODS|HOMEDEPOT|HISAMITSU|GOODHANDS|GOLDPOINT|FURNITURE|FUJIXEROX|FRONTDOOR|FRESENIUS|FIRESTONE|FINANCIAL|FAIRWINDS|EQUIPMENT|EDUCATION|DIRECTORY|COMMUNITY|CHRISTMAS|BLOOMBERG|BARCELONA|AQUARELLE|ANALYTICS|AMSTERDAM|ALLFINANZ|ALFAROMEO|ACCENTURE|YOKOHAMA|WOODSIDE|VERISIGN|VENTURES|VANGUARD|UCONNECT|TRAINING|TELECITY|SYMANTEC|SUPPLIES|STCGROUP|SOFTWARE|SOFTBANK|SHOWTIME|SHOPPING|SERVICES|SECURITY|SAMSCLUB|SAARLAND|RELIANCE|REDSTONE|PROPERTY|PLUMBING|PICTURES|PHARMACY|PARTNERS|OBSERVER|MOVISTAR|MORTGAGE|MERCKMSD|MEMORIAL|MCKINSEY|MASERATI|MARRIOTT|LUNDBECK|LIGHTING|JPMORGAN|ISTANBUL|IPIRANGA|INFINITI|HOSPITAL|HOLDINGS|HELSINKI|HDFCBANK|GUARDIAN|GRAPHICS|GRAINGER|GOODYEAR|FRONTIER|FOOTBALL|FIRMDALE|FIDELITY|FEEDBACK|EXCHANGE|EVERBANK|ETISALAT|ESURANCE|ERICSSON|ENGINEER|DOWNLOAD|DISCOVER|DISCOUNT|DIAMONDS|DEMOCRAT|DELOITTE|DELIVERY|COMPUTER|COMMBANK|CLOTHING|CLINIQUE|CLEANING|CITYEATS|CIPRIANI|CHRYSLER|CATHOLIC|CATERING|CAPETOWN|BUSINESS|BUILDERS|BUDAPEST|BRUSSELS|BROADWAY|BRADESCO|BOUTIQUE|BASEBALL|BARGAINS|BAREFOOT|BARCLAYS|ATTORNEY|ALLSTATE|AIRFORCE|ABUDHABI|ZUERICH|YOUTUBE|YAMAXUN|XFINITY|WINNERS|WINDOWS|WHOSWHO|WEDDING|WEBSITE|WEATHER|WATCHES|WANGGOU|WALMART|TRADING|TOSHIBA|TIFFANY|TICKETS|THEATRE|THEATER|TEMASEK|SYSTEMS|SURGERY|SUPPORT|STORAGE|STATOIL|STARHUB|STAPLES|SPIEGEL|SINGLES|SHRIRAM|SHIKSHA|SCIENCE|SCHWARZ|SCHMIDT|SANDVIK|SAMSUNG|REXROTH|REVIEWS|RENTALS|RECIPES|REALTOR|POLITIE|PIONEER|PHILIPS|PANERAI|ORIGINS|ORGANIC|OLDNAVY|OKINAWA|NEUSTAR|NETWORK|NETFLIX|NETBANK|MONSTER|METLIFE|MARKETS|LINCOLN|LIMITED|LIAISON|LECLERC|LATROBE|LASALLE|LANXESS|LANCOME|LACAIXA|KOMATSU|KITCHEN|JUNIPER|JEWELRY|ISMAILI|ISELECT|HYUNDAI|HOTMAIL|HOTELES|HOSTING|HOLIDAY|HITACHI|HANGOUT|HAMBURG|GUITARS|GROCERY|GODADDY|GENTING|GALLERY|FUJITSU|FROGANS|FORSALE|FLOWERS|FLORIST|FLIGHTS|FITNESS|FISHING|FINANCE|FERRERO|FERRARI|FASHION|FARMERS|EXPRESS|EXPOSED|DOMAINS|DIGITAL|DENTIST|CRUISES|CRICKET|COURSES|COUPONS|COUNTRY|CORSICA|COOKING|CONTACT|COMPARE|COMPANY|COMCAST|COLOGNE|COLLEGE|CLUBMED|CITADEL|CHINTAI|CHANNEL|CARTIER|CAREERS|CARAVAN|CAPITAL|BUGATTI|BROTHER|BOOKING|BESTBUY|BENTLEY|BAUHAUS|BANAMEX|AVIANCA|AUSPOST|AUDIBLE|AUCTION|ATHLETA|ANDROID|ALIBABA|AGAKHAN|ACADEMY|ABOGADO|ZAPPOS|YANDEX|YACHTS|XPERIA|XIHUAN|WEBCAM|WARMAN|WALTER|VUELOS|VOYAGE|VOTING|VISION|VIRGIN|VILLAS|VIKING|VIAJES|UNICOM|TRAVEL|TOYOTA|TKMAXX|TJMAXX|TIENDA|TENNIS|TATTOO|TARGET|TAOBAO|TAIPEI|SYDNEY|SWATCH|SUZUKI|SUPPLY|STUDIO|STREAM|SOCIAL|SOCCER|SHOUJI|SELECT|SECURE|SEARCH|SCHULE|SCHOOL|SANOFI|SAKURA|SAFETY|RYUKYU|ROGERS|ROCHER|REVIEW|REPORT|REPAIR|REISEN|REALTY|RACING|QUEBEC|PICTET|PIAGET|PHYSIO|PHOTOS|PFIZER|OTSUKA|ORANGE|ORACLE|ONLINE|OLAYAN|OFFICE|NOWRUZ|NORTON|NISSAY|NISSAN|NATURA|NAGOYA|MUTUAL|MUSEUM|MOSCOW|MORMON|MONASH|MOBILY|MOBILE|MATTEL|MARKET|MAKEUP|MAISON|MADRID|LUXURY|LONDON|LOCKER|LIVING|LEFRAK|LAWYER|LATINO|LANCIA|KOSHER|KINDLE|KINDER|KAUFEN|JUEGOS|JOBURG|JAGUAR|INTUIT|INSURE|IMAMAT|HUGHES|HOTELS|HOCKEY|HIPHOP|HERMES|HEALTH|GRATIS|GOOGLE|GLOBAL|GIVING|GEORGE|GARDEN|GALLUP|FUTBOL|FLICKR|FAMILY|EXPERT|EVENTS|ESTATE|ENERGY|EMERCK|DURBAN|DUPONT|DUNLOP|DOCTOR|DIRECT|DESIGN|DENTAL|DEGREE|DEALER|DATSUN|DATING|CRUISE|CREDIT|COUPON|CONDOS|COMSEC|COFFEE|CLINIC|CLAIMS|CIRCLE|CHURCH|CHROME|CHANEL|CENTER|CASINO|CASEIH|CAREER|CAMERA|BROKER|BOSTON|BOSTIK|BLANCO|BHARTI|BERLIN|BEAUTY|BAYERN|AUTHOR|ARAMCO|ANQUAN|ALSTOM|ALSACE|ALIPAY|AIRTEL|AIRBUS|AGENCY|AFRICA|ACTIVE|ABBVIE|ABBOTT|ABARTH|ZIPPO|YAHOO|XEROX|WORLD|WORKS|WEIBO|WEBER|WATCH|WALES|VOLVO|VODKA|VISTA|VIDEO|VEGAS|UBANK|TUSHU|TUNES|TRUST|TRADE|TOURS|TOTAL|TORAY|TOOLS|TOKYO|TODAY|TMALL|TIROL|TIRES|TATAR|SWISS|SUCKS|STYLE|STUDY|STORE|STADA|SPORT|SPACE|SOLAR|SMILE|SMART|SLING|SKYPE|SHOES|SHELL|SHARP|SEVEN|SENER|SALON|RUGBY|RODEO|ROCKS|RICOH|REISE|REHAB|RADIO|QUEST|PROMO|PRIME|PRESS|PRAXI|POKER|PLACE|PIZZA|PHOTO|PHONE|PARTY|PARTS|PARIS|OSAKA|OMEGA|NOWTV|NOKIA|NINJA|NIKON|NEXUS|NADEX|MOVIE|MOPAR|MONEY|MIAMI|MEDIA|MANGO|MACYS|LUPIN|LOTTO|LOTTE|LOCUS|LOANS|LIXIL|LIPSY|LINDE|LILLY|LEXUS|LEGAL|LEASE|LAMER|KYOTO|KOELN|JETZT|IVECO|IRISH|INTEL|IKANO|HYATT|HOUSE|HORSE|HONDA|HOMES|GUIDE|GUCCI|GROUP|GRIPE|GREEN|GMAIL|GLOBO|GLASS|GLADE|GIVES|GIFTS|GAMES|GALLO|FORUM|FOREX|FINAL|FEDEX|FAITH|EPSON|EPOST|EMAIL|EDEKA|EARTH|DUBAI|DRIVE|DODGE|DELTA|DEALS|DANCE|DABUR|CYMRU|CROWN|CODES|COACH|CLOUD|CLICK|CITIC|CISCO|CHEAP|CHASE|CARDS|CANON|BUILD|BOSCH|BOATS|BLACK|BINGO|BIBLE|BEATS|BAIDU|AZURE|AUTOS|AUDIO|ARCHI|APPLE|AMICA|AMFAM|AETNA|ADULT|ACTOR|ZONE|ZERO|ZARA|YOGA|XBOX|WORK|WINE|WIKI|WIEN|WEIR|WANG|VOTO|VOTE|VIVO|VIVA|VISA|VANA|TUBE|TOYS|TOWN|TIPS|TIAA|TEVA|TECH|TEAM|TAXI|TALK|SURF|STAR|SPOT|SONY|SONG|SOHU|SNCF|SKIN|SITE|SINA|SILK|SHOW|SHOP|SHIA|SHAW|SEXY|SEEK|SEAT|SCOT|SCOR|SAXO|SAVE|SARL|SAPO|SALE|SAFE|RUHR|RSVP|ROOM|RMIT|RICH|REST|RENT|REIT|READ|RAID|QPON|PROF|PROD|POST|PORN|POHL|PLUS|PLAY|PINK|PING|PICS|PCCW|PARS|PAGE|OPEN|OLLO|NIKE|NICO|NEXT|NEWS|NAVY|NAME|MOTO|MODA|MOBI|MINT|MINI|MENU|MEME|MEET|MAIF|LUXE|LTDA|LOVE|LOFT|LOAN|LIVE|LINK|LIMO|LIKE|LIFE|LIDL|LGBT|LEGO|LAND|KRED|KPMG|KIWI|KDDI|JPRS|JOBS|JEEP|JAVA|ITAU|INFO|IMMO|IMDB|IEEE|ICBC|HSBC|HOST|HGTV|HERE|HELP|HDFC|HAUS|HAIR|GURU|GUGE|GOOG|GOLF|GOLD|GMBH|GIFT|GGEE|GENT|GBIZ|GAME|FUND|FREE|FORD|FOOD|FLIR|FISH|FIRE|FILM|FIDO|FIAT|FAST|FARM|FANS|FAIL|FAGE|ERNI|DVAG|DUNS|DUCK|DOHA|DOCS|DISH|DIET|DESI|DELL|DEAL|DCLK|DATE|DATA|CYOU|COOP|COOL|CLUB|CITY|CITI|CHAT|CERN|CBRE|CASH|CASE|CASA|CARS|CARE|CAMP|CALL|CAFE|BUZZ|BOOK|BOND|BOFA|BLUE|BLOG|BING|BIKE|BEST|BEER|BBVA|BANK|BAND|BABY|AUTO|AUDI|ASIA|ASDA|ARTE|ARPA|ARMY|ARAB|AMEX|ALLY|AKDN|AIGO|AERO|ADAC|ABLE|AARP|ZIP|YUN|YOU|XYZ|XXX|XIN|WTF|WTC|WOW|WME|WIN|WED|VIP|VIN|VIG|VET|UPS|UOL|UNO|UBS|TVS|TUI|TRV|TOP|TJX|THD|TEL|TDK|TCI|TAX|TAB|STC|SRT|SRL|SOY|SKY|SKI|SFR|SEX|SEW|SES|SCB|SCA|SBS|SBI|SAS|SAP|RWE|RUN|RIP|RIO|RIL|REN|RED|QVC|PWC|PUB|PRU|PRO|PNC|PIN|PID|PHD|PET|PAY|OVH|OTT|ORG|OOO|ONL|ONG|ONE|OFF|OBI|NYC|NTT|NRW|NRA|NOW|NHK|NGO|NFL|NEW|NET|NEC|NBA|NAB|MTR|MTN|MSD|MOV|MOM|MOI|MOE|MMA|MLS|MLB|MIT|MIL|MEO|MEN|MED|MBA|MAP|MAN|LTD|LPL|LOL|LLC|LDS|LAW|LAT|KRD|KPN|KIM|KIA|KFH|JOY|JOT|JNJ|JMP|JLL|JLC|JIO|JCP|JCB|IWC|ITV|IST|INT|INK|ING|IFM|ICU|ICE|IBM|HOW|HOT|HKT|HIV|HBO|GOV|GOT|GOP|GOO|GMX|GMO|GLE|GEA|GDN|GAP|GAL|FYI|FUN|FTR|FRL|FOX|FOO|FLY|FIT|FAN|EUS|ESQ|EDU|ECO|EAT|DVR|DTV|DOT|DOG|DNP|DIY|DHL|DEV|DDS|DAY|DAD|CSC|CRS|COM|CFD|CFA|CEO|CEB|CBS|CBN|CBA|CAT|CAR|CAM|CAL|CAB|BZH|BUY|BOX|BOT|BOO|BOM|BNL|BMW|BMS|BIZ|BIO|BID|BET|BCN|BCG|BBT|BBC|BAR|AXA|AWS|ART|APP|AOL|ANZ|AIG|AFL|AEG|ADS|ACO|ABC|ABB|AAA|ZW|ZM|ZA|YT|YE|WS|WF|VU|VN|VI|VG|VE|VC|VA|UZ|UY|US|UK|UG|UA|TZ|TW|TV|TT|TR|TO|TN|TM|TL|TK|TJ|TH|TG|TF|TD|TC|SZ|SY|SX|SV|SU|ST|SR|SO|SN|SM|SL|SK|SJ|SI|SH|SG|SE|SD|SC|SB|SA|RW|RU|RS|RO|RE|QA|PY|PW|PT|PS|PR|PN|PM|PL|PK|PH|PG|PF|PE|PA|OM|NZ|NU|NR|NP|NO|NL|NI|NG|NF|NE|NC|NA|MZ|MY|MX|MW|MV|MU|MT|MS|MR|MQ|MP|MO|MN|MM|ML|MK|MH|MG|ME|MD|MC|MA|LY|LV|LU|LT|LS|LR|LK|LI|LC|LB|LA|KZ|KY|KW|KR|KP|KN|KM|KI|KH|KG|KE|JP|JO|JM|JE|IT|IS|IR|IQ|IO|IN|IM|IL|IE|ID|HU|HT|HR|HN|HM|HK|GY|GW|GU|GT|GS|GR|GQ|GP|GN|GM|GL|GI|GH|GG|GF|GE|GD|GB|GA|FR|FO|FM|FK|FJ|FI|EU|ET|ES|ER|EG|EE|EC|DZ|DO|DM|DK|DJ|DE|CZ|CY|CX|CW|CV|CU|CR|CO|CN|CM|CL|CK|CI|CH|CG|CF|CD|CC|CA|BZ|BY|BW|BV|BT|BS|BR|BO|BN|BM|BJ|BI|BH|BG|BF|BE|BD|BB|BA|AZ|AX|AW|AU|AT|AS|AR|AQ|AO|AM|AL|AI|AG|AF|AE|AD|AC|travelersinsurance|northwesternmutual|sandvikcoromant|kerryproperties|americanexpress|weatherchannel|kerrylogistics|cookingchannel|cancerresearch|bananarepublic|americanfamily|afamilycompany|wolterskluwer|travelchannel|spreadbetting|lifeinsurance|international|versicherung|scholarships|lplfinancial|construction|williamhill|rightathome|redumbrella|progressive|productions|playstation|photography|olayangroup|motorcycles|lamborghini|kerryhotels|investments|foodnetwork|enterprises|engineering|creditunion|contractors|calvinklein|bridgestone|blockbuster|blackfriday|barclaycard|accountants|volkswagen|vlaanderen|vistaprint|university|telefonica|technology|tatamotors|swiftcover|schaeffler|restaurant|republican|realestate|prudential|protection|properties|onyourside|nextdirect|newholland|nationwide|mitsubishi|management|industries|immobilien|healthcare|foundation|extraspace|eurovision|cuisinella|creditcard|consulting|capitalone|boehringer|bnpparibas|basketball|associates|apartments|accountant|yodobashi|vacations|travelers|stockholm|statefarm|statebank|solutions|shangrila|scjohnson|richardli|pramerica|passagens|panasonic|microsoft|melbourne|marshalls|marketing|lifestyle|landrover|lancaster|ladbrokes|kuokgroup|insurance|institute|honeywell|homesense|homegoods|homedepot|hisamitsu|goodhands|goldpoint|furniture|fujixerox|frontdoor|fresenius|firestone|financial|fairwinds|equipment|education|directory|community|christmas|bloomberg|barcelona|aquarelle|analytics|amsterdam|allfinanz|alfaromeo|accenture|yokohama|woodside|verisign|ventures|vanguard|uconnect|training|telecity|symantec|supplies|stcgroup|software|softbank|showtime|shopping|services|security|samsclub|saarland|reliance|redstone|property|plumbing|pictures|pharmacy|partners|observer|movistar|mortgage|merckmsd|memorial|mckinsey|maserati|marriott|lundbeck|lighting|jpmorgan|istanbul|ipiranga|infiniti|hospital|holdings|helsinki|hdfcbank|guardian|graphics|grainger|goodyear|frontier|football|firmdale|fidelity|feedback|exchange|everbank|etisalat|esurance|ericsson|engineer|download|discover|discount|diamonds|democrat|deloitte|delivery|computer|commbank|clothing|clinique|cleaning|cityeats|cipriani|chrysler|catholic|catering|capetown|business|builders|budapest|brussels|broadway|bradesco|boutique|baseball|bargains|barefoot|barclays|attorney|allstate|airforce|abudhabi|zuerich|youtube|yamaxun|xfinity|winners|windows|whoswho|wedding|website|weather|watches|wanggou|walmart|trading|toshiba|tiffany|tickets|theatre|theater|temasek|systems|surgery|support|storage|statoil|starhub|staples|spiegel|singles|shriram|shiksha|science|schwarz|schmidt|sandvik|samsung|rexroth|reviews|rentals|recipes|realtor|politie|pioneer|philips|panerai|origins|organic|oldnavy|okinawa|neustar|network|netflix|netbank|monster|metlife|markets|lincoln|limited|liaison|leclerc|latrobe|lasalle|lanxess|lancome|lacaixa|komatsu|kitchen|juniper|jewelry|ismaili|iselect|hyundai|hotmail|hoteles|hosting|holiday|hitachi|hangout|hamburg|guitars|grocery|godaddy|genting|gallery|fujitsu|frogans|forsale|flowers|florist|flights|fitness|fishing|finance|ferrero|ferrari|fashion|farmers|express|exposed|domains|digital|dentist|cruises|cricket|courses|coupons|country|corsica|cooking|contact|compare|company|comcast|cologne|college|clubmed|citadel|chintai|channel|cartier|careers|caravan|capital|bugatti|brother|booking|bestbuy|bentley|bauhaus|banamex|avianca|auspost|audible|auction|athleta|android|alibaba|agakhan|academy|abogado|zappos|yandex|yachts|xperia|xihuan|webcam|warman|walter|vuelos|voyage|voting|vision|virgin|villas|viking|viajes|unicom|travel|toyota|tkmaxx|tjmaxx|tienda|tennis|tattoo|target|taobao|taipei|sydney|swatch|suzuki|supply|studio|stream|social|soccer|shouji|select|secure|search|schule|school|sanofi|sakura|safety|ryukyu|rogers|rocher|review|report|repair|reisen|realty|racing|quebec|pictet|piaget|physio|photos|pfizer|otsuka|orange|oracle|online|olayan|office|nowruz|norton|nissay|nissan|natura|nagoya|mutual|museum|moscow|mormon|monash|mobily|mobile|mattel|market|makeup|maison|madrid|luxury|london|locker|living|lefrak|lawyer|latino|lancia|kosher|kindle|kinder|kaufen|juegos|joburg|jaguar|intuit|insure|imamat|hughes|hotels|hockey|hiphop|hermes|health|gratis|google|global|giving|george|garden|gallup|futbol|flickr|family|expert|events|estate|energy|emerck|durban|dupont|dunlop|doctor|direct|design|dental|degree|dealer|datsun|dating|cruise|credit|coupon|condos|comsec|coffee|clinic|claims|circle|church|chrome|chanel|center|casino|caseih|career|camera|broker|boston|bostik|blanco|bharti|berlin|beauty|bayern|author|aramco|anquan|alstom|alsace|alipay|airtel|airbus|agency|africa|active|abbvie|abbott|abarth|zippo|yahoo|xerox|world|works|weibo|weber|watch|wales|volvo|vodka|vista|video|vegas|ubank|tushu|tunes|trust|trade|tours|total|toray|tools|tokyo|today|tmall|tirol|tires|tatar|swiss|sucks|style|study|store|stada|sport|space|solar|smile|smart|sling|skype|shoes|shell|sharp|seven|sener|salon|rugby|rodeo|rocks|ricoh|reise|rehab|radio|quest|promo|prime|press|praxi|poker|place|pizza|photo|phone|party|parts|paris|osaka|omega|nowtv|nokia|ninja|nikon|nexus|nadex|movie|mopar|money|miami|media|mango|macys|lupin|lotto|lotte|locus|loans|lixil|lipsy|linde|lilly|lexus|legal|lease|lamer|kyoto|koeln|jetzt|iveco|irish|intel|ikano|hyatt|house|horse|honda|homes|guide|gucci|group|gripe|green|gmail|globo|glass|glade|gives|gifts|games|gallo|forum|forex|final|fedex|faith|epson|epost|email|edeka|earth|dubai|drive|dodge|delta|deals|dance|dabur|cymru|crown|codes|coach|cloud|click|citic|cisco|cheap|chase|cards|canon|build|bosch|boats|black|bingo|bible|beats|baidu|azure|autos|audio|archi|apple|amica|amfam|aetna|adult|actor|zone|zero|zara|yoga|xbox|work|wine|wiki|wien|weir|wang|voto|vote|vivo|viva|visa|vana|tube|toys|town|tips|tiaa|teva|tech|team|taxi|talk|surf|star|spot|sony|song|sohu|sncf|skin|site|sina|silk|show|shop|shia|shaw|sexy|seek|seat|scot|scor|saxo|save|sarl|sapo|sale|safe|ruhr|rsvp|room|rmit|rich|rest|rent|reit|read|raid|qpon|prof|prod|post|porn|pohl|plus|play|pink|ping|pics|pccw|pars|page|open|ollo|nike|nico|next|news|navy|name|moto|moda|mobi|mint|mini|menu|meme|meet|maif|luxe|ltda|love|loft|loan|live|link|limo|like|life|lidl|lgbt|lego|land|kred|kpmg|kiwi|kddi|jprs|jobs|jeep|java|itau|info|immo|imdb|ieee|icbc|hsbc|host|hgtv|here|help|hdfc|haus|hair|guru|guge|goog|golf|gold|gmbh|gift|ggee|gent|gbiz|game|fund|free|ford|food|flir|fish|fire|film|fido|fiat|fast|farm|fans|fail|fage|erni|dvag|duns|duck|doha|docs|dish|diet|desi|dell|deal|dclk|date|data|cyou|coop|cool|club|city|citi|chat|cern|cbre|cash|case|casa|cars|care|camp|call|cafe|buzz|book|bond|bofa|blue|blog|bing|bike|best|beer|bbva|bank|band|baby|auto|audi|asia|asda|arte|arpa|army|arab|amex|ally|akdn|aigo|aero|adac|able|aarp|zip|yun|you|xyz|xxx|xin|wtf|wtc|wow|wme|win|wed|vip|vin|vig|vet|ups|uol|uno|ubs|tvs|tui|trv|top|tjx|thd|tel|tdk|tci|tax|tab|stc|srt|srl|soy|sky|ski|sfr|sex|sew|ses|scb|sca|sbs|sbi|sas|sap|rwe|run|rip|rio|ril|ren|red|qvc|pwc|pub|pru|pro|pnc|pin|pid|phd|pet|pay|ovh|ott|org|ooo|onl|ong|one|off|obi|nyc|ntt|nrw|nra|now|nhk|ngo|nfl|new|net|nec|nba|nab|mtr|mtn|msd|mov|mom|moi|moe|mma|mls|mlb|mit|mil|meo|men|med|mba|map|man|ltd|lpl|lol|llc|lds|law|lat|krd|kpn|kim|kia|kfh|joy|jot|jnj|jmp|jll|jlc|jio|jcp|jcb|iwc|itv|ist|int|ink|ing|ifm|icu|ice|ibm|how|hot|hkt|hiv|hbo|gov|got|gop|goo|gmx|gmo|gle|gea|gdn|gap|gal|fyi|fun|ftr|frl|fox|foo|fly|fit|fan|eus|esq|edu|eco|eat|dvr|dtv|dot|dog|dnp|diy|dhl|dev|dds|day|dad|csc|crs|com|cfd|cfa|ceo|ceb|cbs|cbn|cba|cat|car|cam|cal|cab|bzh|buy|box|bot|boo|bom|bnl|bmw|bms|biz|bio|bid|bet|bcn|bcg|bbt|bbc|bar|axa|aws|art|app|aol|anz|aig|afl|aeg|ads|aco|abc|abb|aaa|zw|zm|za|yt|ye|ws|wf|vu|vn|vi|vg|ve|vc|va|uz|uy|us|uk|ug|ua|tz|tw|tv|tt|tr|to|tn|tm|tl|tk|tj|th|tg|tf|td|tc|sz|sy|sx|sv|su|st|sr|so|sn|sm|sl|sk|sj|si|sh|sg|se|sd|sc|sb|sa|rw|ru|rs|ro|re|qa|py|pw|pt|ps|pr|pn|pm|pl|pk|ph|pg|pf|pe|pa|om|nz|nu|nr|np|no|nl|ni|ng|nf|ne|nc|na|mz|my|mx|mw|mv|mu|mt|ms|mr|mq|mp|mo|mn|mm|ml|mk|mh|mg|me|md|mc|ma|ly|lv|lu|lt|ls|lr|lk|li|lc|lb|la|kz|ky|kw|kr|kp|kn|km|ki|kh|kg|ke|jp|jo|jm|je|it|is|ir|iq|io|in|im|il|ie|id|hu|ht|hr|hn|hm|hk|gy|gw|gu|gt|gs|gr|gq|gp|gn|gm|gl|gi|gh|gg|gf|ge|gd|gb|ga|fr|fo|fm|fk|fj|fi|eu|et|es|er|eg|ee|ec|dz|do|dm|dk|dj|de|cz|cy|cx|cw|cv|cu|cr|co|cn|cm|cl|ck|ci|ch|cg|cf|cd|cc|ca|bz|by|bw|bv|bt|bs|br|bo|bn|bm|bj|bi|bh|bg|bf|be|bd|bb|ba|az|ax|aw|au|at|as|ar|aq|ao|am|al|ai|ag|af|ae|ad|ac)(?:\:[0-9]{1,5})*(?:/(?:$|[a-zA-Z0-9\.\,\;\?\'\\\+&%\$#\=~_\-]+))*(?=\b|\s|$)) +phone=(?:(?:\(?\+\d{1,2}\)? *|#)*(?:[0-9](?: |-)?)?(?:\(?[0-9]{3}\)?|[0-9]{3})(?: |-)?(?:[0-9]{3}(?: |-)?[0-9]{4,7})) +time=(?:(?:19[0-9]{2}|[2-9][0-9]{3})-(?:(?:0(?:1|3|5|7|8)|10|12)-(?:0[1-9]|1[0-9]|2[0-9]|3[0-1])|(?:0(?:4|6|9)|11)-(?:0[1-9]|1[0-9]|2[0-9]|30)|(?:02)-(?:0[1-9]|1[0-9]|2[0-9]))\x20(?:0[0-9]|1[0-9]|2[0-3])(?::[0-5][0-9]){0,2}) +hash_tag=(?:(?<=\s|^|\b|[^\w])#\p{N}*[\p{L}_]+[\p{L}\p{N}_]*) +mention=(?:(?<=\s|^|\b|[^\w])@[\w]+(?=\s|$|\b|[^\w])) +emoticon=(?:(?<=\s|^|\b|[^\w])[:;=]-*(?:[])*>) \ No newline at end of file diff --git a/elisa_dnt/utils.py b/elisa_dnt/utils.py index 38cac38..7e767de 100644 --- a/elisa_dnt/utils.py +++ b/elisa_dnt/utils.py @@ -1,73 +1,87 @@ # encoding: utf-8 # Created by chenghaomou at 2019-05-22 + import itertools -import emoji -import string +import regex as re import warnings from collections import namedtuple -Match = namedtuple('Match', 'start end re') - -rules = { - "del": { - "email": r"(?i)( *[\w!#$%&'*+/=?^`{|}~-]+(?:\.[\w!#$%&'*+/=?^`{|}~-]+)*@(?:[a-z\d](?:[a-z\d-]*[a-z\d])?\.)+[a-z\d](?:[a-z\d-]*[a-z\d])? *)", - "url": r"( *\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?ยซยปโ€œโ€โ€˜โ€™])|(?:(?)+ *)", - "twitter": r"( *pic\.twitter\.com/[a-zA-Z0-9]+ *)", - "emoticon": r"((?![\w]) *(:\)+|:-+\)+|:\(+|:-+\(+|;\)+|;-+\)+|:-+O|8-+|:P|<3|:<|:D|:\||:S|:\$|:\/|:-+\/)+ *(?![\w]))", - "emoji": u" *[" + "".join(set(x for y in list(map(list, emoji.EMOJI_UNICODE.values())) for x in y if - len(x) == 1 and x not in string.punctuation + '0123456789')) + "]+ *" - }, - "sub": { - "email": r"(?i)([\w!#$%&'*+/=?^`{|}~-]+(?:\.[\w!#$%&'*+/=?^`{|}~-]+)*@(?:[a-z\d](?:[a-z\d-]*[a-z\d])?\.)+[a-z\d](?:[a-z\d-]*[a-z\d])?)", - "url": r"(\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?ยซยปโ€œโ€โ€˜โ€™])|(?:(?)+)", - "twitter": r"(pic\.twitter\.com/[a-zA-Z0-9]+)", - "emoticon": r"((?![\w])(:\)+|:-+\)+|:\(+|:-+\(+|;\)+|;-+\)+|:-+O|8-+|:P|<3|:<|:D|:\||:S|:\$|:\/|:-+\/)+(?![\w]))", - "emoji": u"[" + "".join(set(x for y in list(map(list, emoji.EMOJI_UNICODE.values())) for x in y if - len(x) == 1 and x not in string.punctuation + '0123456789')) + "]+" - } -} +Match = namedtuple('Match', 'start end cls') MARKERS = [chr(x) for x in range(0x4DC0, 0x4DFF)] -options = { - "colors": { - "email": "background-image:linear-gradient(90deg, #a2fafc, #11a9fc);", - "url": "background-image:linear-gradient(90deg, #fcd766, #fc7f00);", - "html": "background-image:linear-gradient(90deg, #aa9cfc, #11a9fc);", - "mention": "background-image:linear-gradient(90deg, #abfca5, #fce43d);", - "time": "background-image:linear-gradient(90deg, #abfca5, #fce43d);", - "hashtag": "background-image:linear-gradient(90deg, #aa9cfc, #fc9ce7);", - "comb": "background-image:linear-gradient(90deg, #a2fafc, #fce43d);", - "emoticon": "background-image:linear-gradient(90deg, #FFFFFF, #fce43d);", - "emoji": "background-image:linear-gradient(90deg, #fce43d, #FFFFFF);", - }, - "categories": ["email", "url", "html", "mention", "time", "hashtag", "comb", "emoticon", "emoji"], -} - - -def find(string: str, RULES: dict) -> list: - matches = itertools.chain(*[exp.finditer(string) for key, exp in RULES.items() if key != "comb"]) - matches = [match for match in sorted(matches, key=lambda m: (m.start(0), -m.end(0)))] - filtered_matches = [] - - for i, match in enumerate(matches): - if i > 0 and filtered_matches[-1].start <= match.start(0) < match.end(0) <= filtered_matches[-1].end: + +def generate_options(path: str = 'elisa_dnt/rules.ini') -> dict: + colors = [ + '#e3f2fd', + '#bbdefb', + '#90caf9', + '#64b5f6', + '#42a5f5', + '#2196f3', + '#1e88e5', + '#1976d2', + '#1565c0', + '#0d47a1' + ] + options = {'colors': {'comb': 'background-color: #4caf50;' }, 'categories': ['comb']} + with open(path) as i: + for j, line in enumerate(map(lambda x: x.strip('\n'), i.readlines())): + name, _ = line.split('=', 1) + options['categories'].append(name) + options['colors'][name] = f'background-color: {colors[j]};' + options['categories'].append('emoji') + options['colors']['emoji'] = f'background-color: {colors[(j+1)%len(colors)]};' + + return options + + +def load_rules(emoji_path: str = 'elisa_dnt/emojis.ini', + rule_path: str = 'elisa_dnt/rules.ini', + scheme: str = 'del', + ) -> dict: + + with open(emoji_path) as i: + emojis = '|'.join(list(map(lambda x: x.strip('\n'), i.readlines()))) + + rules = {} + + with open(rule_path) as i: + for rule in map(lambda x: x.strip('\n'), i.readlines()): + name, value = rule.split('=', 1) + if scheme == "del": + rules[name] = re.compile(u"([ \u202b]*{}[ \u202b]*)".format(value)) + else: + rules[name] = re.compile(r"({})".format(value)) + if scheme == "del": + rules['emoji'] = re.compile(u'([ \u202b]*(?:' + emojis + ')[ \u202b]*)', re.UNICODE) + else: + rules['emoji'] = re.compile(r'(' + emojis + ')', re.UNICODE) + + # print(rules['emoji']) + + return rules + + +def find(string: str, rules: dict) -> list: + # matches = itertools.chain(*[(key, exp.finditer(string)) ]) + matches = [(key, match) for key, exp in rules.items() for match in exp.finditer(string)] + matches = [match for match in sorted(matches, key=lambda m: (m[-1].start(0), -(m[-1].end(0))))] + merged_matches = [] + + for i, (name, match) in enumerate(matches): + if i > 0 and merged_matches[-1].start <= match.start(0) < match.end(0) <= merged_matches[-1].end: continue - elif i > 0 and filtered_matches[-1].start <= match.start(0) <= filtered_matches[-1].end: - filtered_matches[-1] = Match(filtered_matches[-1].start, max(match.end(0), filtered_matches[-1].end), - re=RULES["comb"]) + elif i > 0 and merged_matches[-1].start <= match.start(0) <= merged_matches[-1].end + 1: + merged_matches[-1] = Match(merged_matches[-1].start, + max(match.end(0), merged_matches[-1].end), + 'comb') else: - filtered_matches.append(Match(match.start(0), match.end(0), re=match.re)) + merged_matches.append(Match(match.start(0), + match.end(0), + name)) - return filtered_matches + return merged_matches def mark(string: str, matches: list, scheme: str = "sub") -> tuple: @@ -149,9 +163,9 @@ def mark(string: str, matches: list, scheme: str = "sub") -> tuple: return segments, modification, lead -def visual(string: str, matches: list, options: dict, RULES: dict) -> str: +def visual(string: str, matches: list, options: dict, rules: dict) -> str: def colorize(match, text): - cls = [key for key, value in RULES.items() if value == match.re][0] + cls = match.cls if cls in options["categories"]: if "<" not in text and ">" not in text: return f"""{text}""" @@ -163,15 +177,18 @@ def colorize(match, text): return text res = string + matched = set() for match in matches: start, end = match.start, match.end text = string[start:end] - res = res.replace(text, colorize(match, text)) + if text not in matched: + res = res.replace(text, colorize(match, text)) + matched.add(text) return res -def split(corpus_path, corpus_output, ini_output, scheme: str, ref: str, RULES: dict): +def split(corpus_path, corpus_output, ini_output, scheme: str, ref: str, rules: dict): with open(corpus_path) as source, open(corpus_output, "w") as o_source, open(ini_output, "w") as o_source_ini: if ref == "": @@ -179,7 +196,7 @@ def split(corpus_path, corpus_output, ini_output, scheme: str, ref: str, RULES: for src in source.readlines(): total_sents += 1 src = src.strip('\n') - src_matches = find(src, RULES) + src_matches = find(src, rules) src_after, src_mod, src_lead = mark(src, src_matches, scheme=scheme) if scheme == "del": for seg in src_after: @@ -213,8 +230,8 @@ def split(corpus_path, corpus_output, ini_output, scheme: str, ref: str, RULES: src_line = src_line.strip('\n') tgt_line = tgt_line.strip('\n') - src_matches = find(src_line, RULES) - tgt_matches = find(tgt_line, RULES) + src_matches = find(src_line, rules) + tgt_matches = find(tgt_line, rules) src_matches_text = [src_line[m.start(0):m.end(0)] for m in src_matches] tgt_matches_text = [tgt_line[m.start(0):m.end(0)] for m in tgt_matches] x_matches = list(set(src_matches_text).intersection(set(tgt_matches_text))) @@ -297,3 +314,18 @@ def restore(dnt_path, ini_path, output, scheme="del"): new_translation = new_translation.replace(char, segments[min(ord(char) - 0x4DC0, len(segments) - 1)]) o.write(new_translation + '\n') + + +if __name__ == "__main__": + + txt = """RT @jokateM: Utu humfanya mtu awe kipenzi cha watu,utu humfanya mtu awe kimbilio la watu,aliyekosa utu hana mvuto kwa watu. ๐Ÿ™๐Ÿฝโค She writes [ar]: ู…ู„ุฎุต ุชุบุทูŠุฉ ูˆูƒุงู„ุฉ ุงู„ุงู†ุจุงุก ุงู„ุฑุณู…ูŠุฉ ู„ู…ุธุงู‡ุฑุงุช ุงู…ุณ: ู…ูˆุงุทู†ูŠู† ุงุนุชุฏูˆุง ุนู„ู‰ ุงู„ุดุฑุทุฉ ูุงุถุทุฑูˆู‡ุง ู„ุงุณุชุฎุฏุงู… ุงู„ุบุงุฒ ุงู„ู…ุณูŠู„ ู„ู„ุฏู…ูˆุน http://suna-sd.net/suna/showNews/-fJi7HGycvs26Azq7aG4mmjptp-NQZ_WndSuVb1-KMY/1 #ุงู„ุฎุฑุง ds.CRIME BE PART OF VODACOM SUCCESS: https://t.co/Wzo1EckNhe via @YouTube CEO wa @MeTL_Group, @moodewji akiwa katika majadiliano kwenye mkutano wa @africaceoforum unaofanyika Geneva, Switze... https://t.co/uBAXDYfmlQ +@earadiofm: #MICHEZO Msanii na Mbunge wa Mikumi @ProfessorJayTz akiwa na Seleman Matola katika uwanja wa Taifa kushuhudia mechi kati ya...RT @earadiofm: #MICHEZO Msanii na Mbunge wa Mikumi @ProfessorJayTz akiwa na Seleman Matola katika uwanja wa Taifa kushuhudia mechi kati ya...@Youtube She writes [ar]: ู…ู„ุฎุต ุชุบุทูŠุฉ ูˆูƒุงู„ุฉ ุงู„ุงู†ุจุงุก ุงู„ุฑุณู…ูŠุฉ ู„ู…ุธุงู‡ุฑุงุช ุงู…ุณ: ู…ูˆุงุทู†ูŠู† ุงุนุชุฏูˆุง ุนู„ู‰ ุงู„ุดุฑุทุฉ ูุงุถุทุฑูˆู‡ุง ู„ุงุณุชุฎุฏุงู… ุงู„ุบุงุฒ ุงู„ู…ุณูŠู„ ู„ู„ุฏู…ูˆุน http://suna-sd.net/suna/showNews/-fJi7HGycvs26Azq7aG4mmjptp-NQZ_WndSuVb1-KMY/1 โ€ซ#ุงู„ุฎุฑุงุกโ€ฌ""" + + rules = load_rules('emojis.ini', 'rules.ini', 'del') + options = generate_options('rules.ini') + matches = find(txt, rules) + spans = [txt[m.start:m.end] for m in matches] + + print(spans) + print(mark(txt, find(txt, rules), 'del')) + print(visual(txt, matches, options, rules)) diff --git a/requirements.txt b/requirements.txt index 3246b0b..ed1a306 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,19 @@ # Requirements automatically generated by pigar. # https://github.com/damnever/pigar +# elisa_dnt/emoji.py: 4 +beautifulsoup4 == 4.7.1 + # elisa_dnt/utils.py: 4 emoji == 0.5.2 # elisa_dnt/__main__.py: 4 +# elisa_dnt/tools.py: 4 # statistics.py: 5 regex == 2019.4.14 # setup.py: 1 setuptools == 41.0.1 + +# elisa_dnt/emoji.py: 5 +urllib3 == 1.24.2 diff --git a/setup.py b/setup.py index 4edb4a3..9fb8a0f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='elisa-dnt', - version='0.0.8', + version='0.0.9', packages=['elisa_dnt'], url='https://github.com/ChenghaoMou/elisa-dnt', license='',