From 8683e8a643716a28e2cb1bffa27763687dc1b40f Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 26 Oct 2021 21:03:02 +0200 Subject: [PATCH 1/5] Split tmxutil.py into a proper python package --- .gitignore | 1 + pyproject.toml | 6 + setup.py | 31 + src/tmxutil/__init__.py | 5 + src/tmxutil/__main__.py | 4 + src/tmxutil/cli.py | 400 ++++++++++ src/tmxutil/filters/__init__.py | 0 src/tmxutil/filters/deduplicate.py | 104 +++ src/tmxutil/filters/ipc.py | 70 ++ src/tmxutil/formats/__init__.py | 87 +++ src/tmxutil/formats/count.py | 100 +++ src/tmxutil/formats/json.py | 31 + src/tmxutil/formats/pickle.py | 38 + src/tmxutil/formats/tab.py | 76 ++ src/tmxutil/formats/tmx.py | 146 ++++ src/tmxutil/formats/txt.py | 11 + src/tmxutil/interactive.py | 40 + src/tmxutil/types.py | 61 ++ src/tmxutil/utils.py | 33 + tmxutil.py | 1122 ---------------------------- 20 files changed, 1244 insertions(+), 1122 deletions(-) create mode 100644 .gitignore create mode 100644 pyproject.toml create mode 100644 setup.py create mode 100644 src/tmxutil/__init__.py create mode 100644 src/tmxutil/__main__.py create mode 100755 src/tmxutil/cli.py create mode 100644 src/tmxutil/filters/__init__.py create mode 100644 src/tmxutil/filters/deduplicate.py create mode 100644 src/tmxutil/filters/ipc.py create mode 100644 src/tmxutil/formats/__init__.py create mode 100644 src/tmxutil/formats/count.py create mode 100644 src/tmxutil/formats/json.py create mode 100644 src/tmxutil/formats/pickle.py create mode 100644 src/tmxutil/formats/tab.py create mode 100644 src/tmxutil/formats/tmx.py create mode 100644 src/tmxutil/formats/txt.py create mode 100644 src/tmxutil/interactive.py create mode 100644 src/tmxutil/types.py create mode 100644 src/tmxutil/utils.py delete mode 100755 tmxutil.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..11041c7 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.egg-info diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b5a3c46 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6b268de --- /dev/null +++ b/setup.py @@ -0,0 +1,31 @@ +import setuptools + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +setuptools.setup( + name="tmxutil-pkg-jelmervdl", + version="1.2", + author="Jelmer van der Linde", + author_email="jelmer@ikhoefgeen.nl", + description="Tool to create, augment, filter and generally work with TMX files.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/paracrawl/tmxutil", + project_urls={ + "Bug Tracker": "https://github.com/paracrawl/tmxutil/issues", + }, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + entry_points={ + 'console_scripts': [ + 'tmxutil=tmxutil.cli:entrypoint', + ], + }, + package_dir={"": "src"}, + packages=setuptools.find_packages(where="src"), + python_requires=">=3.6", +) \ No newline at end of file diff --git a/src/tmxutil/__init__.py b/src/tmxutil/__init__.py new file mode 100644 index 0000000..702d616 --- /dev/null +++ b/src/tmxutil/__init__.py @@ -0,0 +1,5 @@ +try: + import pkg_resources + __version__ = pkg_resources.require('tmxutil-pkg-jelmervdl')[0].version +except: + __version__ = 'dev' diff --git 
a/src/tmxutil/__main__.py b/src/tmxutil/__main__.py
new file mode 100644
index 0000000..8955dd3
--- /dev/null
+++ b/src/tmxutil/__main__.py
@@ -0,0 +1,4 @@
+from .cli import entrypoint
+
+if __name__ == '__main__':
+	entrypoint()
diff --git a/src/tmxutil/cli.py b/src/tmxutil/cli.py
new file mode 100755
index 0000000..fd9898d
--- /dev/null
+++ b/src/tmxutil/cli.py
@@ -0,0 +1,400 @@
+#!/usr/bin/env python3
+# Tool to convert between tab, txt and tmx formatting, filtering & adding
+# annotations.
+import re
+import operator
+import os
+import sys
+import importlib.util
+from contextlib import ExitStack
+from datetime import datetime
+from textwrap import dedent
+from argparse import ArgumentParser, FileType, Namespace, RawDescriptionHelpFormatter
+from logging import info, getLogger, INFO, ERROR
+from typing import cast, Callable, Tuple, Iterator, Iterable, Any, Type, TypeVar, List, Dict, Set, Mapping, Optional
+from functools import reduce, partial
+from io import TextIOWrapper
+from itertools import chain, starmap
+from . import __version__
+from .types import Reader, Writer, TranslationUnit, BufferedBinaryIO
+from .utils import first, fromisoformat, fromfilesize
+from .filters.deduplicate import deduplicate
+from .filters.ipc import IPCLabeler, IPCGroupLabeler
+from .formats import make_reader
+from .formats.count import CountWriter, LiveCountWriter
+from .formats.json import JSONWriter
+from .formats.pickle import PickleReader, PickleWriter
+from .formats.tab import TabReader, TabWriter
+from .formats.tmx import TMXReader, TMXWriter
+from .formats.txt import TxtWriter
+from .interactive import tqdm
+
+
+def text_key(unit: TranslationUnit) -> Tuple[str,...]:
+	return tuple(translation.text for translation in unit.translations.values())
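For orientation, a TranslationUnit is a plain dict mapping property names to sets of values, with a translations dict of per-language variants hanging off it; text_key above flattens a unit to the tuple of its segment texts. A minimal sketch with made-up values:

    from tmxutil.types import TranslationUnit, TranslationUnitVariant

    unit = TranslationUnit(id={'1'}, collection={'europat'})
    unit.translations = {
        'en': TranslationUnitVariant(text='Hello world'),
        'de': TranslationUnitVariant(text='Hallo Welt'),
    }
    assert text_key(unit) == ('Hello world', 'Hallo Welt')  # insertion order of the dict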
+T = TypeVar('T', float, str)
+
+def build_binary_condition(type: Type[T], op: Callable[[T,T], bool]) -> Callable[[Callable[[TranslationUnit], Iterable[Any]],str], Callable[[TranslationUnit], bool]]:
+	"""Wrapper for standard python operations on types. I.e. to implement gt
+	and lt."""
+	def build_condition(lhs: Callable[[TranslationUnit], Iterable[Any]], rhs: str) -> Callable[[TranslationUnit], bool]:
+		return lambda unit: any(op(type(el), type(rhs)) for el in lhs(unit))
+	return build_condition
+
+
+def build_regex_condition(lhs: Callable[[TranslationUnit], Iterable[Any]], rhs: str) -> Callable[[TranslationUnit], bool]:
+	"""Specialised version (or wrapper around) build_binary_condition that makes
+	one that tests a regular expression."""
+	pattern = re.compile(rhs)
+	return lambda unit: any(pattern.search(str(el)) is not None for el in lhs(unit))
+
+
+condition_operators = {
+	'<': build_binary_condition(float, operator.lt),
+	'>': build_binary_condition(float, operator.gt),
+	'<=': build_binary_condition(float, operator.le),
+	'>=': build_binary_condition(float, operator.ge),
+	'=': build_binary_condition(str, operator.eq),
+	'=~': build_regex_condition
+}
+
+
+def set_property(key: str, value: str, unit: TranslationUnit) -> TranslationUnit:
+	unit[key] = {value}
+	return unit
+
+
+def del_properties(properties: List[str], unit: TranslationUnit) -> TranslationUnit:
+	for prop in properties:
+		del unit[prop]
+	return unit
+
+
+def parse_properties(props: str) -> Dict[str,Set[str]]:
+	properties: Dict[str,Set[str]] = {}
+	for prop in props.split(','):
+		key, value = prop.split('=', 1)
+		properties.setdefault(key, set()).add(value)
+	return properties
+
+
+def parse_condition(operators: Mapping[str,Callable[[str,str], Callable[[TranslationUnit], bool]]], expr: str, functions={}) -> Callable[[TranslationUnit], bool]:
+	pattern = r'^(?P<lhs>.+?)(?P<op>{operators})(?P<rhs>.*)$'.format(
+		operators='|'.join(re.escape(op) for op in sorted(operators.keys(), key=len, reverse=True)))
+
+	match = re.match(pattern, expr)
+
+	if match is None:
+		raise ValueError("Could not parse condition '{}'".format(expr))
+
+	info("Using expression op:'%(op)s' lhs:'%(lhs)s' rhs:'%(rhs)s'", match.groupdict())
+
+	prop_getter = parse_property_getter(match.group('lhs'), functions=functions)
+
+	return operators[match.group('op')](prop_getter, match.group('rhs'))
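The predicates this produces can be probed on their own; a small sketch, reusing the unit from the example above:

    cond = parse_condition(condition_operators, 'collection=europat')
    cond(unit)  # True: some 'collection' value equals 'europat'

    cond = parse_condition(condition_operators, 'len(en.text)>5', functions={'len': len})
    cond(unit)  # True: the English text is longer than 5 characters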
+def parse_property_getter(expr: str, functions: Dict[str,Callable[[Any],Any]] = {'len': len}) -> Callable[[TranslationUnit], Iterable[Any]]:
+	ops = [] #type: List[Callable[[Any], Any]]
+
+	while True:
+		match = re.match(r'^(?P<fun>[a-zA-Z_]\w*)\((?P<expr>.+?)\)$', expr)
+		if not match:
+			break
+
+		if not match.group('fun') in functions:
+			raise ValueError('Function `{}` in expression `{}` not found.'.format(match.group('fun'), expr))
+
+		ops.insert(0, functions[match.group('fun')])
+		expr = match.group('expr')
+
+	match = re.match(r'^((?P<lang>[\w-]+)?(?P<dot>\.))?(?P<prop>[\w-]+)(?P<brackets>\[\])?$', expr)
+	if not match:
+		raise ValueError('Could not interpret expression `{}`'.format(expr))
+
+	prop = match.group('prop')
+
+	# 'en.source-document' or 'en.text'
+	if match.group('lang'):
+		lang = match.group('lang')
+		if prop == 'text':
+			val_getter = lambda unit: [unit.translations[lang].text]
+		else:
+			val_getter = lambda unit: unit.translations[lang][prop]
+	# e.g. '.collection', only look in root
+	elif match.group('dot'):
+		val_getter = lambda unit: unit[prop]
+	# e.g. 'text'; text can only occur in translations
+	elif prop == 'text':
+		val_getter = lambda unit: (translation.text for translation in unit.translations.values())
+	# e.g. 'source-document' or 'collection'; search through both root and translations
+	else:
+		val_getter = lambda unit: reduce(lambda acc, translation: acc + list(translation.get(prop, [])), unit.translations.values(), list(unit.get(prop, [])))
+
+	if match.group('brackets'):
+		agg_getter = lambda unit: [frozenset(val_getter(unit))] # convert to frozenset so it can be used as key in dict/Counter
+	else:
+		agg_getter = val_getter
+
+	if ops:
+		fun_getter = lambda unit: (reduce(lambda val, op: op(val), ops, val) for val in agg_getter(unit))
+	else:
+		fun_getter = agg_getter
+
+	return fun_getter
+
+
+def first_item_getter(key: str) -> Callable[[TranslationUnit], Optional[str]]:
+	"""Creates a getter that gets one value from a translation unit's properties.
+	If there are multiple values for that property, it is undefined which one it
+	gets; if the property does not exist, or is empty, it returns None."""
+	def getter(obj: TranslationUnit) -> Optional[str]:
+		return first(obj.get(key, set()), default=None)
+	return getter
+
+
+def make_deduplicator(args: Namespace, reader: Iterator[TranslationUnit], mem_limit : int = 2 * 10**9) -> Iterator[TranslationUnit]:
+	"""
+	Make a deduplicate filter based on the input options. Fancy bifixer based
+	deduplicator if we have the data, otherwise fall back to boring deduplicator.
+	"""
+
+	# Grab the first object from the reader to see what we're dealing with
+	try:
+		peeked_obj = next(reader)
+	except StopIteration:
+		# It's an empty reader. No need to wrap it in anything deduplicating.
+		return reader
+
+	# Stick the peeked object back on :P
+	reader = chain([peeked_obj], reader)
+
+	if 'hash-bifixer' in peeked_obj and 'score-bifixer' in peeked_obj:
+		return deduplicate(reader, key=first_item_getter('hash-bifixer'), sort_key=first_item_getter('score-bifixer'), mem_limit=mem_limit)
+	else:
+		return deduplicate(reader, key=text_key, mem_limit=mem_limit)
+
+
+def abort(message: str) -> int:
+	"""Abandon ship! Use in case of misguided users."""
+	print(message, file=sys.stderr)
+	return 1
+
+
+def properties_adder(properties: Dict[str,Set[str]], reader: Iterator[TranslationUnit]) -> Iterator[TranslationUnit]:
+	for unit in reader:
+		unit.update(properties)
+		yield unit
+
+
+def import_file_as_module(file):
+	filename = os.path.basename(file)
+	basename, ext = os.path.splitext(filename)
+	if ext not in {'.py'}:
+		raise ValueError('Error importing {}: can only import .py files'.format(file))
+
+	spec = importlib.util.spec_from_file_location(basename, file)
+	module = importlib.util.module_from_spec(spec)
+	spec.loader.exec_module(module)
+	return module
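--include feeds files like the following through import_file_as_module, making their top-level functions available to --count and to filter expressions. A hypothetical include file (name and contents are illustrative):

    # mydomain.py
    from urllib.parse import urlparse

    def domain(url: str) -> str:
        return urlparse(url).netloc

which would then be used as e.g. tmxutil --include mydomain.py --count 'domain(source-document)' corpus.tmx.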
+def main(argv: List[str], stdin: BufferedBinaryIO, stdout: BufferedBinaryIO) -> int:
+	parser = ArgumentParser(
+		formatter_class=RawDescriptionHelpFormatter,
+		description='Annotate, analyze, filter and convert (mainly) tmx files',
+		epilog=dedent('''
+			Supported syntax for FILTER_EXPR:
+			  Syntax: PROP_EXPR OPERATOR VALUE where:
+			    PROP_EXPR  Either 'text' or the value of the "type" attribute
+			               of a <prop/> element.
+			    OPERATOR   Supported operators:
+			               >, >=, <, <= for numeric comparisons.
+			               =, =~ for string comparisons.
+			    VALUE      String, number or regular expression.
+
+			  Examples:
+			    collection=europat  Matches sentence pairs that have a property
+			                        'collection' that is exactly 'europat'.
+			    text=~euro.*        Matches pairs that match a regular expression.
+			    id>400              Matches pairs that have an id larger than 400.
+
+			Supported syntax for PROP_EXPR:
+			  Syntax: [FUNCTION] ( [LANG] [.] PROPERTY [\\[\\]] ) where everything
+			  except PROPERTY is optional. If FUNCTION is not used, you don't need
+			  the parentheses. The [] after PROPERTY can be used to indicate that
+			  all values of that property for a <tu/> should be treated as a
+			  single set.
+
+			  Examples:
+			    source-document  Count every prop type "source-document", either as
+			                     part of the <tu/> or <tuv/>.
+			    .collection      Count every collection observed as a prop of the
+			                     sentence pair.
+			    .collection[]    Count every combination of collection values
+			                     observed in a <tu/>.
+			    len(en.text)     String length of the English side of the sentence
+			                     pair. You can use your own functions using --include.
+		'''))
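Two hypothetical invocations showing how filter expressions compose on the command line (conditions given to a single --with must all hold; repeating --with ORs the groups, see the writer setup further down):

    tmxutil --with 'collection=europat' 'en.text=~patent' corpus.tmx > filtered.tmx
    tmxutil --with 'id<100' --with 'id>400' corpus.tmx > filtered.tmx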
+	parser.add_argument('--version', action='version', version=__version__)
+	parser.add_argument('-i', '--input-format', choices=['tmx', 'tab', 'pickle'], help='Input file format. Automatically detected if left unspecified.')
+	parser.add_argument('-o', '--output-format', choices=['tmx', 'tab', 'txt', 'json', 'pickle'], default='tmx', help='Output file format. Output is always written to stdout.')
+	parser.add_argument('-l', '--input-languages', nargs=2, help='Input languages in case of tab input. Needs to be in order of their appearance in the columns.')
+	parser.add_argument('-c', '--input-columns', nargs='+', help='Input columns in case of tab input. Column names ending in -1 or -2 will be treated as translation-specific.')
+	parser.add_argument('--output-languages', nargs='+', help='Output languages for tab and txt output. txt output allows only one language, tab multiple.')
+	parser.add_argument('--output-columns', metavar="PROP_EXPR", nargs='+', help='Output columns for tab output. Use {lang}.{property} syntax to select language specific properties such as en.source-document or de.text.')
+	parser.add_argument('--output', default=stdout, type=FileType('wb'), help='Output file. Defaults to stdout.')
+	parser.add_argument('--creation-date', type=fromisoformat, default=datetime.now(), help='Override creation date in tmx output.')
+	parser.add_argument('-p', '--properties', action='append', help='List of A=B,C=D properties to add to each sentence pair. You can use one --properties for all files or one for each input file.')
+	parser.add_argument('-d', '--deduplicate', action='store_true', help='Deduplicate units before printing. Unit properties are combined where possible. If score-bifixer and hash-bifixer are available, these will be used.')
+	parser.add_argument('--drop', nargs='+', dest='drop_properties', help='Drop properties from output.')
+	parser.add_argument('--renumber-output', action='store_true', help='Renumber the translation unit ids. Always enabled when multiple input files are given.')
+	parser.add_argument('--ipc', dest='ipc_meta_files', nargs="+", type=FileType('r'), help='One or more IPC metadata files.')
+	parser.add_argument('--ipc-group', dest='ipc_group_files', nargs="+", type=FileType('r'), help='One or more IPC grouping files.')
+	parser.add_argument('--with', nargs='+', action='append', dest='filter_with', metavar='FILTER_EXPR')
+	parser.add_argument('--without', nargs='+', action='append', dest='filter_without', metavar='FILTER_EXPR')
+	parser.add_argument('-P', '--progress', action='store_true', help='Show progress bar when reading files.')
+	logging_options = parser.add_mutually_exclusive_group()
+	logging_options.add_argument('-q', '--quiet', action='store_true', help='Hide issues encountered while reading files.')
+	logging_options.add_argument('-v', '--verbose', action='store_true', help='Print progress updates.')
+	parser.add_argument('--workspace', type=fromfilesize, help='Maximum memory usage for deduplication. When exceeded, will continue deduplication using the filesystem.', default='4G')
+	parser.add_argument('--count', dest='count_property', help='Count which values occur for a property.', metavar='COUNT_EXPR')
+	parser.add_argument('--include', action='append', default=[], dest='count_libraries', help='Include a python file so functions defined in that file can be used with --count, e.g. include something that provides a domain(url:str) function, and use `--count domain(source-document)`.')
+	parser.add_argument('files', nargs='*', default=[stdin], type=FileType('rb'), help='Input files. May be gzipped. If not specified stdin is used.')
+
+	# I prefer the modern behaviour where you can do `tmxutil.py -p a=1 file.tmx
+	# -p a=2 file2.tmx` etc. but that's only available since Python 3.7.
+	if hasattr(parser, 'parse_intermixed_args'):
+		args = parser.parse_intermixed_args(argv)
+	else:
+		args = parser.parse_args(argv)
+
+	if args.verbose:
+		getLogger().setLevel(INFO)
+	elif args.quiet:
+		getLogger().setLevel(ERROR)
+
+	# Load in functions early so if anything is wrong with them we'll know before
+	# we attempt to parse anything.
+	functions = reduce(lambda obj, file: {**obj, **import_file_as_module(file).__dict__},
+		args.count_libraries, {'len': len})
+
+	# Create reader. Make sure to call make_reader immediately and not somewhere
+	# down in a nested generator so if one of the files cannot be found, we
+	# error out immediately.
+	readers = [make_reader(fh, **vars(args)) for fh in args.files]
+
+	# Add properties to each specific file? If so, do it before we chain all
+	# readers into a single iterator. If all share the same properties we'll
+	# add it after chaining multiple readers into one.
+	if args.properties and len(args.properties) > 1:
+		if len(args.properties) != len(readers):
+			return abort("When specifying multiple --properties options, you need"
+					" to specify exactly one for each input file. You have {}"
+					" --properties options, but {} files.".format(len(args.properties), len(readers)))
+		properties_per_file = (parse_properties(props) for props in args.properties)
+
+		readers = [properties_adder(properties, reader) for properties, reader in zip(properties_per_file, readers)]
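The per-file form of -p/--properties tags each input before the readers are merged; a hypothetical example:

    tmxutil -p collection=europat europat.tmx -p collection=paracrawl paracrawl.tmx > merged.tmx

With a single -p, the same properties are added to every unit after merging.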
+	# If we have multiple input files, the translation unit ids will be a mess
+	# when merged. So renumber them.
+	if len(readers) > 1:
+		args.renumber_output = True
+
+	# Merge all readers into a single source of sentence pairs
+	reader = chain.from_iterable(readers)
+
+	# If we want to add properties (the same ones) to all input files, we do it
+	# now, after merging all readers into one.
+	if args.properties and len(args.properties) == 1:
+		properties = parse_properties(args.properties[0])
+		reader = properties_adder(properties, reader)
+
+	# Optional filter & annotation steps for reader.
+	if args.ipc_meta_files:
+		reader = map(IPCLabeler(args.ipc_meta_files).annotate, reader)
+
+	if args.ipc_group_files:
+		reader = map(IPCGroupLabeler(args.ipc_group_files).annotate, reader)
+
+	if args.filter_with:
+		dnf = [[parse_condition(condition_operators, cond_str, functions=functions) for cond_str in cond_expr] for cond_expr in args.filter_with]
+		reader = filter(lambda unit: any(all(expr(unit) for expr in cond) for cond in dnf), reader)
+
+	if args.filter_without:
+		dnf = [[parse_condition(condition_operators, cond_str, functions=functions) for cond_str in cond_expr] for cond_expr in args.filter_without]
+		reader = filter(lambda unit: all(any(not expr(unit) for expr in cond) for cond in dnf), reader)
+
+	if args.deduplicate:
+		reader = make_deduplicator(args, reader, mem_limit=args.workspace)
+
+	if args.renumber_output:
+		reader = starmap(partial(set_property, 'id'), enumerate(reader, start=1))
+
+	# If we want to drop properties from the output, do that as the last step.
+	if args.drop_properties:
+		reader = map(partial(del_properties, args.drop_properties), reader)
+
+	# Create writer
+	with ExitStack() as ctx:
+		if args.output_format == 'pickle':
+			writer = ctx.enter_context(PickleWriter(args.output)) # type: Writer
+
+		else:
+			text_out = ctx.enter_context(TextIOWrapper(args.output, encoding='utf-8'))
+
+			if args.count_property:
+				count_property = parse_property_getter(args.count_property, functions=functions)
+
+				if tqdm and args.progress:
+					writer = ctx.enter_context(LiveCountWriter(text_out, key=count_property))
+				else:
+					writer = ctx.enter_context(CountWriter(text_out, key=count_property))
+			elif args.output_format == 'tmx':
+				writer = ctx.enter_context(TMXWriter(text_out, creation_date=args.creation_date))
+			elif args.output_format == 'tab':
+				if not args.output_columns:
+					if not args.output_languages:
+						return abort("Use --output-languages X Y to select the order of the columns in the output, or use --output-columns directly.")
+					args.output_columns = [
+						*(f'{lang}.source-document' for lang in args.output_languages),
+						*(f'{lang}.text' for lang in args.output_languages)
+					]
+
+				column_getters = [
+					parse_property_getter(expr, functions=functions)
+					for expr in args.output_columns
+				]
+
+				writer = ctx.enter_context(TabWriter(text_out, column_getters))
+			elif args.output_format == 'txt':
+				if not args.output_languages or len(args.output_languages) != 1:
+					return abort("Use --output-languages X to select which language."
+							" When writing txt, it can only write one language at"
+							" a time.")
+				writer = ctx.enter_context(TxtWriter(text_out, args.output_languages[0]))
+			elif args.output_format == 'json':
+				writer = ctx.enter_context(JSONWriter(text_out))
+			else:
+				raise ValueError('Unknown output format: {}'.format(args.output_format))
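The two filter steps above form a disjunctive normal form: each --with is one conjunction, and a unit passes when any conjunction holds; --without discards exactly the units the same structure would keep. A condensed model with hypothetical predicates:

    has_europat = lambda unit: 'europat' in unit.get('collection', set())
    is_short = lambda unit: all(len(t.text) < 100 for t in unit.translations.values())
    dnf = [[has_europat, is_short]]  # one --with listing two FILTER_EXPRs: AND
    keep = lambda unit: any(all(expr(unit) for expr in conj) for conj in dnf)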
+		# Main loop. The with-statement around the writer makes sure it can
+		# write its header & footer.
+		count = 0
+		for unit in reader:
+			writer.write(unit)
+			count += 1
+		info("Written %d records.", count)
+
+	return 0
+
+
+def entrypoint():
+	"""main() but with all the standard parameters passed in"""
+	try:
+		sys.exit(main(sys.argv[1:],
+			cast(BufferedBinaryIO, sys.stdin.buffer),
+			cast(BufferedBinaryIO, sys.stdout.buffer)))
+	except ValueError as e:
+		sys.exit(abort("Error: {}".format(e)))
diff --git a/src/tmxutil/filters/__init__.py b/src/tmxutil/filters/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tmxutil/filters/deduplicate.py b/src/tmxutil/filters/deduplicate.py
new file mode 100644
index 0000000..5517d60
--- /dev/null
+++ b/src/tmxutil/filters/deduplicate.py
@@ -0,0 +1,104 @@
+import pickle
+import resource
+from typing import Iterator, Callable, Any, Dict, List
+from itertools import chain
+from logging import info
+from collections import OrderedDict
+from tempfile import TemporaryFile
+from ..types import TranslationUnit
+
+
+def deduplicate(reader: Iterator[TranslationUnit], key: Callable[[TranslationUnit], Any], sort_key: Callable[[TranslationUnit], Any] = lambda unit: 0, mem_limit:int = 2 * 10**9) -> Iterator[TranslationUnit]:
+	"""
+	Deduplicate records read from reader. It does this by creating a hash table
+	of all records, grouped by key(record). If multiple records have the same
+	key they are combined if properties allow this (i.e. sets, lists) or
+	overwritten in case sort_key(new) < sort_key(current). See deduplicate_merge().
+
+	Note: This function behaves like an iterator but will only start yielding
+	results once reader has run out of records.
+
+	Note: If the memory usage becomes too large (because storing all unique
+	units is taking up too much storage) it will fall back to deduplicate_external
+	which uses a file as backing for temporarily storing translation units.
+	"""
+ """ + + best = dict() # type: Dict[int,TranslationUnit] + + try: + first_unit = next(reader) + except StopIteration: + return reader + + for n, unit in enumerate(chain([first_unit], reader), start=1): + unit_id = hash(key(unit)) + + if unit_id in best: + best[unit_id] = deduplicate_merge(best[unit_id], unit, sort_key) + else: + best[unit_id] = unit + + if n % 10000 == 0: + mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + info('best contains %d (%d processed) entries (%1.2f GB)', len(best), n, mem_usage / 10**9) + if mem_usage > mem_limit: + info("Exceeded in-memory size limit, switching to file-backed deduplication") + already_processed = best.values() + del best + yield from deduplicate_external(chain(already_processed, reader), key, sort_key) + break + else: + yield from best.values() + + +def deduplicate_external(reader: Iterator[TranslationUnit], key: Callable[[TranslationUnit], Any], sort_key: Callable[[TranslationUnit], Any] = lambda unit: 0) -> Iterator[TranslationUnit]: + best = OrderedDict() # type: Dict[int,List[int]] + + with TemporaryFile() as fh: + for n, unit in enumerate(reader, start=1): + offset = fh.tell() + + pickle.dump(unit, fh) + + unit_id = hash(key(unit)) + + best.setdefault(unit_id, []).append(offset) + + if n % 10000 == 0: + mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + disk_usage = fh.tell() + info('best contains %d (%d processed) entries (mem: %1.2f GB, disk: %1.2f GB)', len(best), n, mem_usage / 10**9, disk_usage / 10**9) + + info('All entries inspected, %d unique entries; building output', len(best)) + + for n, duplicates in enumerate(best.values(), start=1): + best_unit = TranslationUnit() + + for offset in duplicates: + fh.seek(offset) + unit = pickle.load(fh) + + if not best_unit: + best_unit = unit + else: + best_unit = deduplicate_merge(best_unit, unit, sort_key) + + if n % 10000 == 0: + info('%d out of %d built', n, len(best)) + + yield best_unit + + +def deduplicate_merge(best_unit: TranslationUnit, new_unit: TranslationUnit, sort_key: Callable[[TranslationUnit], Any]) -> TranslationUnit: + """Merges new_unit into best_unit, combining collections but overwriting + all other entries if and only if compare(current, new) is true""" + new_is_better = sort_key(new_unit) < sort_key(best_unit) + + if new_is_better: + for key, value in new_unit.items(): + best_unit[key] = value + + for lang, translation in new_unit.translations.items(): + best_unit.translations[lang].updateVariant(translation) + + return best_unit \ No newline at end of file diff --git a/src/tmxutil/filters/ipc.py b/src/tmxutil/filters/ipc.py new file mode 100644 index 0000000..ff27603 --- /dev/null +++ b/src/tmxutil/filters/ipc.py @@ -0,0 +1,70 @@ +from typing import List, TextIO, Dict, Tuple, Set +from tmxutil.types import TranslationUnit +from logging import warning + + +class IPCLabeler(object): + """Add IPC labels to sentence pairs based on the patent ids found in the + source-document property of either side of the pair.""" + + #lut: Dict[Tuple[str,str], Set[str]] + + def __init__(self, files: List[TextIO] = []): + self.lut = dict() # type: Dict[Tuple[str,str], Set[str]] + for fh in files: + self.load(fh) + + def load(self, fh: TextIO) -> None: + for line in fh: + parts = line.split('\t', 11) + if len(parts) != 6 and len(parts) != 12: + warning("Expected 6 or 12 fields while reading IPC file, found %d, in %s:%d", len(parts), fh.name, line) + continue + src_id, _, _, _, src_lang, src_ipcs = parts[:6] + self.lut[(src_lang.lower(), src_id)] = 
diff --git a/src/tmxutil/filters/ipc.py b/src/tmxutil/filters/ipc.py
new file mode 100644
index 0000000..ff27603
--- /dev/null
+++ b/src/tmxutil/filters/ipc.py
@@ -0,0 +1,70 @@
+from typing import List, TextIO, Dict, Tuple, Set
+from tmxutil.types import TranslationUnit
+from logging import warning
+
+
+class IPCLabeler(object):
+	"""Add IPC labels to sentence pairs based on the patent ids found in the
+	source-document property of either side of the pair."""
+
+	#lut: Dict[Tuple[str,str], Set[str]]
+
+	def __init__(self, files: List[TextIO] = []):
+		self.lut = dict() # type: Dict[Tuple[str,str], Set[str]]
+		for fh in files:
+			self.load(fh)
+
+	def load(self, fh: TextIO) -> None:
+		for n, line in enumerate(fh, start=1):
+			parts = line.split('\t', 11)
+			if len(parts) != 6 and len(parts) != 12:
+				warning("Expected 6 or 12 fields while reading IPC file, found %d in %s:%d", len(parts), fh.name, n)
+				continue
+			src_id, _, _, _, src_lang, src_ipcs = parts[:6]
+			self.lut[(src_lang.lower(), src_id)] = set(ipc.strip() for ipc in src_ipcs.split(',') if ipc.strip() != '')
+			if len(parts) == 12:
+				trg_id, _, _, _, trg_lang, trg_ipcs = parts[6:]
+				self.lut[(trg_lang.lower(), trg_id)] = set(ipc.strip() for ipc in trg_ipcs.split(',') if ipc.strip() != '')
+
+	def annotate(self, unit: TranslationUnit) -> TranslationUnit:
+		for lang, translation in unit.translations.items():
+			# Ignoring type because https://github.com/python/mypy/issues/2013
+			translation['ipc'] = set().union(*(
+				self.lut[(lang.lower(), url)]
+				for url in translation['source-document']
+				if (lang.lower(), url) in self.lut
+			)) # type: ignore
+		return unit
+
+
+class IPCGroupLabeler(object):
+	"""Add overall IPC group ids based on IPC labels added by IPCLabeler."""
+
+	#patterns: List[Tuple[str,Set[str]]]
+
+	def __init__(self, files: List[TextIO] = []):
+		self.patterns = [] # type: List[Tuple[str,Set[str]]]
+		for fh in files:
+			self.load(fh)
+
+	def load(self, fh: TextIO) -> None:
+		for line in fh:
+			prefix, group, *_ = line.split('\t', 2)
+			self.patterns.append((
+				prefix.strip(),
+				{prefix.strip(), group.strip()} if prefix.strip() != "" else {group.strip()}
+			))
+
+		# Sort with most specific on top
+		self.patterns.sort(key=lambda pattern: (-len(pattern[0]), pattern[0]))
+
+	def find_group(self, ipc_code: str) -> Set[str]:
+		for prefix, groups in self.patterns:
+			if ipc_code.startswith(prefix):
+				return groups
+		return set()
+
+	def annotate(self, unit: TranslationUnit) -> TranslationUnit:
+		for lang, translation in unit.translations.items():
+			translation['ipc-group'] = set().union(*map(self.find_group, translation['ipc'])) # type: ignore
+		return unit
\ No newline at end of file
diff --git a/src/tmxutil/formats/__init__.py b/src/tmxutil/formats/__init__.py
new file mode 100644
index 0000000..29b5b7d
--- /dev/null
+++ b/src/tmxutil/formats/__init__.py
@@ -0,0 +1,87 @@
+import gzip
+from io import TextIOWrapper
+from typing import Any, Generator, Optional, Iterable, Sequence, cast, Iterator, Tuple, Dict
+from itertools import chain
+from ..types import Reader, TranslationUnit, BufferedBinaryIO
+from ..interactive import tqdm, ProgressWrapper
+from .pickle import PickleReader
+from .tmx import TMXReader
+from .tab import TabReader
+
+def closer(fh: Any) -> Generator[Any,None,None]:
+	"""Generator that closes fh once it is its turn."""
+	if hasattr(fh, 'close'):
+		fh.close()
+	yield from []
+
+
+def is_gzipped(fh: BufferedBinaryIO) -> bool:
+	"""Test if stream is probably a gzip stream"""
+	return fh.peek(2).startswith(b'\x1f\x8b')
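is_gzipped only peeks at the stream, so the reader that is eventually constructed still sees it from the first byte; gzip streams start with the magic bytes 1f 8b. A quick self-contained check (with the package installed):

    import gzip, io
    from tmxutil.formats import is_gzipped

    fh = io.BufferedReader(io.BytesIO(gzip.compress(b'hello')))
    assert is_gzipped(fh) and fh.tell() == 0  # peek() consumes nothing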
+def make_reader(fh: BufferedBinaryIO, *, input_format: Optional[str] = None, input_columns: Optional[Iterable[str]] = None, input_languages: Optional[Sequence[str]] = None, progress:bool = False, **kwargs: Any) -> Iterator[TranslationUnit]:
+	if tqdm and progress:
+		fh = ProgressWrapper(fh)
+
+	if is_gzipped(fh):
+		fh = cast(BufferedBinaryIO, gzip.open(fh))
+
+	if not input_format:
+		file_format, format_args = autodetect(fh)
+	else:
+		file_format, format_args = input_format, {}
+
+	if file_format == 'tab' and 'columns' not in format_args and input_columns:
+		format_args['columns'] = input_columns
+
+	if file_format == 'pickle':
+		reader: Reader = PickleReader(fh)
+	elif file_format == 'tmx':
+		reader = TMXReader(fh)
+	elif file_format == 'tab':
+		if not input_languages or len(input_languages) != 2:
+			raise ValueError("'tab' format needs exactly two input languages specified")
+		text_fh = TextIOWrapper(fh, encoding='utf-8')
+		reader = TabReader(text_fh, *input_languages, **format_args)
+	else:
+		raise ValueError("Cannot create file reader for format '{}'".format(file_format))
+
+	# Hook an empty generator to the end that will close the file we opened.
+	return chain(reader, closer(reader))
+
+
+def peek_first_line(fh: BufferedBinaryIO, length: int = 128) -> bytes:
+	"""Tries to get the first full line in a buffer that supports peek."""
+	while True:
+		buf = fh.peek(length)
+
+		pos = buf.find(b'\n')
+		if pos != -1:
+			return buf[0:pos]
+
+		if len(buf) < length:
+			return buf
+
+		length *= 2
+
+
+def autodetect(fh: BufferedBinaryIO) -> Tuple[str, Dict[str,Any]]:
+	"""Fill in arguments based on what we can infer from the input we're going
+	to get. fh needs to have a peek() method and return bytes."""
+
+	# First test: is it XML?
+	xml_signature = b'<?xml '
+	if fh.peek(len(xml_signature)).startswith(xml_signature):
+		return 'tmx', {}
+
+	# Second test: might it be tab-separated? Try counting the columns of the
+	# first line.
+	column_count = len(peek_first_line(fh).split(b'\t'))
+
+	if column_count >= 7:
+		return 'tab', {'columns': ['source-document-1', 'source-document-2', 'text-1', 'text-2', 'hash-bifixer', 'score-bifixer', 'score-bicleaner']}
+
+	if column_count >= 5:
+		return 'tab', {'columns': ['source-document-1', 'source-document-2', 'text-1', 'text-2', 'score-aligner']}
+
+	raise ValueError('Did not recognize file format')
+
diff --git a/src/tmxutil/formats/count.py b/src/tmxutil/formats/count.py
new file mode 100644
index 0000000..ed9f603
--- /dev/null
+++ b/src/tmxutil/formats/count.py
@@ -0,0 +1,100 @@
+import sys
+from tmxutil.types import Writer, TranslationUnit
+from tmxutil.interactive import tqdm
+from typing import Optional, Callable, TextIO, List, Any, Type
+from types import TracebackType
+from collections import Counter
+from math import floor
+from operator import itemgetter
+from time import time
+
+
+class CountWriter(Writer):
+	"""Instead of writing tmx records, it counts a property and writes a summary
+	of which values it encountered for that property, and how often it encountered
+	them."""
+	def __init__(self, fh: TextIO, key: Callable[[TranslationUnit], List[Any]]):
+		self.fh = fh
+		self.key = key
+
+	def __enter__(self) -> 'CountWriter':
+		self.counter = Counter() # type: Counter[Any]
+		return self
+
+	def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
+		if type is None:
+			for key, count in sorted(self.counter.most_common(), key=itemgetter(1), reverse=True):
+				self.fh.write("{}\t{}\n".format(count, " ".join(sorted(key)) if isinstance(key, frozenset) else key))
+
+	def write(self, unit: TranslationUnit) -> None:
+		self.counter.update(self.key(unit))
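For orientation, the summary CountWriter prints on exit is one line per observed value, count first, largest count first; the numbers below are invented:

    $ tmxutil --count '.collection[]' corpus.tmx
    122531	europat
    4412	europat paracrawl
    88	paracrawl

Frozenset keys (the PROPERTY[] case) are printed space-joined, as in the second line.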
+class LiveCountWriter(Writer):
+	"""Live variant of CountWriter: shows live updating bars while counting."""
+	def __init__(self, fh: TextIO, key: Callable[[TranslationUnit], List[Any]]):
+		self.fh = fh
+		self.key = key
+		self.top_n = 10
+
+	def __enter__(self) -> 'LiveCountWriter':
+		self.counter = Counter() # type: Counter[Any]
+		self.total = 0
+		self.bars: List[Any] = []
+		self.n = 0
+		self.last_update = time()
+		self.last_n = 0
+		self.update_interval = 128
+		return self
+
+	def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
+		for bar in self.bars:
+			bar.close()
+
+		if type is None:
+			for key, count in self.counter.most_common():
+				self.fh.write("{}\t{}\n".format(count, " ".join(sorted(key)) if isinstance(key, frozenset) else key))
+
+	def refresh(self):
+		top = self.counter.most_common(self.top_n)
+		remainder = len(self.counter) - len(top)
+
+		if remainder:
+			remainder_count = self.total - sum(count for _, count in top)
+			top.append(('({} more)'.format(remainder), remainder_count))
+
+		# Make sure we've enough bars
+		while len(top) > len(self.bars):
+			self.bars.append(tqdm(
+				position=len(self.bars)+1,
+				unit='unit',
+				file=sys.stderr,
+				dynamic_ncols=True,
+				bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}'))
+
+		# Determine the label length for alignment
+		label_len = max(len(str(value)) for value, _ in top)
+
+		# Update all bars (sorting by most common on top)
+		for bar, (value, count) in zip(self.bars, top):
+			bar.set_description('{{: <{:d}s}}'.format(label_len+2).format(str(value)), refresh=False)
+			bar.total = self.total
+			bar.n = count
+			bar.refresh()
+
+	def _count_iter(self, iterable):
+		count = 0
+		for item in iterable:
+			count += 1
+			yield item
+		self.total += count
+
+	def _smooth(self, current, target):
+		return floor(0.7 * current + 0.3 * target)
+
+	def write(self, unit: TranslationUnit) -> None:
+		vals = self.key(unit)
+		self.counter.update(self._count_iter(vals))
+		self.n += 1
+		if self.n % self.update_interval == 0:
+			time_since_last_update = max(time() - self.last_update, 1e-10)
+			n_per_sec = self.update_interval / time_since_last_update
+			self.update_interval = max(self._smooth(self.update_interval, 0.5 * n_per_sec), 1)
+			self.last_update = time()
+			self.refresh()
diff --git a/src/tmxutil/formats/json.py b/src/tmxutil/formats/json.py
new file mode 100644
index 0000000..3c305bd
--- /dev/null
+++ b/src/tmxutil/formats/json.py
@@ -0,0 +1,31 @@
+from tmxutil.types import Writer, TranslationUnit
+from typing import TextIO
+from textwrap import indent
+from json import dumps
+
+
+class JSONWriter(Writer):
+	indent = '  '
+
+	def __init__(self, fh: TextIO):
+		self.fh = fh
+
+	def __enter__(self) -> 'JSONWriter':
+		print('[', file=self.fh)
+		self.first = True
+		return self
+
+	def __exit__(self, type, value, traceback) -> None:
+		if type is not None:
+			return
+		print(']', file=self.fh)
+
+	def write(self, unit: TranslationUnit) -> None:
+		if self.first:
+			comma = ''
+			self.first = False
+		else:
+			comma = ','
+
+		print(comma + indent(dumps(unit, indent=self.indent, default=list), self.indent), file=self.fh)
+
diff --git a/src/tmxutil/formats/pickle.py b/src/tmxutil/formats/pickle.py
new file mode 100644
index 0000000..4c12e1b
--- /dev/null
+++ b/src/tmxutil/formats/pickle.py
@@ -0,0 +1,38 @@
+import pickle
+from tmxutil.types import Reader, Writer, TranslationUnit, TranslationUnitVariant
+from typing import Type, Any, BinaryIO, Iterator
+
+
+class TranslationUnitUnpickler(pickle.Unpickler):
+	def find_class(self, module: str, name: str) -> Type[Any]:
+		if module == 'tmxutil' or module == '__main__':
+			if name == 'TranslationUnitVariant':
+				return TranslationUnitVariant
+			elif name == 'TranslationUnit':
+				return TranslationUnit
+		raise pickle.UnpicklingError("global '{}.{}' is forbidden".format(module, name))
+
+
+class PickleReader(Reader):
+	def __init__(self, fh: BinaryIO):
+		self.fh = fh
+
+	def close(self) -> None:
+		self.fh.close()
+
+	def records(self) -> Iterator[TranslationUnit]:
+		try:
+			while True:
+				unit = TranslationUnitUnpickler(self.fh).load()
+				assert isinstance(unit, TranslationUnit)
+				yield unit
+		except EOFError:
+			pass
+
+
+class PickleWriter(Writer):
+	def __init__(self, fh: BinaryIO):
+		self.fh = fh
+
+	def write(self, unit: TranslationUnit) -> None:
+		pickle.dump(unit, self.fh)
diff --git a/src/tmxutil/formats/tab.py b/src/tmxutil/formats/tab.py
new file mode 100644
index 0000000..7120266
--- /dev/null
+++ b/src/tmxutil/formats/tab.py
@@ -0,0 +1,76 @@
+import csv
+from tmxutil.types import Reader, Writer, TranslationUnit, TranslationUnitVariant
+from typing import List, Callable, TextIO, Iterable, Any, Iterator
+
+
+class TabReader(Reader):
+	def __init__(self, fh: TextIO, src_lang: str, trg_lang: str, columns: Iterable[str] = ['source-document-1', 'source-document-2', 'text-1', 'text-2', 'score-aligner']):
+		self.fh = fh
+		self.src_lang = src_lang
+		self.trg_lang = trg_lang
+		self.columns = columns
+
+	def close(self) -> None:
+		self.fh.close()
+
+	def records(self) -> Iterator[TranslationUnit]:
+		class Variant:
+			__slots__ = ('lang', 'unit')
+			def __init__(self, lang: str):
+				self.lang = lang
+				self.unit = TranslationUnitVariant()
+
+		for n, line in enumerate(self.fh):
+			# Skip blank lines
+			if line.strip() == '':
+				continue
+
+			values = line.rstrip('\n').split('\t')
+
+			record = TranslationUnit(id={str(n)})
+
+			var1 = Variant(self.src_lang)
+
+			var2 = Variant(self.trg_lang)
+
+			for column, value in zip(self.columns, values):
+				if column == '-':
+					continue
+
+				if column.endswith('-1') or column.endswith('-2'):
+					variant = var1 if column.endswith('-1') else var2
+
+					if column[:-2] == 'lang':
+						variant.lang = value
+					elif column[:-2] == 'text':
+						variant.unit.text = value
+					else:
+						variant.unit[column[:-2]] = {value}
+				else:
+					record.setdefault(column, set()).add(value)
+
+			record.translations = {
+				var1.lang: var1.unit,
+				var2.lang: var2.unit
+			}
+
+			yield record
+
+
+class TabWriter(Writer):
+	fh: TextIO
+	columns: List[Callable[[TranslationUnit], Iterable[Any]]]
+
+	def __init__(self, fh: TextIO, columns: List[Callable[[TranslationUnit], Iterable[Any]]]):
+		self.fh = fh
+		self.columns = columns
+
+	def __enter__(self) -> 'TabWriter':
+		self.writer = csv.writer(self.fh, delimiter='\t')
+		return self
+
+	def write(self, unit: TranslationUnit) -> None:
+		self.writer.writerow([
+			';'.join(map(str, getter(unit)))
+			for getter in self.columns
+		])
\ No newline at end of file
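How a classic 5-column aligner line maps onto a unit under the default columns; a runnable sketch with invented values:

    from io import StringIO
    from tmxutil.formats.tab import TabReader

    line = 'http://a.example\thttp://b.example\tHello world\tHallo Welt\t0.92\n'
    unit = next(iter(TabReader(StringIO(line), 'en', 'de')))
    assert unit.translations['en'].text == 'Hello world'
    assert unit.translations['de']['source-document'] == {'http://b.example'}
    assert unit['score-aligner'] == {'0.92'}  # unsuffixed columns land on the unit itself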
diff --git a/src/tmxutil/formats/tmx.py b/src/tmxutil/formats/tmx.py
new file mode 100644
index 0000000..262c1c3
--- /dev/null
+++ b/src/tmxutil/formats/tmx.py
@@ -0,0 +1,146 @@
+import tmxutil
+from tmxutil.types import Reader, Writer, TranslationUnit, TranslationUnitVariant
+from tmxutil.utils import first
+from typing import BinaryIO, Iterator, List, Mapping, Set, Tuple, TextIO, Optional, Type
+from types import TracebackType
+from logging import info, warning
+from xml.etree.ElementTree import iterparse, Element
+from datetime import datetime
+
+
+# _escape_cdata and _escape_attrib are copied from
+# https://github.com/python/cpython/blob/3.9/Lib/xml/etree/ElementTree.py
+def _escape_cdata(text: str) -> str:
+	if "&" in text:
+		text = text.replace("&", "&amp;")
+	if "<" in text:
+		text = text.replace("<", "&lt;")
+	if ">" in text:
+		text = text.replace(">", "&gt;")
+	return text
+
+
+def _escape_attrib(text: str) -> str:
+	if "&" in text:
+		text = text.replace("&", "&amp;")
+	if "<" in text:
+		text = text.replace("<", "&lt;")
+	if ">" in text:
+		text = text.replace(">", "&gt;")
+	if "\"" in text:
+		text = text.replace("\"", "&quot;")
+	if "\r" in text:
+		text = text.replace("\r", "&#13;")
+	if "\n" in text:
+		text = text.replace("\n", "&#10;")
+	if "\t" in text:
+		text = text.replace("\t", "&#09;")
+	return text
+
+
+def _flatten(unit: Mapping[str,Set[str]]) -> Iterator[Tuple[str,str]]:
+	for key, values in unit.items():
+		for value in values:
+			yield key, value
+
+
+class TMXReader(Reader):
+	"""TMX File format reader. XML attributes are mostly ignored. <prop/>
+	elements of the <tu/> are added as attributes, and those of the <tuv/> as
+	attributes with sets of values as we expect one or more of them, i.e. one
+	or more source-document, ipc, etc."""
+
+	def __init__(self, fh: BinaryIO):
+		self.fh = fh
+
+	def close(self) -> None:
+		self.fh.close()
+
+	def records(self) -> Iterator[TranslationUnit]:
+		stack = list() # type: List[Element]
+		path = list() # type: List[str]
+
+		info("TMXReader starts reading from %s", self.fh.name)
+
+		unit: TranslationUnit
+		translation: TranslationUnitVariant
+
+		lang_key = '{http://www.w3.org/XML/1998/namespace}lang'
+
+		for event, element in iterparse(self.fh, events=('start', 'end')):
+			if event == 'start':
+				stack.append(element)
+				path.append(element.tag)
+
+				if path == ['tmx', 'body', 'tu']:
+					unit = TranslationUnit(id={element.get('tuid')})
+				elif path == ['tmx', 'body', 'tu', 'tuv']:
+					translation = TranslationUnitVariant()
+			elif event == 'end':
+				if path == ['tmx', 'body', 'tu']:
+					yield unit
+				elif path == ['tmx', 'body', 'tu', 'prop']:
+					if element.text is None:
+						warning('empty <prop type="%s"/> encountered in unit with id %s in file %s; property ignored', element.get('type'), first(unit['id']), self.fh.name)
+					else:
+						unit.setdefault(element.get('type'), set()).add(element.text.strip())
+				elif path == ['tmx', 'body', 'tu', 'tuv']:
+					unit.translations[element.attrib[lang_key]] = translation
+				elif path == ['tmx', 'body', 'tu', 'tuv', 'prop']:
+					if element.text is None:
+						warning('empty <prop type="%s"/> encountered in unit with id %s in file %s; property ignored', element.get('type'), first(unit['id']), self.fh.name)
+					else:
+						translation.setdefault(element.get('type'), set()).add(element.text.strip())
+				elif path == ['tmx', 'body', 'tu', 'tuv', 'seg']:
+					if element.text is None:
+						warning('empty translation segment encountered in unit with id %s in file %s', first(unit['id']), self.fh.name)
+						translation.text = ''
+					else:
+						translation.text = element.text.strip()
+
+				path.pop()
+				stack.pop()
+				if stack:
+					stack[-1].remove(element)
+
+
+class TMXWriter(Writer):
+	def __init__(self, fh: TextIO, *, creation_date: Optional[datetime] = None):
+		self.fh = fh
+		self.creation_date = creation_date or datetime.now()
+
+	def __enter__(self) -> 'TMXWriter':
+		self.fh.write('<?xml version="1.0"?>\n'
+			'<tmx version="1.4">\n'
+			'  <header creationtool="tmxutil" creationtoolversion="' + tmxutil.__version__ + '" datatype="plaintext" segtype="sentence" o-tmf="PlainText" creationdate="' + self.creation_date.strftime('%Y%m%dT%H%M%S') + '">\n'
+			'  </header>\n'
+			'  <body>\n')
+		return self
+
+	def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
+		self.fh.write('  </body>\n'
+			'</tmx>\n')
+
+	def write(self, unit: TranslationUnit) -> None:
+		self.fh.write('    <tu tuid="' + _escape_attrib(str(first(unit['id'], ''))) + '">\n')
+		for key, value in sorted(_flatten(unit)):
+			if key != 'id' and value:
+				self.fh.write('      <prop type="' + _escape_attrib(str(key)) + '">' + _escape_cdata(str(value)) + '</prop>\n')
+
+		for lang, translation in sorted(unit.translations.items()):
+			self.fh.write('      <tuv xml:lang="' + _escape_attrib(lang) + '">\n')
+			for key, value in sorted(_flatten(translation)):
+				if value:
+					self.fh.write('        <prop type="' + _escape_attrib(str(key)) + '">' + _escape_cdata(str(value)) + '</prop>\n')
+			self.fh.write('        <seg>' + _escape_cdata(str(translation.text)) + '</seg>\n'
+				'      </tuv>\n')
+		self.fh.write('    </tu>\n')
+
diff --git a/src/tmxutil/formats/txt.py b/src/tmxutil/formats/txt.py
new file mode 100644
index 0000000..4f668e5
--- /dev/null
+++ b/src/tmxutil/formats/txt.py
@@ -0,0 +1,11 @@
+from typing import TextIO
+from tmxutil.types import TranslationUnit, Writer
+
+class TxtWriter(Writer):
+	def __init__(self, fh: TextIO, language: str):
+		self.fh = fh
+		self.language = language
+
+	def write(self, unit: TranslationUnit) -> None:
+		print(unit.translations[self.language].text, file=self.fh)
+
diff --git a/src/tmxutil/interactive.py b/src/tmxutil/interactive.py
new file mode 100644
index 0000000..3953aa4
--- /dev/null
+++ b/src/tmxutil/interactive.py
@@ -0,0 +1,40 @@
+import sys
+import os
+from 
typing import Any + + +try: + from tqdm.autonotebook import tqdm +except ImportError: + tqdm = None + +class ProgressWrapper: + """Wraps around a file-like object and shows a progress bar as to how much + of it has been read.""" + + def __init__(self, fh: Any): + self.fh = fh + self.tqdm = tqdm( + desc=fh.name, + total=os.fstat(fh.fileno()).st_size, + initial=fh.seekable() and fh.tell(), + file=sys.stderr, + unit='b', + unit_scale=True) + + def __getattr__(self, attr: str) -> Any: + return getattr(self.fh, attr) + + def read(self, size: int = -1) -> Any: + data = self.fh.read(size) + self.tqdm.update(len(data)) + return data + + def read1(self, size: int = -1) -> Any: + data = self.fh.read1(size) + self.tqdm.update(len(data)) + return data + + def close(self) -> None: + self.tqdm.close() + self.fh.close() diff --git a/src/tmxutil/types.py b/src/tmxutil/types.py new file mode 100644 index 0000000..5e237c1 --- /dev/null +++ b/src/tmxutil/types.py @@ -0,0 +1,61 @@ +from abc import ABCMeta, ABC, abstractmethod +from typing import Optional, Type, Iterator, Dict, Set, BinaryIO +from types import TracebackType + + +class TranslationUnitVariant(Dict[str, Set[str]]): + __slots__ = ['text'] + + def __init__(self, *, text: Optional[str] = None, **kwargs: Set[str]): + super().__init__(**kwargs) + self.text = text or '' + + def updateVariant(self, other: 'TranslationUnitVariant') -> None: + self.text = other.text + for key, value in other.items(): + if key in self: + self[key] |= value + else: + self[key] = value + + +class TranslationUnit(Dict[str,Set[str]]): + __slots__ = ['translations'] + + def __init__(self, *, translations: Optional[Dict[str, TranslationUnitVariant]] = None, **kwargs: Set[str]): + super().__init__(**kwargs) + self.translations = translations or dict() # type: Dict[str,TranslationUnitVariant] + + +class BufferedBinaryIO(BinaryIO, metaclass=ABCMeta): + @abstractmethod + def peek(self, size: int) -> bytes: + ... + + +class Reader(ABC): + """Interface for sentence pair input stream.""" + + def __iter__(self) -> Iterator[TranslationUnit]: + return self.records() + + @abstractmethod + def records(self) -> Iterator[TranslationUnit]: + pass + + +class Writer(ABC): + """Interface for sentence pair output stream. 
Has with statement context
+	magic functions that can be overwritten to deal with writing headers and
+	footers, or starting and ending XML output."""
+
+	def __enter__(self) -> 'Writer':
+		return self
+
+	def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
+		pass
+
+	@abstractmethod
+	def write(self, unit: TranslationUnit) -> None:
+		pass
+
diff --git a/src/tmxutil/utils.py b/src/tmxutil/utils.py
new file mode 100644
index 0000000..59eb6e9
--- /dev/null
+++ b/src/tmxutil/utils.py
@@ -0,0 +1,33 @@
+import re
+from datetime import datetime
+from typing import TypeVar, Iterable, Optional, Union
+
+# Only Python 3.7+ has fromisoformat
+if hasattr(datetime, 'fromisoformat'):
+	fromisoformat = datetime.fromisoformat
+else:
+	def fromisoformat(date_string: str) -> datetime:
+		match = re.match(r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})(?:.(?P<hour>\d{2})(?::(?P<minute>\d{2})(?::(?P<second>\d{2}))?)?)?$', date_string)
+		if match is None:
+			raise ValueError("invalid fromisoformat value: '{}'".format(date_string))
+		return datetime(
+			int(match['year']), int(match['month']), int(match['day']),
+			int(match['hour'] or 0), int(match['minute'] or 0), int(match['second'] or 0))
+
+
+def fromfilesize(size_string: str) -> int:
+	order = 1
+	for suffix in ['B', 'K', 'M', 'G', 'T']:
+		if size_string.endswith(suffix):
+			return int(size_string[:-1]) * order
+		else:
+			order *= 1000
+	return int(size_string)
+
+
+A = TypeVar('A')
+B = TypeVar('B')
+def first(it: Iterable[A], default: Optional[B] = None) -> Optional[Union[A,B]]:
+	return next(iter(it), default)
+
+
diff --git a/tmxutil.py b/tmxutil.py
deleted file mode 100755
index f0a87f7..0000000
--- a/tmxutil.py
+++ /dev/null
@@ -1,1122 +0,0 @@
-#!/usr/bin/env python3
-# Tool to convert between tab, txt and tmx formatting, filtering & adding
-# annotations.
- -__VERSION__ = 1.1 - -import csv -import sys -import os -import re -import gzip -import pickle -import resource -import operator -import importlib.util -from textwrap import dedent -from abc import ABC, ABCMeta, abstractmethod -from argparse import ArgumentParser, FileType, Namespace, RawDescriptionHelpFormatter -from collections import defaultdict, OrderedDict, Counter -from contextlib import ExitStack -from datetime import datetime -from functools import partial -from io import BufferedReader, TextIOWrapper -from itertools import combinations, chain, starmap -from functools import reduce -from logging import info, warning, getLogger, INFO, ERROR -from math import floor -from operator import itemgetter -from pprint import pprint -from tempfile import TemporaryFile -from time import time -from typing import Callable, Dict, List, Counter, Optional, Any, Iterator, Iterable, Set, FrozenSet, Tuple, Type, TypeVar, BinaryIO, TextIO, IO, Union, cast, Generator, Sequence, Mapping -from types import TracebackType -from xml.sax.saxutils import escape, quoteattr -from xml.etree.ElementTree import iterparse, Element - -try: - from tqdm.autonotebook import tqdm -except ImportError: - tqdm = None - -# Only Python 3.7+ has fromisoformat -if hasattr(datetime, 'fromisoformat'): - fromisoformat = datetime.fromisoformat -else: - def fromisoformat(date_string: str) -> datetime: - match = re.match(r'^(?P\d{4})-(?P\d{2})-(?P\d{2})(?:.(?P\d{2})(?::(?P\d{2})(?::(?P\d{2}))?)?)?$', date_string) - if match is None: - raise ValueError("invalid fromisoformat value: '{}'".format(date_string)) - return datetime( - int(match['year']), int(match['month']), int(match['day']), - int(match['hour']), int(match['minute']), int(match['second'])) - - -def fromfilesize(size_string: str) -> int: - order = 1 - for suffix in ['B', 'K', 'M', 'G', 'T']: - if size_string.endswith(suffix): - return int(size_string[:-1]) * order - else: - order *= 1000 - return int(size_string) - - -class TranslationUnitVariant(Dict[str, Set[str]]): - __slots__ = ['text'] - - def __init__(self, *, text: Optional[str] = None, **kwargs: Set[str]): - super().__init__(**kwargs) - self.text = text or '' - - def updateVariant(self, other: 'TranslationUnitVariant') -> None: - self.text = other.text - for key, value in other.items(): - if key in self: - self[key] |= value - else: - self[key] = value - - -class TranslationUnit(Dict[str,Set[str]]): - __slots__ = ['translations'] - - def __init__(self, *, translations: Optional[Dict[str, TranslationUnitVariant]] = None, **kwargs: Set[str]): - super().__init__(**kwargs) - self.translations = translations or dict() # type: Dict[str,TranslationUnitVariant] - - -class BufferedBinaryIO(BinaryIO, metaclass=ABCMeta): - @abstractmethod - def peek(self, size: int) -> bytes: - ... 
- - -class ProgressWrapper: - """Wraps around a file-like object and shows a progress bar as to how much - of it has been read.""" - - def __init__(self, fh: Any): - self.fh = fh - self.tqdm = tqdm( - desc=fh.name, - total=os.fstat(fh.fileno()).st_size, - initial=fh.seekable() and fh.tell(), - file=sys.stderr, - unit='b', - unit_scale=True) - - def __getattr__(self, attr: str) -> Any: - return getattr(self.fh, attr) - - def read(self, size: int = -1) -> Any: - data = self.fh.read(size) - self.tqdm.update(len(data)) - return data - - def read1(self, size: int = -1) -> Any: - data = self.fh.read1(size) - self.tqdm.update(len(data)) - return data - - def close(self) -> None: - self.tqdm.close() - self.fh.close() - - -class Reader(ABC): - """Interface for sentence pair input stream.""" - - def __iter__(self) -> Iterator[TranslationUnit]: - return self.records() - - @abstractmethod - def records(self) -> Iterator[TranslationUnit]: - pass - - -class Writer(ABC): - """Interface for sentence pair output stream. Has with statement context - magic functions that can be overwritten to deal with writing headers and - footers, or starting and ending XML output.""" - - def __enter__(self) -> 'Writer': - return self - - def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None: - pass - - @abstractmethod - def write(self, unit: TranslationUnit) -> None: - pass - - -class TMXReader(Reader): - """TMX File format reader. XML attributes are mostly ignored. - elements of the are added as attributes, and of as attributes - with sets of values as we expect one or more of them, i.e. one or more - source-document, ipc, etc.""" - - def __init__(self, fh: BinaryIO): - self.fh = fh - - def close(self) -> None: - self.fh.close() - - def records(self) -> Iterator[TranslationUnit]: - stack = list() # type: List[Element] - path = list() # type: List[str] - - info("TMXReader starts reading from %s", self.fh.name) - - unit: TranslationUnit - translation: TranslationUnitVariant - - lang_key = '{http://www.w3.org/XML/1998/namespace}lang' - - for event, element in iterparse(self.fh, events=('start', 'end')): - if event == 'start': - stack.append(element) - path.append(element.tag) - - if path == ['tmx', 'body', 'tu']: - unit = TranslationUnit(id={element.get('tuid')}) - elif path == ['tmx', 'body', 'tu', 'tuv']: - translation = TranslationUnitVariant() - elif event == 'end': - if path == ['tmx', 'body', 'tu']: - yield unit - elif path == ['tmx', 'body', 'tu', 'prop']: - if element.text is None: - warning('empty encountered in unit with id %s in file %s; property ignored', element.get('type'), first(unit['id']), self.fh.name) - else: - unit.setdefault(element.get('type'), set()).add(element.text.strip()) - elif path == ['tmx', 'body', 'tu', 'tuv']: - unit.translations[element.attrib[lang_key]] = translation - translations = None - elif path == ['tmx', 'body', 'tu', 'tuv', 'prop']: - if element.text is None: - warning('empty encountered in unit with id %s in file %s; property ignored', element.get('type'), first(unit['id']), self.fh.name) - else: - translation.setdefault(element.get('type'), set()).add(element.text.strip()) - elif path == ['tmx', 'body', 'tu', 'tuv', 'seg']: - if element.text is None: - warning('empty translation segment encountered in unit with id %s in file %s', first(unit['id']), self.fh.name) - translation.text = '' - else: - translation.text = element.text.strip() - - path.pop() - stack.pop() - if stack: - stack[-1].remove(element) 
-
-
-# _escape_cdata and _escape_attrib are copied from
-# https://github.com/python/cpython/blob/3.9/Lib/xml/etree/ElementTree.py
-def _escape_cdata(text: str) -> str:
-    if "&" in text:
-        text = text.replace("&", "&amp;")
-    if "<" in text:
-        text = text.replace("<", "&lt;")
-    if ">" in text:
-        text = text.replace(">", "&gt;")
-    return text
-
-
-def _escape_attrib(text: str) -> str:
-    if "&" in text:
-        text = text.replace("&", "&amp;")
-    if "<" in text:
-        text = text.replace("<", "&lt;")
-    if ">" in text:
-        text = text.replace(">", "&gt;")
-    if "\"" in text:
-        text = text.replace("\"", "&quot;")
-    if "\r" in text:
-        text = text.replace("\r", "&#13;")
-    if "\n" in text:
-        text = text.replace("\n", "&#10;")
-    if "\t" in text:
-        text = text.replace("\t", "&#09;")
-    return text
-
-
-def _flatten(unit: Mapping[str,Set[str]]) -> Iterator[Tuple[str,str]]:
-    for key, values in unit.items():
-        for value in values:
-            yield key, value
-
-
-class TMXWriter(Writer):
-    def __init__(self, fh: TextIO, *, creation_date: Optional[datetime] = None):
-        self.fh = fh
-        self.creation_date = creation_date or datetime.now()
-
-    def __enter__(self) -> 'TMXWriter':
-        self.fh.write('<?xml version="1.0"?>\n'
-                      '<tmx version="1.4">\n'
-                      '  <header o-tmf="PlainText" datatype="PlainText" segtype="sentence" creationtool="tmxutil" creationdate="' + self.creation_date.strftime('%Y%m%dT%H%M%S') + '">\n'
-                      '  </header>\n'
-                      '  <body>\n')
-        return self
-
-    def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
-        self.fh.write('  </body>\n'
-                      '</tmx>\n')
-
-    def write(self, unit: TranslationUnit) -> None:
-        self.fh.write('    <tu tuid="' + _escape_attrib(str(first(unit['id']))) + '">\n')
-        for key, value in sorted(_flatten(unit)):
-            if key != 'id' and value:
-                self.fh.write('      <prop type="' + _escape_attrib(key) + '">' + _escape_cdata(str(value)) + '</prop>\n')
-
-        for lang, translation in sorted(unit.translations.items()):
-            self.fh.write('      <tuv xml:lang="' + _escape_attrib(lang) + '">\n')
-            for key, value in sorted(_flatten(translation)):
-                if value:
-                    self.fh.write('        <prop type="' + _escape_attrib(key) + '">' + _escape_cdata(str(value)) + '</prop>\n')
-            self.fh.write('        <seg>' + _escape_cdata(str(translation.text)) + '</seg>\n'
-                          '      </tuv>\n')
-        self.fh.write('    </tu>\n')
-
-
-class TabReader(Reader):
-    def __init__(self, fh: TextIO, src_lang: str, trg_lang: str, columns: Iterable[str] = ['source-document-1', 'source-document-2', 'text-1', 'text-2', 'score-aligner']):
-        self.fh = fh
-        self.src_lang = src_lang
-        self.trg_lang = trg_lang
-        self.columns = columns
-
-    def close(self) -> None:
-        self.fh.close()
-
-    def records(self) -> Iterator[TranslationUnit]:
-        class Variant:
-            __slots__ = ('lang', 'unit')
-            def __init__(self, lang: str):
-                self.lang = lang
-                self.unit = TranslationUnitVariant()
-
-        for n, line in enumerate(self.fh):
-            # Skip blank lines
-            if line.strip() == '':
-                continue
-
-            values = line.rstrip('\n').split('\t')
-
-            record = TranslationUnit(id={str(n)})
-
-            var1 = Variant(self.src_lang)
-
-            var2 = Variant(self.trg_lang)
-
-            for column, value in zip(self.columns, values):
-                if column == '-':
-                    continue
-
-                if column.endswith('-1') or column.endswith('-2'):
-                    variant = var1 if column.endswith('-1') else var2
-
-                    if column[:-2] == 'lang':
-                        variant.lang = value
-                    elif column[:-2] == 'text':
-                        variant.unit.text = value
-                    else:
-                        variant.unit[column[:-2]] = {value}
-                else:
-                    record.setdefault(column, set()).add(value)
-
-            record.translations = {
-                var1.lang: var1.unit,
-                var2.lang: var2.unit
-            }
-
-            yield record
-
-
-A = TypeVar('A')
-B = TypeVar('B')
-def first(it: Iterable[A], default: Optional[B] = None) -> Optional[Union[A,B]]:
-    return next(iter(it), default)
-
-
-class TabWriter(Writer):
-    fh: TextIO
-    columns: List[Callable[[TranslationUnit], Iterable[Any]]]
-
-    def __init__(self, fh: TextIO, columns: List[Callable[[TranslationUnit], Iterable[Any]]]):
-        self.fh = fh
-        self.columns = columns
-
-    def __enter__(self) -> 'TabWriter':
-        self.writer = csv.writer(self.fh, delimiter='\t')
-        return self
-
-    def write(self, unit: TranslationUnit) -> None:
-        self.writer.writerow([
-            ';'.join(map(str, getter(unit)))
-            for getter in self.columns
-        ])
-
-class TxtWriter(Writer):
-    def __init__(self, fh: TextIO, language: str):
-        self.fh = fh
-        self.language = language
-
-    def write(self, unit: TranslationUnit) -> None:
-        print(unit.translations[self.language].text, file=self.fh)
-
-
-class PyWriter(Writer):
-    def __init__(self, fh: TextIO):
-        self.fh = fh
-
-    def write(self, unit: TranslationUnit) -> None:
-        pprint(unit, stream=self.fh)
-
-
-class TranslationUnitUnpickler(pickle.Unpickler):
-    def find_class(self, module: str, name: str) -> Type[Any]:
-        if module == 'tmxutil' or module == '__main__':
-            if name == 'TranslationUnitVariant':
-                return TranslationUnitVariant
-            elif name == 'TranslationUnit':
-                return TranslationUnit
-        raise pickle.UnpicklingError("global '{}.{}' is forbidden".format(module, name))
-
-
-class PickleReader(Reader):
-    def __init__(self, fh: BinaryIO):
-        self.fh = fh
-
-    def close(self) -> None:
-        self.fh.close()
-
-    def records(self) -> Iterator[TranslationUnit]:
-        try:
-            while True:
-                unit = TranslationUnitUnpickler(self.fh).load()
-                assert isinstance(unit, TranslationUnit)
-                yield unit
-        except EOFError:
-            pass
-
-
-class PickleWriter(Writer):
-    def __init__(self, fh: BinaryIO):
-        self.fh = fh
-
-    def write(self, unit: TranslationUnit) -> None:
-        pickle.dump(unit, self.fh)
-
-
-class CountWriter(Writer):
-    """Instead of writing tmx records, it counts a property and writes a summary
-    of which values it encountered for that property, and how often it encountered
-    them."""
-    def __init__(self, fh: TextIO, key: Callable[[TranslationUnit], List[Any]]):
-        self.fh = fh
-        self.key = key
-
-    def __enter__(self) -> 'CountWriter':
-        self.counter = Counter() # type: Counter[Any]
-        return self
-
-    def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
-        if type is None:
-            for key, count in sorted(self.counter.most_common(), key=itemgetter(1), reverse=True):
-                self.fh.write("{}\t{}\n".format(count, " ".join(sorted(key)) if isinstance(key, frozenset) else key))
-
-    def write(self, unit: TranslationUnit) -> None:
-        self.counter.update(self.key(unit))
-
-
-class LiveCountWriter(Writer):
-    """Live variant of CountWriter: shows live updating bars while counting."""
-    def __init__(self, fh: TextIO, key: Callable[[TranslationUnit], List[Any]]):
-        self.fh = fh
-        self.key = key
-        self.top_n = 10
-
-    def __enter__(self) -> 'LiveCountWriter':
-        self.counter = Counter() # type: Counter[Any]
-        self.total = 0
-        self.bars: tqdm = []
-        self.n = 0
-        self.last_update = time()
-        self.last_n = 0
-        self.update_interval = 128
-        return self
-
-    def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
-        for bar in self.bars:
-            bar.close()
-
-        if type is None:
-            for key, count in self.counter.most_common():
-                self.fh.write("{}\t{}\n".format(count, " ".join(sorted(key)) if isinstance(key, frozenset) else key))
-
-    def refresh(self):
-        top = self.counter.most_common(self.top_n)
-        remainder = len(self.counter) - len(top)
-
-        if remainder:
-            remainder_count = self.total - sum(count for _, count in top)
-            top.append(('({} more)'.format(remainder), remainder_count))
-
-        # Make sure we've enough bars
-        while len(top) > len(self.bars):
-            self.bars.append(tqdm(
-                position=len(self.bars)+1,
-                unit='unit',
-                file=sys.stderr,
-                dynamic_ncols=True,
-                bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}'))
-
-        # Determine the label length for alignment
-        label_len=max(len(str(value)) for value, _ in top)
-
-        # Update all bars (sorting by most common on top)
-        for bar, (value, count) in zip(self.bars, top):
-            bar.set_description('{{: <{:d}s}}'.format(label_len+2).format(str(value)), refresh=False)
-            bar.total = self.total
-            bar.n = count
-            bar.refresh()
-
-    def _count_iter(self, iterable):
-        count = 0
-        for item in iterable:
-            count += 1
-            yield item
-        self.total += count
-
-    def _smooth(self, current, target):
-        return floor(0.7 * current + 0.3 * target)
-
-    def write(self, unit: TranslationUnit) -> None:
-        vals = self.key(unit)
-        self.counter.update(self._count_iter(vals))
-        self.n += 1
-        if self.n % self.update_interval == 0:
-            time_since_last_update = max(time() - self.last_update, 1e-10)
-            n_per_sec = self.update_interval / time_since_last_update
-            self.update_interval = max(self._smooth(self.update_interval, 0.5 * n_per_sec), 1)
-            self.last_update = time()
-            self.refresh()
-
-
-class IPCLabeler(object):
-    """Add IPC labels to sentence pairs based on the patent ids found in the
-    source-document property of either side of the pair."""
-
-    #lut: Dict[Tuple[str,str], Set[str]]
-
-    def __init__(self, files: List[TextIO] = []):
-        self.lut = dict() # type: Dict[Tuple[str,str], Set[str]]
-        for fh in files:
-            self.load(fh)
-
-    def load(self, fh: TextIO) -> None:
-        for line in fh:
-            parts = line.split('\t', 11)
-            if len(parts) != 6 and len(parts) != 12:
-                warning("Expected 6 or 12 fields while reading IPC file, found %d, in %s:%d", len(parts), fh.name, line)
-                continue
-            src_id, _, _, _, src_lang, src_ipcs = parts[:6]
-            self.lut[(src_lang.lower(), src_id)] = set(ipc.strip() for ipc in src_ipcs.split(',') if ipc.strip() != '')
-            if len(parts) == 12:
-                trg_id, _, _, _, trg_lang, trg_ipcs = parts[6:]
-                self.lut[(trg_lang.lower(), trg_id)] = set(ipc.strip() for ipc in trg_ipcs.split(',') if ipc.strip() != '')
-
-    def annotate(self, unit: TranslationUnit) -> TranslationUnit:
-        for lang, translation in unit.translations.items():
-            # Ignoring type because https://github.com/python/mypy/issues/2013
-            translation['ipc'] = set().union(*(
-                self.lut[(lang.lower(), url)]
-                for url in translation['source-document']
-                if (lang.lower(), url) in self.lut
-            )) # type: ignore
-        return unit
-
-
-class IPCGroupLabeler(object):
-    """Add overall IPC group ids based on IPC labels added by IPCLabeler."""
-
-    #patterns: List[Tuple[str,Set[str]]]
-
-    def __init__(self, files: List[TextIO] = []):
-        self.patterns = [] # type: List[Tuple[str,Set[str]]]
-        for fh in files:
-            self.load(fh)
-
-    def load(self, fh: TextIO) -> None:
-        for line in fh:
-            prefix, group, *_ = line.split('\t', 2)
-            self.patterns.append((
-                prefix.strip(),
-                {prefix.strip(), group.strip()} if prefix.strip() != "" else {group.strip()}
-            ))
-
-        # Sort with most specific on top
-        self.patterns.sort(key=lambda pattern: (-len(pattern[0]), pattern[0]))
-
-    def find_group(self, ipc_code: str) -> Set[str]:
-        for prefix, groups in self.patterns:
-            if ipc_code.startswith(prefix):
-                return groups
-        return set()
-
-    def annotate(self, unit: TranslationUnit) -> TranslationUnit:
-        for lang, translation in unit.translations.items():
-            translation['ipc-group'] = set().union(*map(self.find_group, translation['ipc'])) # type: ignore
-        return unit
-
-
-def text_key(unit: TranslationUnit) -> Tuple[str,...]:
-    return tuple(translation.text
-        for translation in unit.translations.values())
-
-
-def deduplicate(reader: Iterator[TranslationUnit], key: Callable[[TranslationUnit], Any], sort_key: Callable[[TranslationUnit], Any] = lambda unit: 0, mem_limit:int = 2 * 10**9) -> Iterator[TranslationUnit]:
-    """
-    Deduplicate records read from reader. It does this by creating a hash table
-    of all records, grouped by key(record). If multiple records have the same
-    key they are combined if properties allow this (i.e. sets, lists) or
-    overwritten in case compare(current, new) is True. See deduplicate_merge().
-
-    Note: This function behaves like an iterator but will only start yielding
-    results once reader has run out of records.
-
-    Note: If the memory usage becomes too large (because storing all unique
-    units is taking up too much storage) it will fall back to deduplicate_external
-    which uses a file as backing for temporarily storing translation units.
-    """
-
-    best = dict() # type: Dict[int,TranslationUnit]
-
-    try:
-        first_unit = next(reader)
-    except StopIteration:
-        return reader
-
-    for n, unit in enumerate(chain([first_unit], reader), start=1):
-        unit_id = hash(key(unit))
-
-        if unit_id in best:
-            best[unit_id] = deduplicate_merge(best[unit_id], unit, sort_key)
-        else:
-            best[unit_id] = unit
-
-        if n % 10000 == 0:
-            mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
-            info('best contains %d (%d processed) entries (%1.2f GB)', len(best), n, mem_usage / 10**9)
-            if mem_usage > mem_limit:
-                info("Exceeded in-memory size limit, switching to file-backed deduplication")
-                already_processed = best.values()
-                del best
-                yield from deduplicate_external(chain(already_processed, reader), key, sort_key)
-                break
-    else:
-        yield from best.values()
-
-
-def deduplicate_external(reader: Iterator[TranslationUnit], key: Callable[[TranslationUnit], Any], sort_key: Callable[[TranslationUnit], Any] = lambda unit: 0) -> Iterator[TranslationUnit]:
-    best = OrderedDict() # type: Dict[int,List[int]]
-
-    with TemporaryFile() as fh:
-        for n, unit in enumerate(reader, start=1):
-            offset = fh.tell()
-
-            pickle.dump(unit, fh)
-
-            unit_id = hash(key(unit))
-
-            best.setdefault(unit_id, []).append(offset)
-
-            if n % 10000 == 0:
-                mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
-                disk_usage = fh.tell()
-                info('best contains %d (%d processed) entries (mem: %1.2f GB, disk: %1.2f GB)', len(best), n, mem_usage / 10**9, disk_usage / 10**9)
-
-        info('All entries inspected, %d unique entries; building output', len(best))
-
-        for n, duplicates in enumerate(best.values(), start=1):
-            best_unit = TranslationUnit()
-
-            for offset in duplicates:
-                fh.seek(offset)
-                unit = pickle.load(fh)
-
-                if not best_unit:
-                    best_unit = unit
-                else:
-                    best_unit = deduplicate_merge(best_unit, unit, sort_key)
-
-            if n % 10000 == 0:
-                info('%d out of %d built', n, len(best))
-
-            yield best_unit
-
-
-def deduplicate_merge(best_unit: TranslationUnit, new_unit: TranslationUnit, sort_key: Callable[[TranslationUnit], Any]) -> TranslationUnit:
-    """Merges new_unit into best_unit, combining collections but overwriting
-    all other entries if and only if compare(current, new) is true"""
-    new_is_better = sort_key(new_unit) < sort_key(best_unit)
-
-    if new_is_better:
-        for key, value in new_unit.items():
-            best_unit[key] = value
-
-    for lang, translation in new_unit.translations.items():
-        best_unit.translations[lang].updateVariant(translation)
-
-    return best_unit
-
-
-T = TypeVar('T', float, str)
-
-def build_binary_condition(type: Type[T], op: Callable[[T,T], bool]) -> Callable[[Callable[[TranslationUnit], Iterable[Any]],str], Callable[[TranslationUnit], bool]]:
-    """Wrapper for standard python operations on types. I.e. to implement gt
-    and lt."""
-    def build_condition(lhs: Callable[[TranslationUnit], Iterable[Any]], rhs: str) -> Callable[[TranslationUnit], bool]:
-        return lambda unit: any(op(type(el), type(rhs)) for el in lhs(unit))
-    return build_condition
-
-
-def build_regex_condition(lhs: Callable[[TranslationUnit], Iterable[Any]], rhs: str) -> Callable[[TranslationUnit], bool]:
-    """Specialised version (or wrapper around) build_binary_condition that makes
-    one that tests a regular expression."""
-    pattern = re.compile(rhs)
-    return lambda unit: any(pattern.search(str(el)) is not None for el in lhs(unit))
-
-
-condition_operators = {
-    '<': build_binary_condition(float, operator.lt),
-    '>': build_binary_condition(float, operator.gt),
-    '<=': build_binary_condition(float, operator.le),
-    '>=': build_binary_condition(float, operator.ge),
-    '=': build_binary_condition(str, operator.eq),
-    '=~': build_regex_condition
-}
-
-
-def set_property(key: str, value: str, unit: TranslationUnit) -> TranslationUnit:
-    unit[key] = {value}
-    return unit
-
-
-def del_properties(properties: List[str], unit: TranslationUnit) -> TranslationUnit:
-    for prop in properties:
-        del unit[prop]
-    return unit
-
-
-def parse_properties(props: str) -> Dict[str,Set[str]]:
-    properties: Dict[str,Set[str]] = {}
-    for prop in props.split(','):
-        key, value = prop.split('=', 1)
-        properties.setdefault(key, set()).add(value)
-    return properties
-
-
-def parse_condition(operators: Mapping[str,Callable[[str,str], Callable[[TranslationUnit], bool]]], expr: str, functions={}) -> Callable[[TranslationUnit], bool]:
-    pattern = r'^(?P<lhs>.+?)(?P<op>{operators})(?P<rhs>.*)$'.format(
-        operators='|'.join(re.escape(op) for op in sorted(operators.keys(), key=len, reverse=True)))
-
-    match = re.match(pattern, expr)
-
-    if match is None:
-        raise ValueError("Could not parse condition '{}'".format(expr))
-
-    info("Using expression op:'%(op)s' lhs:'%(lhs)s' rhs:'%(rhs)s'", match.groupdict())
-
-    prop_getter = parse_property_getter(match.group('lhs'), functions=functions)
-
-    return operators[match.group('op')](prop_getter, match.group('rhs'))
-
-
-def parse_property_getter(expr: str, functions: Dict[str,Callable[[Any],Any]] = {'len': len}) -> Callable[[TranslationUnit], Iterable[Any]]:
-    ops = []
-
-    while True:
-        match = re.match(r'^(?P<fun>[a-zA-Z_]\w*)\((?P<expr>.+?)\)$', expr)
-        if not match:
-            break
-
-        if not match.group('fun') in functions:
-            raise ValueError('Function `{}` in expression `{}` not found.'.format(match.group('fun'), expr))
-
-        ops.insert(0, functions[match.group('fun')])
-        expr = match.group('expr')
-
-    match = re.match(r'^((?P<lang>[\w-]+)?(?P<dot>\.))?(?P<prop>[\w-]+)(?P<brackets>\[\])?$', expr)
-    if not match:
-        raise ValueError('Could not interpret expression `{}`'.format(expr))
-
-    prop = match.group('prop')
-
-    # 'en.source-document' or 'en.text'
-    if match.group('lang'):
-        lang = match.group('lang')
-        if prop == 'text':
-            val_getter = lambda unit: [unit.translations[lang].text]
-        else:
-            val_getter = lambda unit: unit.translations[lang][prop]
-    # e.g. '.collection', only look in root
-    elif match.group('dot'):
-        val_getter = lambda unit: unit[prop]
-    # e.g. 'text'; text can only occur in translations
-    elif prop == 'text':
-        val_getter = lambda unit: (translation.text for translation in unit.translations.values())
-    # e.g. 'source-document' or 'collection'; search through both root and translations
-    else:
-        val_getter = lambda unit: reduce(lambda acc, translation: acc + list(translation.get(prop, [])), unit.translations.values(), list(unit.get(prop, [])))
-
-    if match.group('brackets'):
-        agg_getter = lambda unit: [frozenset(val_getter(unit))] # convert to frozenset so it can be used as key in dict/Counter
-    else:
-        agg_getter = val_getter
-
-    if ops:
-        fun_getter = lambda unit: (reduce(lambda val, op: op(val), ops, val) for val in agg_getter(unit))
-    else:
-        fun_getter = agg_getter
-
-    return fun_getter
-
-
-def closer(fh: Any) -> Generator[Any,None,None]:
-    """Generator that closes fh once it is its turn."""
-    if hasattr(fh, 'close'):
-        fh.close()
-    yield from []
-
-
-def is_gzipped(fh: BufferedBinaryIO) -> bool:
-    """Test if stream is probably a gzip stream"""
-    return fh.peek(2).startswith(b'\x1f\x8b')
-
-
-def make_reader(fh: BufferedBinaryIO, *, input_format: Optional[str] = None, input_columns: Optional[Iterable[str]] = None, input_languages: Optional[Sequence[str]] = None, progress:bool = False, **kwargs: Any) -> Iterator[TranslationUnit]:
-    if tqdm and progress:
-        fh = ProgressWrapper(fh)
-
-    if is_gzipped(fh):
-        fh = cast(BufferedBinaryIO, gzip.open(fh))
-
-    if not input_format:
-        file_format, format_args = autodetect(fh)
-    else:
-        file_format, format_args = input_format, {}
-
-    if file_format == 'tab' and 'columns' not in format_args and input_columns:
-        format_args['columns'] = input_columns
-
-    if file_format == 'pickle':
-        reader: Reader = PickleReader(fh)
-    elif file_format == 'tmx':
-        reader = TMXReader(fh)
-    elif file_format == 'tab':
-        if not input_languages or len(input_languages) != 2:
-            raise ValueError("'tab' format needs exactly two input languages specified")
-        text_fh = TextIOWrapper(fh, encoding='utf-8')
-        reader = TabReader(text_fh, *input_languages, **format_args)
-    else:
-        raise ValueError("Cannot create file reader for format '{}'".format(file_format))
-
-    # Hook an empty generator to the end that will close the file we opened.
-    return chain(reader, closer(reader))
-
-
-def peek_first_line(fh: BufferedBinaryIO, length: int = 128) -> bytes:
-    """Tries to get the first full line in a buffer that supports peek."""
-    while True:
-        buf = fh.peek(length)
-
-        pos = buf.find(b'\n')
-        if pos != -1:
-            return buf[0:pos]
-
-        if len(buf) < length:
-            return buf
-
-        buf *= 2
-
-
-def autodetect(fh: BufferedBinaryIO) -> Tuple[str, Dict[str,Any]]:
-    """Fill in arguments based on what we can infer from the input we're going
-    to get. fh needs to have a peek() method and return bytes."""
-
-    # First test: is it XML?
-    xml_signature = b'<?xml '
-    if fh.peek(len(xml_signature)).startswith(xml_signature):
-        return 'tmx', {}
-
-    # Second test: tab-separated? Count the columns on the first line.
-    column_count = len(peek_first_line(fh).split(b'\t'))
-
-    if column_count >= 7:
-        return 'tab', {'columns': ['source-document-1', 'source-document-2', 'text-1', 'text-2', 'hash-bifixer', 'score-bifixer', 'score-bicleaner']}
-
-    if column_count >= 5:
-        return 'tab', {'columns': ['source-document-1', 'source-document-2', 'text-1', 'text-2', 'score-aligner']}
-
-    raise ValueError('Did not recognize file format')
-
-
-def first_item_getter(key: str) -> Callable[[TranslationUnit], Optional[str]]:
-    """Creates a getter that gets one value from a translation unit's properties,
-    if there are more values for that property, it's undefined which one it gets.
-    If the property does not exist, or is empty, it will return None."""
-    def getter(obj: TranslationUnit) -> Optional[str]:
-        return first(obj.get(key, set()), default=None)
-    return getter
-
-
-def make_deduplicator(args: Namespace, reader: Iterator[TranslationUnit], mem_limit : int = 2 * 10**9) -> Iterator[TranslationUnit]:
-    """
-    Make a deduplicate filter based on the input options. Fancy bifixer based
-    deduplicator if we have the data, otherwise fall back to boring deduplicator.
-    """
-
-    # Grab the first object from the reader to see what we're dealing with
-    try:
-        peeked_obj = next(reader)
-    except StopIteration:
-        # It's an empty reader. No need to wrap it in anything deduplicating.
-        return reader
-
-    # Stick the peeked object back on :P
-    reader = chain([peeked_obj], reader)
-
-    if 'hash-bifixer' in peeked_obj and 'score-bifixer' in peeked_obj:
-        return deduplicate(reader, key=first_item_getter('hash-bifixer'), sort_key=first_item_getter('score-bifixer'), mem_limit=mem_limit)
-    else:
-        return deduplicate(reader, key=text_key, mem_limit=mem_limit)
-
-
-def abort(message: str) -> int:
-    """Abandon ship! Use in case of misguided users."""
-    print(message, file=sys.stderr)
-    return 1
-
-
-def properties_adder(properties: Dict[str,Set[str]], reader: Iterator[TranslationUnit]) -> Iterator[TranslationUnit]:
-    for unit in reader:
-        unit.update(properties)
-        yield unit
-
-
-def import_file_as_module(file):
-    filename = os.path.basename(file)
-    basename, ext = os.path.splitext(filename)
-    if ext not in {'.py'}:
-        raise ValueError('Error importing {}: can only import .py files'.format(file))
-
-    spec = importlib.util.spec_from_file_location(basename, file)
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module
-
-
-def main(argv: List[str], stdin: BufferedBinaryIO, stdout: BufferedBinaryIO) -> int:
-    parser = ArgumentParser(
-        formatter_class=RawDescriptionHelpFormatter,
-        description='Annotate, analyze, filter and convert (mainly) tmx files',
-        epilog=dedent('''
-            Supported syntax for FILTER_EXPR:
-              Syntax: PROP_EXPR OPERATOR VALUE where:
-                PROP_EXPR   Either 'text' or the value of the "type" attribute
-                            of a <prop/> element.
-                OPERATOR    Supported operators:
-                            >, >=, <, <= for numeric comparisons.
-                            =, =~ for string comparisons.
-                VALUE       String, number or regular expression.
-
-              Examples:
-                collection=europat   Matches sentence pairs that have a property
-                                     'collection' that is exactly 'europat'.
-                text=~euro.*         Matches pairs that match a regular expression.
-                id>400               Matches pairs that have an id larger than 400
-
-            Supported syntax for PROP_EXPR:
-              Syntax: [FUNCTION] ( [LANG] [.] PROPERTY [\\[\\]] ) where all except
-              PROPERTY is optional. If FUNCTION is not used, you don't need the
-              parenthesis. The [] after PROPERTY can be used to indicate that
-              all values of that property for a <tu/> should be treated as a
-              single set.
-
-              Examples:
-                source-document   Count every prop type "source-document", either as
-                                  part of the <tu/> or <tuv/>.
-                .collection       Count every collection observed as a prop of the
-                                  sentence pair.
-                .collection[]     Count every combination of
-                                  <prop type="collection"/> observed in <tu/>.
-                len(en.text)      String length of english side of the sentence pair.
-                                  You can use your own functions using --include.
-        '''))
-    parser.add_argument('-i', '--input-format', choices=['tmx', 'tab', 'pickle'], help='Input file format. Automatically detected if left unspecified.')
-    parser.add_argument('-o', '--output-format', choices=['tmx', 'tab', 'txt', 'py', 'pickle'], default='tmx', help='Output file format. Output is always written to stdout.')
-    parser.add_argument('-l', '--input-languages', nargs=2, help='Input languages in case of tab input. Needs to be in the order of their appearance in the columns.')
-    parser.add_argument('-c', '--input-columns', nargs='+', help='Input columns in case of tab input. Column names ending in -1 or -2 will be treated as translation-specific.')
-    parser.add_argument('--output-languages', nargs='+', help='Output languages for tab and txt output. txt output allows only one language, tab multiple.')
-    parser.add_argument('--output-columns', metavar="PROP_EXPR", nargs='+', help='Output columns for tab output. Use {lang}.{property} syntax to select language specific properties such as en.source-document or de.text.')
-    parser.add_argument('--output', default=stdout, type=FileType('wb'), help='Output file. Defaults to stdout.')
-    parser.add_argument('--creation-date', type=fromisoformat, default=datetime.now(), help='override creation date in tmx output.')
-    parser.add_argument('-p', '--properties', action='append', help='List of A=B,C=D properties to add to each sentence pair. You can use one --properties for all files or one for each input file.')
-    parser.add_argument('-d', '--deduplicate', action='store_true', help='Deduplicate units before printing. Unit properties are combined where possible. If score-bifixer and hash-bifixer are available, these will be used.')
-    parser.add_argument('--drop', nargs='+', dest='drop_properties', help='Drop properties from output.')
-    parser.add_argument('--renumber-output', action='store_true', help='Renumber the translation unit ids. Always enabled when multiple input files are given.')
-    parser.add_argument('--ipc', dest='ipc_meta_files', nargs="+", type=FileType('r'), help='One or more IPC metadata files.')
-    parser.add_argument('--ipc-group', dest='ipc_group_files', nargs="+", type=FileType('r'), help='One or more IPC grouping files.')
-    parser.add_argument('--with', nargs='+', action='append', dest='filter_with', metavar='FILTER_EXPR')
-    parser.add_argument('--without', nargs='+', action='append', dest='filter_without', metavar='FILTER_EXPR')
-    parser.add_argument('-P', '--progress', action='store_true', help='Show progress bar when reading files.')
-    logging_options = parser.add_mutually_exclusive_group()
-    logging_options.add_argument('-q', '--quiet', action='store_true', help='Hide issues encountered while reading files.')
-    logging_options.add_argument('-v', '--verbose', action='store_true', help='Print progress updates.')
-    parser.add_argument('--workspace', type=fromfilesize, help='Maximum memory usage for deduplication. When exceeded, will continue deduplication using filesystem.', default='4G')
-    parser.add_argument('--count', dest='count_property', help='Count which values occur for a property.', metavar='COUNT_EXPR')
-    parser.add_argument('--include', action='append', default=[], dest='count_libraries', help='Include a python file so functions defined in that file can be used with --count, e.g. include something that provides a domain(url:str) function, and use `--count domain(source-document)`.')
-    parser.add_argument('files', nargs='*', default=[stdin], type=FileType('rb'), help='Input files. May be gzipped. If not specified stdin is used.')
-
-    # I prefer the modern behaviour where you can do `tmxutil.py -p a=1 file.tmx
-    # -p a=2 file2.tmx` etc. but that's only available since Python 3.7.
-    if hasattr(parser, 'parse_intermixed_args'):
-        args = parser.parse_intermixed_args(argv)
-    else:
-        args = parser.parse_args(argv)
-
-    if args.verbose:
-        getLogger().setLevel(INFO)
-    elif args.quiet:
-        getLogger().setLevel(ERROR)
-
-    # Load in functions early so if anything is wrong with them we'll know before
-    # we attempt to parse anything.
-    functions = reduce(lambda obj, file: {**obj, **import_file_as_module(file).__dict__},
-        args.count_libraries, {'len': len})
-
-    # Create reader. Make sure to call make_reader immediately and not somewhere
-    # down in a nested generator so if one of the files cannot be found, we
-    # error out immediately.
-    readers = [make_reader(fh, **vars(args)) for fh in args.files]
-
-    # Add properties to each specific file? If so, do it before we chain all
-    # readers into a single iterator. If all share the same properties we'll
-    # add it after chaining multiple readers into one.
-    if args.properties and len(args.properties) > 1:
-        if len(args.properties) != len(readers):
-            return abort("When specifying multiple --properties options, you need"
-                         " to specify exactly one for each input file. You have {}"
-                         " --properties options, but {} files.".format(len(args.properties), len(readers)))
-
-        properties_per_file = (parse_properties(props) for props in args.properties)
-
-        readers = [properties_adder(properties, reader) for properties, reader in zip(properties_per_file, readers)]
-
-        # If we have multiple input files, the translation unit ids will be a mess
-        # when merged. So renumber them.
-        args.renumber_output = True
-
-    # Merge all readers into a single source of sentence pairs
-    reader = chain.from_iterable(readers)
-
-    # If we want to add properties (the same ones) to all input files, we do it
-    # now, after merging all readers into one.
-    if args.properties and len(args.properties) == 1:
-        properties = parse_properties(args.properties[0])
-        reader = properties_adder(properties, reader)
-
-    # Optional filter & annotation steps for reader.
-    if args.ipc_meta_files:
-        reader = map(IPCLabeler(args.ipc_meta_files).annotate, reader)
-
-    if args.ipc_group_files:
-        reader = map(IPCGroupLabeler(args.ipc_group_files).annotate, reader)
-
-    if args.filter_with:
-        dnf = [[parse_condition(condition_operators, cond_str, functions=functions) for cond_str in cond_expr] for cond_expr in args.filter_with]
-        reader = filter(lambda unit: any(all(expr(unit) for expr in cond) for cond in dnf), reader)
-
-    if args.filter_without:
-        dnf = [[parse_condition(condition_operators, cond_str, functions=functions) for cond_str in cond_expr] for cond_expr in args.filter_without]
-        reader = filter(lambda unit: all(any(not expr(unit) for expr in cond) for cond in dnf), reader)
-
-    if args.deduplicate:
-        reader = make_deduplicator(args, reader, mem_limit=args.workspace)
-
-    if args.renumber_output:
-        reader = starmap(partial(set_property, 'id'), enumerate(reader, start=1))
-
-    # If we want to drop properties from the output, do that as the last step.
-    if args.drop_properties:
-        reader = map(partial(del_properties, args.drop_properties), reader)
-
-    # Create writer
-    with ExitStack() as ctx:
-        if args.output_format == 'pickle':
-            writer = ctx.enter_context(PickleWriter(args.output)) # type: Writer
-
-        else:
-            text_out = ctx.enter_context(TextIOWrapper(args.output, encoding='utf-8'))
-
-            if args.count_property:
-                count_property = parse_property_getter(args.count_property, functions=functions)
-
-
-                if tqdm and args.progress:
-                    writer = ctx.enter_context(LiveCountWriter(text_out, key=count_property))
-                else:
-                    writer = ctx.enter_context(CountWriter(text_out, key=count_property))
-            elif args.output_format == 'tmx':
-                writer = ctx.enter_context(TMXWriter(text_out, creation_date=args.creation_date))
-            elif args.output_format == 'tab':
-                if not args.output_columns:
-                    if not args.output_languages:
-                        return abort("Use --output-languages X Y to select the order of the columns in the output, or use --output-columns directly.")
-                    args.output_columns = [
-                        *(f'{lang}.source-document' for lang in args.output_languages),
-                        *(f'{lang}.text' for lang in args.output_languages)
-                    ]
-
-                column_getters = [
-                    parse_property_getter(expr, functions=functions)
-                    for expr in args.output_columns
-                ]
-
-                writer = ctx.enter_context(TabWriter(text_out, column_getters))
-            elif args.output_format == 'txt':
-                if not args.output_languages or len(args.output_languages) != 1:
-                    return abort("Use --output-languages X to select which language."
-                                 " When writing txt, it can only write one language at"
-                                 " a time.")
-                writer = ctx.enter_context(TxtWriter(text_out, args.output_languages[0]))
-            elif args.output_format == 'py':
-                writer = ctx.enter_context(PyWriter(text_out))
-            elif args.output_format == 'pickle':
-                writer = ctx.enter_context(PickleWriter(args.output))
-            else:
-                raise ValueError('Unknown output format: {}'.format(args.output_format))
-
-        # Main loop. with statement for writer so it can write header & footer
-        count = 0
-        for unit in reader:
-            writer.write(unit)
-            count += 1
-        info("Written %d records.", count)
-
-    return 0
-
-
-if __name__ == '__main__':
-    try:
-        sys.exit(main(sys.argv[1:], sys.stdin.buffer, sys.stdout.buffer))
-    except ValueError as e:
-        sys.exit(abort("Error: {}".format(e)))

From 3547a560baccd60a41b35f1c70476f003b3174ed Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 26 Oct 2021 21:03:18 +0200
Subject: [PATCH 2/5] Add license

---
 LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6f925d0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 University of Edinburgh
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

From cbacbf8d378737ae0b3bdc27a1a3fed847c61ead Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 6 Dec 2022 14:44:28 +0000
Subject: [PATCH 3/5] Fix types

---
 src/tmxutil/__init__.py      | 4 +++-
 src/tmxutil/cli.py           | 2 +-
 src/tmxutil/formats/count.py | 8 +++++---
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/tmxutil/__init__.py b/src/tmxutil/__init__.py
index 702d616..f97179c 100644
--- a/src/tmxutil/__init__.py
+++ b/src/tmxutil/__init__.py
@@ -1,5 +1,7 @@
+__version__ = 'dev'
+
 try:
     import pkg_resources
     __version__ = pkg_resources.require('tmxutil-pkg-jelmervdl')[0].version
 except:
-    __version__ = 'dev'
+    pass

diff --git a/src/tmxutil/cli.py b/src/tmxutil/cli.py
index fd9898d..1cee835 100755
--- a/src/tmxutil/cli.py
+++ b/src/tmxutil/cli.py
@@ -94,7 +94,7 @@ def parse_condition(operators: Mapping[str,Callable[[str,str], Callable[[Transla
     return operators[match.group('op')](prop_getter, match.group('rhs'))
 
 
-def parse_property_getter(expr: str, functions: Dict[str,Callable[[Any],Any]] = {'len': len}) -> Callable[[TranslationUnit], Iterable[Any]]:
+def parse_property_getter(expr: str, functions: Mapping[str,Callable[[Any],Any]] = {'len': len}) -> Callable[[TranslationUnit], Iterable[Any]]:
     ops = [] #type: List[Callable[[Any], Any]]
 
     while True:

diff --git a/src/tmxutil/formats/count.py b/src/tmxutil/formats/count.py
index ed9f603..e64e698 100644
--- a/src/tmxutil/formats/count.py
+++ b/src/tmxutil/formats/count.py
@@ -1,6 +1,8 @@
 from tmxutil.types import Writer, TranslationUnit
 from tmxutil.interactive import tqdm
-from typing import Optional, Callable, TextIO, List, Any, Type
+from collections import Counter
+from time import time
+from typing import Optional, Callable, TextIO, Iterable, Any, Type
 from types import TracebackType
 from operator import itemgetter
 
@@ -9,7 +11,7 @@ class CountWriter(Writer):
     """Instead of writing tmx records, it counts a property and writes a summary
     of which values it encountered for that property, and how often it encountered
     them."""
-    def __init__(self, fh: TextIO, key: Callable[[TranslationUnit], List[Any]]):
+    def __init__(self, fh: TextIO, key: Callable[[TranslationUnit], Iterable[Any]]):
         self.fh = fh
         self.key = key
 
@@ -28,7 +30,7 @@ def write(self, unit: TranslationUnit) -> None:
 
 class LiveCountWriter(Writer):
     """Live variant of CountWriter: shows live updating bars while counting."""
-    def __init__(self, fh: TextIO, key: Callable[[TranslationUnit], List[Any]]):
+    def __init__(self, fh: TextIO, key: Callable[[TranslationUnit], Iterable[Any]]):
         self.fh = fh
         self.key = key
         self.top_n = 10

From dfe3032a047a27bdfef7cf3c8359e9f70ba92d9d Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 6 Dec 2022 14:48:57 +0000
Subject: [PATCH 4/5] More pythonic python package

---
 setup.py                                        | 7 +++----
 {src/tmxutil => tmxutil}/__init__.py            | 0
 {src/tmxutil => tmxutil}/__main__.py            | 0
 {src/tmxutil => tmxutil}/cli.py                 | 0
 {src/tmxutil => tmxutil}/filters/__init__.py    | 0
 {src/tmxutil => tmxutil}/filters/deduplicate.py | 0
 {src/tmxutil => tmxutil}/filters/ipc.py         | 0
 {src/tmxutil => tmxutil}/formats/__init__.py    | 0
 {src/tmxutil => tmxutil}/formats/count.py       | 0
 {src/tmxutil => tmxutil}/formats/json.py        | 0
 {src/tmxutil => tmxutil}/formats/pickle.py      | 0
 {src/tmxutil => tmxutil}/formats/tab.py         | 0
 {src/tmxutil => tmxutil}/formats/tmx.py         | 0
 {src/tmxutil => tmxutil}/formats/txt.py         | 0
 {src/tmxutil => tmxutil}/interactive.py         | 0
 {src/tmxutil => tmxutil}/types.py               | 0
 {src/tmxutil => tmxutil}/utils.py               | 0
 17 files changed, 3 insertions(+), 4 deletions(-)
 rename {src/tmxutil => tmxutil}/__init__.py (100%)
 rename {src/tmxutil => tmxutil}/__main__.py (100%)
 rename {src/tmxutil => tmxutil}/cli.py (100%)
 rename {src/tmxutil => tmxutil}/filters/__init__.py (100%)
 rename {src/tmxutil => tmxutil}/filters/deduplicate.py (100%)
 rename {src/tmxutil => tmxutil}/filters/ipc.py (100%)
 rename {src/tmxutil => tmxutil}/formats/__init__.py (100%)
 rename {src/tmxutil => tmxutil}/formats/count.py (100%)
 rename {src/tmxutil => tmxutil}/formats/json.py (100%)
 rename {src/tmxutil => tmxutil}/formats/pickle.py (100%)
 rename {src/tmxutil => tmxutil}/formats/tab.py (100%)
 rename {src/tmxutil => tmxutil}/formats/tmx.py (100%)
 rename {src/tmxutil => tmxutil}/formats/txt.py (100%)
 rename {src/tmxutil => tmxutil}/interactive.py (100%)
 rename {src/tmxutil => tmxutil}/types.py (100%)
 rename {src/tmxutil => tmxutil}/utils.py (100%)

diff --git a/setup.py b/setup.py
index 6b268de..fda981f 100644
--- a/setup.py
+++ b/setup.py
@@ -21,11 +21,10 @@
         "Operating System :: OS Independent",
     ],
     entry_points={
-        'console_scripts': [
-            'tmxutil=tmxutil.cli:entrypoint',
+        "console_scripts": [
+            "tmxutil=tmxutil.cli:entrypoint",
         ],
     },
-    package_dir={"": "src"},
-    packages=setuptools.find_packages(where="src"),
+    packages=["tmxutil"],
     python_requires=">=3.6",
 )
\ No newline at end of file
diff --git a/src/tmxutil/__init__.py b/tmxutil/__init__.py
similarity index 100%
rename from src/tmxutil/__init__.py
rename to tmxutil/__init__.py
diff --git a/src/tmxutil/__main__.py b/tmxutil/__main__.py
similarity index 100%
rename from src/tmxutil/__main__.py
rename to tmxutil/__main__.py
diff --git a/src/tmxutil/cli.py b/tmxutil/cli.py
similarity index 100%
rename from src/tmxutil/cli.py
rename to tmxutil/cli.py
diff --git a/src/tmxutil/filters/__init__.py b/tmxutil/filters/__init__.py
similarity index 100%
rename from src/tmxutil/filters/__init__.py
rename to tmxutil/filters/__init__.py
diff --git a/src/tmxutil/filters/deduplicate.py b/tmxutil/filters/deduplicate.py
similarity index 100%
rename from src/tmxutil/filters/deduplicate.py
rename to tmxutil/filters/deduplicate.py
diff --git a/src/tmxutil/filters/ipc.py b/tmxutil/filters/ipc.py
similarity index 100%
rename from src/tmxutil/filters/ipc.py
rename to tmxutil/filters/ipc.py
diff --git a/src/tmxutil/formats/__init__.py b/tmxutil/formats/__init__.py
similarity index 100%
rename from src/tmxutil/formats/__init__.py
rename to tmxutil/formats/__init__.py
diff --git a/src/tmxutil/formats/count.py b/tmxutil/formats/count.py
similarity index 100%
rename from src/tmxutil/formats/count.py
rename to tmxutil/formats/count.py
diff --git a/src/tmxutil/formats/json.py b/tmxutil/formats/json.py
similarity index 100%
rename from src/tmxutil/formats/json.py
rename to tmxutil/formats/json.py
diff --git a/src/tmxutil/formats/pickle.py b/tmxutil/formats/pickle.py
similarity index 100%
rename from src/tmxutil/formats/pickle.py
rename to tmxutil/formats/pickle.py
diff --git a/src/tmxutil/formats/tab.py b/tmxutil/formats/tab.py
similarity index 100%
rename from src/tmxutil/formats/tab.py
rename to tmxutil/formats/tab.py
diff --git a/src/tmxutil/formats/tmx.py b/tmxutil/formats/tmx.py
similarity index 100%
rename from src/tmxutil/formats/tmx.py
rename to tmxutil/formats/tmx.py
diff --git a/src/tmxutil/formats/txt.py b/tmxutil/formats/txt.py
similarity index 100%
rename from src/tmxutil/formats/txt.py
rename to tmxutil/formats/txt.py
diff --git a/src/tmxutil/interactive.py b/tmxutil/interactive.py
similarity index 100%
rename from src/tmxutil/interactive.py
rename to tmxutil/interactive.py
diff --git a/src/tmxutil/types.py b/tmxutil/types.py
similarity index 100%
rename from src/tmxutil/types.py
rename to tmxutil/types.py
diff --git a/src/tmxutil/utils.py b/tmxutil/utils.py
similarity index 100%
rename from src/tmxutil/utils.py
rename to tmxutil/utils.py

From da367f0bb85410b60b9cd19567ec884b58c76e47 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 6 Dec 2022 14:57:55 +0000
Subject: [PATCH 5/5] Add Github actions

---
 .github/release.yml | 26 ++++++++++++++++++++++
 .github/test.yml    | 30 ++++++++++++++++++++++++++
 .gitignore          |  1 +
 setup.py            |  2 +-
 4 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 .github/release.yml
 create mode 100644 .github/test.yml

diff --git a/.github/release.yml b/.github/release.yml
new file mode 100644
index 0000000..91100b4
--- /dev/null
+++ b/.github/release.yml
@@ -0,0 +1,26 @@
+name: Release
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.x"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel twine
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload --repository pypi dist/*
diff --git a/.github/test.yml b/.github/test.yml
new file mode 100644
index 0000000..7be9344
--- /dev/null
+++ b/.github/test.yml
@@ -0,0 +1,30 @@
+name: Test
+
+on:
+  push:
+    branches:
+      - main
+      - master
+  pull_request:
+    branches:
+      - main
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: [3.6, 3.7, 3.8, 3.9]
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Install Tox and any other packages
+        run: pip install tox
+      - name: Run Tox
+        # Run tox using the version of Python in `PATH`
+        run: tox -e py
diff --git a/.gitignore b/.gitignore
index 11041c7..fe2b882 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 *.egg-info
+/dist
diff --git a/setup.py b/setup.py
index fda981f..6c87fbf 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
     long_description = fh.read()
 
 setuptools.setup(
-    name="tmxutil-pkg-jelmervdl",
+    name="tmxutil",
     version="1.2",
     author="Jelmer van der Linde",
     author_email="jelmer@ikhoefgeen.nl",
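
---

Usage note (a minimal sketch, not part of any patch above): once this series is
applied and the package installed, the classes behind the `tmxutil` console
script can also be used as a library. The sketch assumes the top-level
`tmxutil` package layout from PATCH 4/5 (so `TMXReader`/`TMXWriter` live in
`tmxutil/formats/tmx.py`, per the rename list), and `corpus.tmx`/`copy.tmx`
are hypothetical file names:

    from datetime import datetime
    from tmxutil.formats.tmx import TMXReader, TMXWriter

    # Copy corpus.tmx to copy.tmx unit by unit. TMXReader is iterable via
    # the Reader interface, and TMXWriter emits the TMX header and footer
    # through its context manager.
    with open('corpus.tmx', 'rb') as fin, \
         open('copy.tmx', 'w', encoding='utf-8') as fout:
        with TMXWriter(fout, creation_date=datetime.now()) as writer:
            for unit in TMXReader(fin):
                writer.write(unit)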