diff --git a/README.md b/README.md index d2e87a5..3f10364 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,9 @@ Add [ruby/lib/](https://github.com/kaitai-io/kaitai_compress/tree/master/ruby/li | Algorithm | Process name | Arguments | Conforming | Test file extension | | - | - | - | - | - | -| [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) | `lz4` | None | [LZ4 block specification](https://lz4.github.io/lz4/lz4_Block_format.md) | lz4 | -| [LZMA](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) | `lzma_raw` | None | Raw LZMA stream | lzma_raw | -| [LZMA](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) | `lzma_lzma` | None | Legacy .lzma file format (AKA alone) | lzma | -| [LZMA](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) | `lzma_xz` | None | .xz file format | xz | -| [DEFLATE](https://en.wikipedia.org/wiki/DEFLATE) (AKA zlib) | `zlib` | None | [RFC 1951](https://tools.ietf.org/html/rfc1951) | zlib | -| [zstd](https://zstd.net) (AKA zstandard) | `zstd` | None | [Spec & ref implementation](http://facebook.github.io/zstd/zstd_manual.html) | zst | +| [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) | `lz4` | block_size, if should link blocks, compression level (`0`-`16`), if should checksum frame, if should checksum each block | [LZ4 block specification](https://lz4.github.io/lz4/lz4_Block_format.md) | `lz4` | +| [LZMA](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) | `lzma` | algorithm version (`1, 2`), compression level (`0-9`, `-1` - don't compress with lzma, but use other filters specified), format (`auto`, `alone`, `raw`, `xz`), checksumming algorithm (`none`, `crc32`, `crc64`, `sha256`), modifiers (`e` for `--extreme`), dictionary size, literal context bit count, literal position bit count, position bit count, match finder (`hc3`, `hc4`, `bt2`, `bt3`, `bt4`), mode (`normal`, `fast`), 
additional_filters | Raw LZMA stream | `lzma` | +| [DEFLATE](https://en.wikipedia.org/wiki/DEFLATE) (AKA zlib) | `zlib` | container type (`raw`, `zlib`, `gzip`), log of window size (`9`-`15`), dictionary, compression level (`0`-`9`, `-1` for default), memory level (`0`-`9`), strategy (`filtered`, `huffman_only`), method (currently only `deflated` is supported) | [RFC 1951](https://tools.ietf.org/html/rfc1951) | `zlib`, `gz` | +| [zstd](https://zstd.net) (AKA zstandard) | `zstd` | format (`zstd1_magicless`, `zstd1`), log of (max) window size, dictionary, compression level (`1` - `22`, `-5` - `-1`), if should write checksum, if should write uncompressed size, if should write dict ID, strategy (`fast`, `dfast`, `greedy`, `lazy`, `lazy2`, `btlazy2`, `btopt`, `btultra`, `btultra2`), hash log size, match min size, chain log size, search log size, overlap log size, target length, if should use long distance matching, ldm hash log size, ldm match min size, ldm bucket size log, ldm hash rate log, job size, force max window | [Spec & ref implementation](http://facebook.github.io/zstd/zstd_manual.html) | `zst` | +| [bzip2](https://en.wikipedia.org/wiki/bzip2) | `bz2` | compression level (`1` - `9`) to add |[Official repo](https://gitlab.com/federicomenaquintero/bzip2)|`bz2`| +| [brotli](https://en.wikipedia.org/wiki/brotli) | `brotli` | compression level (`0`-`11`), mode (`generic`, `text`, `font`), log window size , log block size, dictionary | [Official repo](https://github.com/google/brotli) | `br` | diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 0000000..e7be386 --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,6 @@ +__pycache__ +*.pyc +*.pyo +./build +./dist +./*.egg-info diff --git a/python/kaitai/compress/__init__.py b/python/kaitai/compress/__init__.py index 3256d26..661986f 100644 --- a/python/kaitai/compress/__init__.py +++ b/python/kaitai/compress/__init__.py @@ -1,5 +1,7 @@ -from .lz4 import Lz4 -from .zlib import Zlib -from .lzma_raw 
import LzmaRaw
-from .lzma_lzma import LzmaLzma
-from .lzma_xz import LzmaXz
+from .core import *
+from .algorithms.zlib import Zlib
+from .algorithms.lzma import Lzma
+from .algorithms.lz4 import Lz4
+from .algorithms.brotli import Brotli
+from .algorithms.zstd import Zstd
+from .algorithms.bz2 import Bz2
diff --git a/python/kaitai/compress/algorithms/brotli.py b/python/kaitai/compress/algorithms/brotli.py
new file mode 100644
index 0000000..3007650
--- /dev/null
+++ b/python/kaitai/compress/algorithms/brotli.py
@@ -0,0 +1,39 @@
+import typing
+from ..core import KaitaiProcessor, ProcessorContextStub
+import brotli
+from enum import Enum
+from functools import partial
+
+class Brotli(KaitaiProcessor):
+	__slots__ = ("compressorParams", "decompressorParams")
+	def __init__(self, level:typing.Optional[int]=None, mode:typing.Optional[str]="generic", log_window_size:typing.Optional[int]=None, log_block_size:typing.Optional[int]=None, dictionary:typing.Optional[bytes]=None):
+		self.compressorParams = {}
+		self.decompressorParams = {}
+
+		if mode is not None:
+			if isinstance(mode, str):
+				mode = getattr(brotli, "MODE_" + mode.upper())
+			self.compressorParams["mode"] = mode
+
+		if level is not None:
+			self.compressorParams["quality"] = level
+
+		if log_window_size is not None:
+			self.compressorParams["lgwin"] = log_window_size
+
+		if log_block_size is not None:
+			self.compressorParams["lgblock"] = log_block_size
+
+		if dictionary is not None:
+			self.decompressorParams["dictionary"] = self.compressorParams["dictionary"] = dictionary
+
+	# new API
+	def process(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		return ProcessorContextStub(brotli.decompress(data, **self.decompressorParams))
+
+	def unprocess(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		return ProcessorContextStub(brotli.compress(data,
**self.compressorParams))
diff --git a/python/kaitai/compress/algorithms/bz2.py b/python/kaitai/compress/algorithms/bz2.py
new file mode 100644
index 0000000..be5ad9b
--- /dev/null
+++ b/python/kaitai/compress/algorithms/bz2.py
@@ -0,0 +1,15 @@
+import typing
+from ..core import KaitaiProcessor, ProcessorContextStub
+import bz2
+
+class Bz2(KaitaiProcessor):
+	__slots__ = ("decompressor", "compressor")
+	def __init__(self, level:int=9, *args, **kwargs):
+		self.decompressor = bz2.BZ2Decompressor()
+		self.compressor = bz2.BZ2Compressor(level)
+
+	def process(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		return ProcessorContextStub(self.decompressor.decompress(data))
+
+	def unprocess(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		return ProcessorContextStub(self.compressor.compress(data))
diff --git a/python/kaitai/compress/algorithms/lz4.py b/python/kaitai/compress/algorithms/lz4.py
new file mode 100644
index 0000000..cb2bdac
--- /dev/null
+++ b/python/kaitai/compress/algorithms/lz4.py
@@ -0,0 +1,19 @@
+import typing
+from ..core import KaitaiProcessor, ProcessorContextStub
+
+class Lz4(KaitaiProcessor):
+	__slots__ = ("obj", "dobj")
+	def __init__(self, block_size:int=0, linker:bool=True, compression_level:int=16, frame_checksum:bool=False, block_checksum:bool=False, *args, **kwargs):
+		import lz4.frame
+		self.obj = lz4.frame.LZ4FrameCompressor(block_size=block_size, block_linked=linker, compression_level=compression_level, content_checksum=frame_checksum, block_checksum=block_checksum, return_bytearray=False)
+		self.dobj = lz4.frame.LZ4FrameDecompressor(return_bytearray=False)
+
+	def process(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		return ProcessorContextStub(self.dobj.decompress(data))
+
+	def unprocess(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		return ProcessorContextStub(self.obj.compress(data))
+
+	def getArgs(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		import lz4.frame
+		res = lz4.frame.get_frame_info(data)
+		return (res["block_size"],
res["linker"], res["compression_level"], res["content_checksum"], res["block_checksum"])
diff --git a/python/kaitai/compress/algorithms/lzma.py b/python/kaitai/compress/algorithms/lzma.py
new file mode 100644
index 0000000..9f747ea
--- /dev/null
+++ b/python/kaitai/compress/algorithms/lzma.py
@@ -0,0 +1,81 @@
+import typing
+from ..core import KaitaiProcessor, ProcessorContextStub
+import lzma
+from enum import Enum
+from functools import wraps
+
+modifiersMapping = {
+	"e": lzma.PRESET_EXTREME
+}
+
+class Lzma(KaitaiProcessor):
+	__slots__ = ("decompressor", "compressor")
+	def __init__(self, algo:int=2, level:int=9, format:typing.Optional[typing.Union[str, int]]=lzma.FORMAT_AUTO, check:typing.Optional[typing.Union[str, int]]=-1, modifiers:str="e", dict_size:typing.Optional[int]=None, literal_context_bits:typing.Optional[int]=3, literal_position_bits:typing.Optional[int]=0, position_bits:typing.Optional[int]=2, match_finder:typing.Optional[str]="bt4", mode:typing.Optional[str]="normal", additional_filters:typing.Iterable[typing.Dict[str, typing.Any]]=(), *args, **kwargs):
+		if isinstance(format, str):
+			format = getattr(lzma, "FORMAT_" + format.upper())
+
+		if isinstance(check, str):
+			check = getattr(lzma, "CHECK_" + check.upper())
+
+
+		filters = list(additional_filters)
+		if algo > -1:
+			if isinstance(modifiers, str):
+				modifiersNum = 0
+				for m in modifiers:
+					modifiersNum |= modifiersMapping[m]
+				modifiers = modifiersNum
+				del modifiersNum
+
+			lzmaFilter={
+				"id": "lzma" + str(algo),
+				"preset": level | modifiers,
+			}
+
+			if dict_size is not None:
+				lzmaFilter["dict_size"] = dict_size
+			if literal_context_bits is not None:
+				lzmaFilter["lc"] = literal_context_bits
+			if literal_position_bits is not None:
+				lzmaFilter["lp"] = literal_position_bits
+			if position_bits is not None:
+				lzmaFilter["pb"] = position_bits
+			if match_finder is not None:
+				if isinstance(match_finder, str):
+					match_finder = getattr(lzma, "MF_"+match_finder.upper())
+				lzmaFilter["mf"] =
match_finder + if mode is not None: + if isinstance(mode, str): + mode = getattr(lzma, "MODE_"+mode.upper()) + lzmaFilter["mode"] = mode + filters.append(lzmaFilter) + + for f in filters: + if isinstance(f["id"], str): + f["id"] = getattr(lzma, "FILTER_"+f["id"].upper()) + + compressorParams = { + "format": format, + "check": check, + "preset": None, # set in filters + "filters": filters + } + decompressorParams = { + "format": format, + "memlimit": None, + } + + if format is lzma.FORMAT_RAW: + decompressorParams["filters"] = filters + + self.decompressor = lzma.LZMADecompressor(**decompressorParams) + + if "format" not in compressorParams or compressorParams["format"] == lzma.FORMAT_AUTO: + compressorParams["format"] = lzma.FORMAT_XZ # TODO: detect from stream + self.compressor = lzma.LZMACompressor(**compressorParams) + + def process(self, data:typing.Union[bytes, bytearray], *args, **kwargs): + return ProcessorContextStub(self.decompressor.decompress(data)) + + def unprocess(self, data:typing.Union[bytes, bytearray], *args, **kwargs): + return ProcessorContextStub(self.compressor.compress(data)) diff --git a/python/kaitai/compress/algorithms/zlib.py b/python/kaitai/compress/algorithms/zlib.py new file mode 100644 index 0000000..b6c08ee --- /dev/null +++ b/python/kaitai/compress/algorithms/zlib.py @@ -0,0 +1,52 @@ +import typing +import zlib +from enum import IntEnum +from ..core import KaitaiProcessor, ProcessorContextStub + +class Container(IntEnum): + raw = -1 + zlib = 1 + gzip = 16 + +containerWLenTransformers={ + Container.raw: lambda x: -x, + Container.zlib: lambda x: x, + Container.gzip: lambda x: Container.gzip.value + x +} + + +class Zlib(KaitaiProcessor): + __slots__ = ("compressorParams", "decompressorParams", "dO", "cO") + def __init__(self, containerType:Container=Container.zlib, log_window_size:int=15, zdict:typing.Optional[bytes]=None, level:int=-1, mem_level:typing.Union[str, int]="DEF_MEM_LEVEL", strategy:typing.Union[str, 
int]="DEFAULT_STRATEGY", method:typing.Optional[typing.Union[str, int]]="deflated", *args, **kwargs):
+		#containerType = Container(containerType)
+		self.compressorParams = {}
+		self.decompressorParams = {}
+		if method is not None:
+			if isinstance(method, str):
+				method = getattr(zlib, method.upper())
+			self.compressorParams["method"] = method
+
+		if mem_level is not None:
+			if isinstance(mem_level, str):
+				mem_level = getattr(zlib, mem_level)
+			self.compressorParams["memLevel"] = mem_level
+
+		if strategy is not None:
+			if isinstance(strategy, str):
+				strategy = getattr(zlib, "Z_" + strategy.upper())
+			self.compressorParams["strategy"] = strategy
+
+
+		self.compressorParams["level"] = level
+		self.decompressorParams["wbits"] = self.compressorParams["wbits"] = containerWLenTransformers[containerType](log_window_size)
+
+		if zdict is not None:
+			self.decompressorParams["zdict"] = self.compressorParams["zdict"] = zdict
+
+	def process(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		dO = zlib.decompressobj(**self.decompressorParams)
+		return ProcessorContextStub(dO.decompress(data) + dO.flush())
+
+	def unprocess(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		cO = zlib.compressobj(**self.compressorParams)
+		return ProcessorContextStub(cO.compress(data) + cO.flush())
diff --git a/python/kaitai/compress/algorithms/zstd.py b/python/kaitai/compress/algorithms/zstd.py
new file mode 100644
index 0000000..868e614
--- /dev/null
+++ b/python/kaitai/compress/algorithms/zstd.py
@@ -0,0 +1,74 @@
+import typing
+from ..core import KaitaiProcessor, ProcessorContextStub
+from enum import Enum
+from functools import partial
+
+class Zstd(KaitaiProcessor):
+	__slots__ = ("decompressor", "compressor")
+	zstd = None
+	def __init__(self, format:typing.Union[int, str]="zstd1_magicless", log_window_size:typing.Optional[int]=None, dictionary:typing.Optional[bytes]=None, level:int=22, should_write_checksum:bool=True, should_write_uncompressed_size:bool=True,
should_write_dict_id:bool=True, strategy:typing.Optional[typing.Union[int, str]]=None, hash_log_size:typing.Optional[int]=None, match_min_size:typing.Optional[int]=None, chain_log_size:typing.Optional[int]=None, search_log_size:typing.Optional[int]=None, overlap_log_size:typing.Optional[int]=None, target_length:typing.Optional[int]=None, ldm:typing.Optional[bool]=None, ldm_hash_log_size:typing.Optional[int]=None, ldm_match_min_size:typing.Optional[int]=None, ldm_bucket_size_log:typing.Optional[int]=None, ldm_hash_rate_log:typing.Optional[int]=None, job_size:typing.Optional[int]=None, force_max_window:typing.Optional[int]=None):
+		if self.__class__.zstd is None:
+			import zstd
+			self.__class__.zstd = zstd
+		if isinstance(format, str):
+			format = getattr(self.__class__.zstd, "FORMAT_" + format.upper())
+
+		decompressorParams = {"format":format}
+		compressorParamsDict = {"threads":-1, "format":format}
+		compressorParams = {}
+
+		if dictionary is not None:
+			decompressorParams["dict_data"] = compressorParams["dict_data"] = dictionary
+
+		if log_window_size is not None:
+			decompressorParams["max_window_size"] = 2**log_window_size
+			compressorParamsDict["window_log"] = log_window_size
+
+		self.__class__.decompressor = self.__class__.zstd.ZstdDecompressor(**decompressorParams)
+
+		compressorParamsDict["write_checksum"] = should_write_checksum
+		compressorParamsDict["write_content_size"] = should_write_uncompressed_size
+		compressorParamsDict["write_dict_id"] = should_write_dict_id
+
+		if strategy is not None:
+			if isinstance(strategy, str):
+				strategy = getattr(self.__class__.zstd, "STRATEGY_" + strategy.upper())
+			compressorParamsDict["strategy"] = strategy
+
+		if hash_log_size is not None:
+			compressorParamsDict["hash_log"] = hash_log_size
+		if match_min_size is not None:
+			compressorParamsDict["min_match"] = match_min_size
+
+		if chain_log_size is not None:
+			compressorParamsDict["chain_log"] = chain_log_size
+		if search_log_size is not
None:
+			compressorParamsDict["search_log"] = search_log_size
+		if overlap_log_size is not None:
+			compressorParamsDict["overlap_log"] = overlap_log_size
+		if target_length is not None:
+			compressorParamsDict["target_length"] = target_length
+		if ldm is not None:
+			compressorParamsDict["enable_ldm"] = ldm
+			if ldm:
+				if ldm_hash_log_size is not None:
+					compressorParamsDict["ldm_hash_log"] = ldm_hash_log_size
+				if ldm_match_min_size is not None:
+					compressorParamsDict["ldm_min_match"] = ldm_match_min_size
+				if ldm_bucket_size_log is not None:
+					compressorParamsDict["ldm_bucket_size_log"] = ldm_bucket_size_log
+				if ldm_hash_rate_log is not None:
+					compressorParamsDict["ldm_hash_rate_log"] = ldm_hash_rate_log
+		if job_size is not None:
+			compressorParamsDict["job_size"] = job_size
+		if force_max_window is not None:
+			compressorParamsDict["force_max_window"] = force_max_window
+
+		compressorParams["compression_params"] = self.__class__.zstd.ZstdCompressionParameters.from_level(level, **compressorParamsDict)
+		self.__class__.compressor = self.__class__.zstd.ZstdCompressor(**compressorParams)
+
+	def process(self, data:typing.Union[bytes, bytearray]):
+		return ProcessorContextStub(self.__class__.decompressor.decompress(data))
+
+	def unprocess(self, data:typing.Union[bytes, bytearray]):
+		return ProcessorContextStub(self.__class__.compressor.compress(data))
diff --git a/python/kaitai/compress/core/__init__.py b/python/kaitai/compress/core/__init__.py
new file mode 100644
index 0000000..62ca9d4
--- /dev/null
+++ b/python/kaitai/compress/core/__init__.py
@@ -0,0 +1,32 @@
+import typing
+
+class KaitaiProcessorContext:
+	def __call__(self, slce:slice, *args, **kwargs) -> bytes:
+		raise NotImplementedError("Please implement process")
+
+class ProcessorContextStub(KaitaiProcessorContext):
+	"""A dummy implementation for non-seekable streams.
Just decompresses all the data and saves it."""
+	__slots__ = ("data",)
+	def __init__(self, data:typing.Union[bytes, bytearray], *args, **kwargs):
+		self.data = data
+
+	def __call__(self, slc:slice, *args, **kwargs) -> bytes:
+		return self.data[slc]
+
+class KaitaiProcessor:
+	"""The base processor class"""
+	def __init__(self, *args, **kwargs):
+		raise NotImplementedError("Please implement __init__")
+
+	def decode(self, data:typing.Union[bytes, bytearray], *args, **kwargs) -> bytes:
+		"""The method implementing compatibility to the legacy API. Will be removed eventually."""
+		return self.process(data, *args, **kwargs)(slice(None, None, None))
+
+	def process(self, data:typing.Union[bytes, bytearray], *args, **kwargs) -> KaitaiProcessorContext:
+		raise NotImplementedError("Please implement process")
+
+	def unprocess(self, data:typing.Union[bytes, bytearray], *args, **kwargs) -> KaitaiProcessorContext:
+		raise NotImplementedError(self.__class__.__name__ + " processing is not invertible")
+
+	def getArgs(self, data:typing.Union[bytes, bytearray], *args, **kwargs) -> tuple:
+		raise NotImplementedError("Cannot get args of " + self.__class__.__name__)
diff --git a/python/kaitai/compress/lz4.py b/python/kaitai/compress/lz4.py
deleted file mode 100644
index 9d4e30b..0000000
--- a/python/kaitai/compress/lz4.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import lz4.frame
-
-class Lz4:
-    def decode(self, data):
-        return lz4.frame.decompress(data)
diff --git a/python/kaitai/compress/lzma_lzma.py b/python/kaitai/compress/lzma_lzma.py
deleted file mode 100644
index 2214449..0000000
--- a/python/kaitai/compress/lzma_lzma.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import lzma
-
-class LzmaLzma:
-    def __init__(self):
-        self.decompressor = lzma.LZMADecompressor(format=lzma.FORMAT_ALONE)
-
-    def decode(self, data):
-        return self.decompressor.decompress(data)
diff --git a/python/kaitai/compress/lzma_raw.py b/python/kaitai/compress/lzma_raw.py
deleted file mode 100644
index
e83ede0..0000000 --- a/python/kaitai/compress/lzma_raw.py +++ /dev/null @@ -1,8 +0,0 @@ -import lzma - -class LzmaRaw: - def __init__(self): - self.decompressor = lzma.LZMADecompressor(format=lzma.FORMAT_RAW) - - def decode(self, data): - return self.decompressor.decompress(data) diff --git a/python/kaitai/compress/lzma_xz.py b/python/kaitai/compress/lzma_xz.py deleted file mode 100644 index 624a430..0000000 --- a/python/kaitai/compress/lzma_xz.py +++ /dev/null @@ -1,8 +0,0 @@ -import lzma - -class LzmaXz: - def __init__(self): - self.decompressor = lzma.LZMADecompressor(format=lzma.FORMAT_XZ) - - def decode(self, data): - return self.decompressor.decompress(data) diff --git a/python/kaitai/compress/zlib.py b/python/kaitai/compress/zlib.py deleted file mode 100644 index e90197a..0000000 --- a/python/kaitai/compress/zlib.py +++ /dev/null @@ -1,5 +0,0 @@ -import zlib - -class Zlib: - def decode(self, data): - return zlib.decompress(data) diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 0000000..e1ee6cb --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,33 @@ +[metadata] +name = kaitai.compress +version = 0.1 +# we need to put it into a separate git repo in order to use setuptools_scm +authors = Kaitai Project +url = https://github.com/kaitai-io/kaitai_compress +description = A library of wrappers of compressors +long_description = file: ReadMe.md +keywords = compression, archive +license = ? 
+classifiers = + Programming Language :: Python + Programming Language :: Python :: 3 + Development Status :: 4 - Beta + Environment :: Other Environment + Intended Audience :: Developers + License :: Public Domain + License :: Public Domain :: Unlicense + Operating System :: OS Independent + Topic :: Software Development :: Libraries :: Python Modules + Topic :: Science + +[options] +python_requires = >=3.4 +zip_safe = True +packages = kaitai.compress +setup_requires = setuptools; # setuptools_scm; +test_suite = tests.tests + +[options.extras_require] +lz4 = lz4 @ git+https://github.com/python-lz4/python-lz4 +zstd = zstd @ git+https://github.com/sergey-dryabzhinsky/python-zstd +brotli = brotlipy @ git+https://github.com/python-hyper/brotlipy.git diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000..26de544 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 +from setuptools import setup +setup()