diff --git a/README.md b/README.md index 1018305..438f3b7 100644 --- a/README.md +++ b/README.md @@ -53,10 +53,9 @@ Add [ruby/lib/](https://github.com/kaitai-io/kaitai_compress/tree/master/ruby/li | Algorithm | Process name | Arguments | Conforming | Test file extension | | - | - | - | - | - | -| [Brotli](https://en.wikipedia.org/wiki/Brotli) | `brotli` | None | [RFC 7932](https://datatracker.ietf.org/doc/html/rfc7932) | br | -| [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) | `lz4` | None | [LZ4 block specification](https://lz4.github.io/lz4/lz4_Block_format.md) | lz4 | -| [LZMA](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) | `lzma_raw` | None | Raw LZMA stream | lzma_raw | -| [LZMA](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) | `lzma_lzma` | None | Legacy .lzma file format (AKA alone) | lzma | -| [LZMA](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) | `lzma_xz` | None | .xz file format | xz | -| [DEFLATE](https://en.wikipedia.org/wiki/DEFLATE) (AKA zlib) | `zlib` | None | [RFC 1951](https://tools.ietf.org/html/rfc1951) | zlib | -| [zstd](https://zstd.net) (AKA zstandard) | `zstd` | None | [Spec & ref implementation](http://facebook.github.io/zstd/zstd_manual.html) | zst | +| [brotli](https://en.wikipedia.org/wiki/brotli) | `brotli` | compression level (`0`-`11`), mode (`generic`, `text`, `font`), log window size , log block size, dictionary | [RFC 7932](https://datatracker.ietf.org/doc/html/rfc7932) | `br` | +| [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) | `lz4` | block_size, if should link blocks, compression level (`0`-`16`), if should checksum frame, if should checksum each block | [LZ4 block specification](https://lz4.github.io/lz4/lz4_Block_format.md) | `lz4` | +| [LZMA](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) | `lzma` | algorithm version (`1, 2`), compression level (`0-9`, 
`-1` - don't compress with lzma, but use other filters specified), format (`auto`, `alone`, `raw`, `xz`), checksumming algorithm (`none`, `crc32`, `crc64`, `sha256`), modifiers (`e` for `--extreme`), dictionary size, literal context bit count, literal position bit count, position bit count, match finder (`hc3`, `hc4`, `bt2`, `bt3`, `bt4`), mode (`normal`, `fast`), additional_filters | Raw LZMA stream | `lzma` | +| [DEFLATE](https://en.wikipedia.org/wiki/DEFLATE) (AKA zlib) | `zlib` | container type (`raw`, `zlib`, `gzip`), log of window size (`9`-`15`), dictionary, compression level (`0`-`9`, `-1` for default), memory level (`0`-`9`), strategy (`filtered`, `huffman_only`), method (currently only `deflated` is supported) | [RFC 1951](https://tools.ietf.org/html/rfc1951) | `zlib`, `gz` | +| [zstd](https://zstd.net) (AKA zstandard) | `zstd` | format (`zstd1_magicless`, `zstd1`), log of (max) window size, dictionary, compression level (`1` - `22`, `-5` - `-1`), if should write checksum, if should write uncompressed size, if should write dict ID, strategy (`fast`, `dfast`, `greedy`, `lazy`, `lazy2`, `btlazy2`, `btopt`, `btultra`, `btultra2`), hash log size, match min size, chain log size, search log size, overlap log size, target length, if should use long distance matching, ldm hash log size, ldm match min size, ldm bucket size log, ldm hash rate log, job size, force max window | [Spec & ref implementation](http://facebook.github.io/zstd/zstd_manual.html) | `zst` | +| [bzip2](https://en.wikipedia.org/wiki/bzip2) | `bz2` | compression level (`1` - `9`) to add |[Official repo](https://gitlab.com/federicomenaquintero/bzip2)|`bz2`| diff --git a/_test/compressed/25k_uuids.lzham b/_test/compressed/25k_uuids.lzham new file mode 100644 index 0000000..0ca3a5f Binary files /dev/null and b/_test/compressed/25k_uuids.lzham differ diff --git a/_test/compressed/90_a.lzham b/_test/compressed/90_a.lzham new file mode 100644 index 0000000..ce82e40 Binary files /dev/null and 
b/_test/compressed/90_a.lzham differ diff --git a/_test/compressed/ascii_text.lzham b/_test/compressed/ascii_text.lzham new file mode 100644 index 0000000..806e159 Binary files /dev/null and b/_test/compressed/ascii_text.lzham differ diff --git a/_test/generate-data b/_test/generate-data index a4a6aa6..ceb601f 100755 --- a/_test/generate-data +++ b/_test/generate-data @@ -4,7 +4,7 @@ for I in uncompressed/*.dat; do BASE=$(basename "$I" | sed 's/\.dat$//') echo "$BASE.lz4" - lz4 -9 <$I >compressed/$BASE.lz4 + lz4 --best -BD <$I >compressed/$BASE.lz4 echo "$BASE.zlib" ruby -e 'require "zlib"; $stdout.write(Zlib::deflate($stdin.read))' <$I >compressed/$BASE.zlib @@ -19,10 +19,10 @@ for I in uncompressed/*.dat; do xz --format=raw <$I >compressed/$BASE.lzma_raw echo "$BASE.zst" - zstd <$I >compressed/$BASE.zst + zstd --ultra -22 -f -o compressed/$BASE.zst --format=zstd $I echo "$BASE.br" - brotli <$I -o compressed/$BASE.br + brotli -f -o compressed/$BASE.br $I echo "$BASE.raw.sz" python3 -c "import sys, snappy; from pathlib import Path; i = Path(sys.argv[1]); o = Path(sys.argv[2]); o.write_bytes(snappy.compress(i.read_bytes()));" $I compressed/$BASE.raw.sz @@ -32,4 +32,7 @@ for I in uncompressed/*.dat; do echo "$BASE.hadoop.sz" python3 -c "import sys, snappy; from pathlib import Path; i = Path(sys.argv[1]).open('rb'); o = Path(sys.argv[2]).open('wb'); snappy.hadoop_stream_compress(i, o); i.close(); o.close();" $I compressed/$BASE.hadoop.sz + + echo "$BASE.lzham" + lzhamtest -m4 -d29 -u -x -o -e -h0 c $I compressed/$BASE.lzham done diff --git a/_test/ksy/test_lzma_lzma.ksy b/_test/ksy/test_lzma_lzma.ksy index 871fbff..d74d667 100644 --- a/_test/ksy/test_lzma_lzma.ksy +++ b/_test/ksy/test_lzma_lzma.ksy @@ -3,4 +3,4 @@ meta: seq: - id: body size-eos: true - process: kaitai.compress.lzma_lzma + process: kaitai.compress.lzma(1, 9, "alone") diff --git a/_test/ksy/test_lzma_raw.ksy b/_test/ksy/test_lzma_raw.ksy index c8abfe4..f1adb4f 100644 --- a/_test/ksy/test_lzma_raw.ksy 
+++ b/_test/ksy/test_lzma_raw.ksy @@ -3,4 +3,4 @@ meta: seq: - id: body size-eos: true - process: kaitai.compress.lzma_raw + process: kaitai.compress.lzma(2, 9, "raw") diff --git a/_test/ksy/test_lzma_xz.ksy b/_test/ksy/test_lzma_xz.ksy index 39a5d78..9842f0c 100644 --- a/_test/ksy/test_lzma_xz.ksy +++ b/_test/ksy/test_lzma_xz.ksy @@ -3,4 +3,4 @@ meta: seq: - id: body size-eos: true - process: kaitai.compress.lzma_xz + process: kaitai.compress.lzma(2, 9, "xz") diff --git a/_test/ksy/test_snappy.ksy b/_test/ksy/test_snappy.ksy new file mode 100644 index 0000000..2c7006b --- /dev/null +++ b/_test/ksy/test_snappy.ksy @@ -0,0 +1,6 @@ +meta: + id: test_snappy +seq: + - id: body + size-eos: true + process: kaitai.compress.snappy diff --git a/_test/test-python.py b/_test/test-python.py index ccab760..e53cd8d 100644 --- a/_test/test-python.py +++ b/_test/test-python.py @@ -1,36 +1,49 @@ #!/usr/bin/env python3 -from glob import glob -from os.path import basename +from pathlib import Path import re +import unittest from test_lz4 import TestLz4 from test_lzma_lzma import TestLzmaLzma from test_lzma_raw import TestLzmaRaw from test_lzma_xz import TestLzmaXz from test_zlib import TestZlib +from test_snappy import TestSnappy from test_brotli import TestBrotli +from test_zstd import TestZstd -for uncompressed_fn in glob('uncompressed/*.dat'): - name = re.sub(r'.dat$', '', basename(uncompressed_fn)) - print(name) - - f = open(uncompressed_fn, 'rb') - uncompressed_data = f.read() - f.close() - - algs = [ - (TestLz4, 'lz4'), - (TestLzmaLzma, 'lzma'), -# (TestLzmaRaw, 'lzma_raw'), # requires filters= to be set - (TestLzmaXz, 'xz'), - (TestZlib, 'zlib'), - (TestBrotli, 'brotli'), - ] - - for alg in algs: - test_class = alg[0] - ext = alg[1] - - obj = test_class.from_file('compressed/%s.%s' % (name, ext)) - print(obj.body == uncompressed_data) +cwd = Path(".").absolute() +this_dir = Path(__file__).absolute().parent.relative_to(cwd) +compressed_dir = this_dir / "compressed" 
+uncompressed_dir = this_dir / "uncompressed" + + +class SimpleTests(unittest.TestCase): + def testCompressors(self): + for uncompressed_fn in uncompressed_dir.glob("*.dat"): + name = uncompressed_fn.stem + print(name) + + uncompressed_data = uncompressed_fn.read_bytes() + + algs = [ + (TestLz4, "lz4"), + (TestLzmaLzma, "lzma"), + # (TestLzmaRaw, 'lzma_raw'), # requires filters= to be set + (TestLzmaXz, "xz"), + (TestZlib, "zlib"), + (TestSnappy, "snappy"), + (TestBrotli, "br"), + (TestZstd, "zst"), + ] + + for test_class, ext in algs: + compressed_fn = compressed_dir / (name + "." + ext) + with self.subTest(test_class=test_class, file=compressed_fn): + obj = test_class.from_file(str(compressed_fn)) + self.assertEqual(obj.body, uncompressed_data) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 0000000..28d1f7a --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,6 @@ +__pycache__ +*.pyc +*.pyo +/build +/dist +/*.egg-info diff --git a/python/kaitai/compress/__init__.py b/python/kaitai/compress/__init__.py index f1b261c..4b795cd 100644 --- a/python/kaitai/compress/__init__.py +++ b/python/kaitai/compress/__init__.py @@ -1,6 +1,8 @@ -from .lz4 import Lz4 -from .zlib import Zlib -from .lzma_raw import LzmaRaw -from .lzma_lzma import LzmaLzma -from .lzma_xz import LzmaXz -from .brotli import Brotli +from .core import * +from .algorithms.zlib import Zlib +from .algorithms.lzma import Lzma +from .algorithms.lz4 import Lz4 +from .algorithms.brotli import Brotli +from .algorithms.zstd import Zstd +from .algorithms.bz2 import Bz2 +from .algorithms.snappy import Snappy diff --git a/python/kaitai/compress/algorithms/__init__.py b/python/kaitai/compress/algorithms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/kaitai/compress/algorithms/brotli.py b/python/kaitai/compress/algorithms/brotli.py new file mode 100644 index 0000000..0d28256 --- /dev/null +++ 
b/python/kaitai/compress/algorithms/brotli.py @@ -0,0 +1,43 @@ +import typing + +from ..core import KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + + +class Brotli(KaitaiCompressor): + __slots__ = ("compressorParams", "decompressorParams") + brotli = None + + def __init__(self, level: typing.Optional[int] = None, mode: typing.Optional[str] = "generic", log_window_size: typing.Optional[int] = None, log_block_size: typing.Optional[int] = None, dictionary: typing.Optional[bytes] = None) -> None: # pylint:disable=redefined-builtin,too-many-arguments,too-many-locals,unused-argument + super().__init__() + if self.__class__.brotli is None: + import brotli # pylint:disable=import-outside-toplevel + + self.__class__.brotli = brotli + self.compressorParams = {} + self.decompressorParams = {} + + if mode is not None: + if isinstance(mode, str): + mode = getattr(self.__class__.brotli, "MODE_" + mode.upper()) + self.compressorParams["mode"] = mode + + if level is not None: + self.compressorParams["quality"] = level + + if log_window_size is not None: + self.compressorParams["lgwin"] = log_window_size + + if log_block_size is not None: + self.compressorParams["lgblock"] = log_block_size + + if dictionary is not None: + self.decompressorParams["dictionary"] = self.compressorParams["dictionary"] = dictionary + + # new API + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.brotli.decompress(bytes(data), **self.decompressorParams)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.brotli.compress(data, **self.compressorParams)) diff --git a/python/kaitai/compress/algorithms/bz2.py b/python/kaitai/compress/algorithms/bz2.py new file mode 100644 index 0000000..6c86a09 --- /dev/null +++ b/python/kaitai/compress/algorithms/bz2.py @@ -0,0 +1,22 @@ +import bz2 +import typing + +from ..core import 
KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + + +class Bz2(KaitaiCompressor): + __slots__ = ("level",) + + def __init__(self, level: int = 9, *args, **kwargs) -> None: # pylint:disable=unused-argument + super().__init__() + self.level = level + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + decompressor = bz2.BZ2Decompressor() + return ProcessorContextStub(decompressor.decompress(data)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + compressor = bz2.BZ2Compressor(self.level) + return ProcessorContextStub(compressor.compress(data) + compressor.flush()) diff --git a/python/kaitai/compress/algorithms/implode.py b/python/kaitai/compress/algorithms/implode.py new file mode 100644 index 0000000..279ba8c --- /dev/null +++ b/python/kaitai/compress/algorithms/implode.py @@ -0,0 +1,22 @@ +import typing + +from ..core import KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + + +class Implode(KaitaiCompressor): + """PKWare implode format""" + + __slots__ = () + + def __init__(self, *args, **kwargs) -> None: # pylint:disable=unused-argument + super().__init__() + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + import pkblast + + return ProcessorContextStub(pkblast.decompressBytesWholeToBytes(data)[1]) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + raise NotImplementedError("pkimplode is needed, but not yet implemented") diff --git a/python/kaitai/compress/algorithms/lrzip.py b/python/kaitai/compress/algorithms/lrzip.py new file mode 100644 index 0000000..77f4933 --- /dev/null +++ b/python/kaitai/compress/algorithms/lrzip.py @@ -0,0 +1,37 @@ +import typing +from enum import IntEnum + +from ..core import KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + + +class LRZip(KaitaiCompressor): + __slots__ = ("algo",) + + lrzip = None + Algos = 
None + + @classmethod + def initLib(cls): + import lrzip + + self.__class__.lrzip = lrzip + prefix = "LRZIP_MODE_COMPRESS_" + self.__class__.Algos = IntEnum("A", sorted(((k[len(prefix) :].lower(), getattr(lrzip, k)) for k in dir(lrzip) if k[: len(prefix)] == prefix), key=lambda x: x[1])) + + def __init__(self, algo: typing.Union[int, str] = "none", *args, **kwargs) -> None: # pylint:disable=unused-argument + if self.__class__.lrzip is None: + self.__class__.initLib() + if isinstance(algo, str): + algo = self.__class__.Algos[algo.lower()] + else: + algo = self.__class__.Algos(algo) + self.algo = algo + super().__init__() + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.lrzip.decompress(data)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.lrzip.compress(data, compressMode=self.algo)) diff --git a/python/kaitai/compress/algorithms/lz4.py b/python/kaitai/compress/algorithms/lz4.py new file mode 100644 index 0000000..b789ab9 --- /dev/null +++ b/python/kaitai/compress/algorithms/lz4.py @@ -0,0 +1,42 @@ +import typing + +from ..core import KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + + +class Lz4(KaitaiCompressor): + __slots__ = ("compressorParams",) + lz4Frame = None + + def __init__(self, block_size: typing.Optional[int] = None, should_link_blocks: bool = True, compression_level: typing.Optional[int] = None, frame_checksum: bool = False, block_checksum: bool = False, *args, **kwargs) -> None: # pylint:disable=unused-argument,too-many-arguments + super().__init__() + if self.__class__.lz4Frame is None: + import lz4.frame # pylint:disable=import-outside-toplevel + + self.__class__.lz4Frame = lz4.frame + + if compression_level is None: + compression_level = self.__class__.lz4Frame.COMPRESSIONLEVEL_MAX + if block_size is None: + block_size = 
self.__class__.lz4Frame.BLOCKSIZE_MAX4MB + self.compressorParams = { + "block_size": block_size, + "block_linked": should_link_blocks, + "compression_level": compression_level, + "content_checksum": frame_checksum, + "block_checksum": block_checksum, + "return_bytearray": False + } + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + obj = self.__class__.lz4Frame.LZ4FrameDecompressor(return_bytearray=False) + return ProcessorContextStub(obj.decompress(data)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + obj = self.__class__.lz4Frame.LZ4FrameCompressor(**self.compressorParams) + return ProcessorContextStub(obj.begin(len(data)) + obj.compress(data) + obj.flush()) + + def extract_args(self, data: typing.Union[bytes, bytearray]): + res = self.__class__.lz4Frame.get_frame_info(data) + return (res["block_size"], res["linker"], res["compression_level"], res["content_checksum"], res["block_checksum"]) diff --git a/python/kaitai/compress/algorithms/lzham.py b/python/kaitai/compress/algorithms/lzham.py new file mode 100644 index 0000000..1c59931 --- /dev/null +++ b/python/kaitai/compress/algorithms/lzham.py @@ -0,0 +1,50 @@ +import typing + +from ..core import KaitaiCompressor, ProcessorContextStub + +raise NotImplementedError("The python bindings for lzham and lzham itself has bad security and design issues. 
It must be fixed first.") + +"""LZHAM +Must Read: https://github.com/richgel999/lzham_codec + +uncompressed: 1 +table_update_rate: # at default settings + 3: 0.0103 + 8: 0.0105 # default + 20: 0.0106 +level: # "table_update_rate":20, "dict_size_log2": 26 + 1: 0.0108 + 4: 0.0104 +highest: +lzma: 0.008 # at highest settings +""" +# pylint:disable=arguments-differ + + +class LZHAM(KaitaiCompressor): + __slots__ = ("decompressor", "compressor", "dictTrainerParams",) + lzham = None + + def __init__(self, level: int = 1, dict_size_log2: int = 26, table_update_rate: int = 20, max_helper_threads: int = 0, check_adler32: bool = False, table_max_update_interval: int = 0, table_update_interval_slow_rate: int = 0, *args, **kwargs) -> None: # pylint:disable=redefined-builtin,too-many-arguments,too-many-locals,unused-argument,too-many-branches,too-many-statements + super().__init__() + if self.__class__.lzham is None: + import lzham # pylint:disable=import-outside-toplevel + + self.__class__.lzham = lzham + + commonFilters = {"table_update_rate": table_update_rate, "dict_size_log2": dict_size_log2, "table_max_update_interval": table_max_update_interval, "table_update_interval_slow_rate": table_update_interval_slow_rate} + + compFilters = {"level": level, "max_helper_threads": max_helper_threads} + compFilters.update(commonFilters) + + decompFilters = {"compute_adler32_during_decomp": check_adler32, "unbuffered_decompression": True} + decompFilters.update(commonFilters) + + self.compressor = lzham.LZHAMCompressor(compFilters) + self.decompressor = lzham.LZHAMDecompressor(decompFilters) + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.decompressor.decompress(data)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.compressor.compress(data)) diff --git a/python/kaitai/compress/algorithms/lzjb.py b/python/kaitai/compress/algorithms/lzjb.py 
new file mode 100644 index 0000000..587fa8d --- /dev/null +++ b/python/kaitai/compress/algorithms/lzjb.py @@ -0,0 +1,24 @@ +import typing + +from ..core import KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + + +class LZJB(KaitaiCompressor): + __slots__ = () + + lzjb = None + + def __init__(self, *args, **kwargs) -> None: # pylint:disable=unused-argument + if self.__class__.lzjb is None: + import lzjb + + self.__class__.lzjb = lzjb + super().__init__() + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.lzjb.decompress(data)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.lzjb.compress(data)) diff --git a/python/kaitai/compress/algorithms/lzma.py b/python/kaitai/compress/algorithms/lzma.py new file mode 100644 index 0000000..1352fb8 --- /dev/null +++ b/python/kaitai/compress/algorithms/lzma.py @@ -0,0 +1,88 @@ +import typing + +try: + import lzma +except ImportError: + import processports.lzma as lzma + +from ..core import KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + +modifiersMapping = {"e": lzma.PRESET_EXTREME} + + +class Lzma(KaitaiCompressor): + __slots__ = ("decompressor_params", "compressor_params") + + def __init__(self, algo: int = 2, level: int = 9, format: typing.Optional[typing.Union[str, int]] = lzma.FORMAT_AUTO, check: typing.Optional[typing.Union[str, int]] = -1, modifiers: str = "e", dict_size: typing.Optional[int] = None, literal_context_bits: typing.Optional[int] = None, literal_position_bits: typing.Optional[int] = None, position_bits: typing.Optional[int] = None, match_finder: typing.Optional[str] = "bt4", mode: typing.Optional[str] = "normal", additional_filters: typing.Iterable[typing.Dict[str, typing.Any]] = (), *args, **kwargs) -> None: # 
pylint:disable=redefined-builtin,too-many-arguments,too-many-locals,unused-argument,too-many-branches + super().__init__() + if isinstance(format, str): + format = getattr(lzma, "FORMAT_" + format.upper()) + + if isinstance(check, str): + check = getattr(lzma, "CHECK_" + check.upper()) + + filters = list(additional_filters) + if algo > -1: + if isinstance(modifiers, str): + modifiersNum = 0 + for m in modifiers: + modifiersNum |= modifiersMapping[m] + modifiers = modifiersNum + del modifiersNum + + lzmaFilter = { + "id": "lzma" + str(algo), + "preset": level | modifiers, + } + + if dict_size is not None: + lzmaFilter["dict"] = (dict_size,) + if literal_context_bits is not None: + lzmaFilter["lc"] = literal_context_bits + if literal_position_bits is not None: + lzmaFilter["lp"] = literal_position_bits + if position_bits is not None: + lzmaFilter["pb"] = position_bits + if match_finder is not None: + if isinstance(match_finder, str): + match_finder = getattr(lzma, "MF_" + match_finder.upper()) + lzmaFilter["mf"] = match_finder + if mode is not None: + if isinstance(mode, str): + mode = getattr(lzma, "MODE_" + mode.upper()) + lzmaFilter["mode"] = mode + filters.append(lzmaFilter) + + for f in filters: + if isinstance(f["id"], str): + f["id"] = getattr(lzma, "FILTER_" + f["id"].upper()) + + compressorParams = { + "format": format, + "check": check, + "preset": None, # set in filters + "filters": filters, + } + decompressorParams = { + "format": format, + "memlimit": None, + } + + if format is lzma.FORMAT_RAW: + decompressorParams["filters"] = filters + + self.decompressor_params = decompressorParams + + if "format" not in compressorParams or compressorParams["format"] == lzma.FORMAT_AUTO: + compressorParams["format"] = lzma.FORMAT_XZ # TODO: detect from stream + self.compressor_params = compressorParams + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + decompressor = lzma.LZMADecompressor(**self.decompressor_params) + return 
ProcessorContextStub(decompressor.decompress(data)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + compressor = lzma.LZMACompressor(**self.compressor_params) + return ProcessorContextStub(compressor.compress(data) + compressor.flush()) diff --git a/python/kaitai/compress/algorithms/lzss.py b/python/kaitai/compress/algorithms/lzss.py new file mode 100644 index 0000000..96b87ff --- /dev/null +++ b/python/kaitai/compress/algorithms/lzss.py @@ -0,0 +1,24 @@ +import typing + +from ..core import KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + + +class LZSS(KaitaiCompressor): + __slots__ = () + + lzss = None + + def __init__(self, *args, **kwargs) -> None: # pylint:disable=unused-argument + if self.__class__.lzss is None: + import lzss + + self.__class__.lzss = lzss + super().__init__() + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.lzss.decompress(data)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.lzss.compress(data)) diff --git a/python/kaitai/compress/algorithms/snappy.py b/python/kaitai/compress/algorithms/snappy.py new file mode 100644 index 0000000..0718548 --- /dev/null +++ b/python/kaitai/compress/algorithms/snappy.py @@ -0,0 +1,24 @@ +import typing + +from ..core import KaitaiCompressor, ProcessorContextStub + +# pylint:disable=arguments-differ + + +class Snappy(KaitaiCompressor): + __slots__ = () + snappy = None + + def __init__(self) -> None: + super().__init__() + if self.__class__.snappy is None: + import snappy # pylint:disable=import-outside-toplevel + + self.__class__.snappy = snappy + + # new API + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.__class__.snappy.decompress(bytes(data))) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> 
ProcessorContextStub: + return ProcessorContextStub(self.__class__.snappy.compress(data)) diff --git a/python/kaitai/compress/algorithms/zlib.py b/python/kaitai/compress/algorithms/zlib.py new file mode 100644 index 0000000..119322c --- /dev/null +++ b/python/kaitai/compress/algorithms/zlib.py @@ -0,0 +1,95 @@ +import typing +import zlib +from enum import IntEnum + +from ..core import KaitaiCompressor, ProcessorContextStub + + +class Container(IntEnum): + raw = -1 + zlib = 1 + gzip = 16 + + +containerWLenTransformers = { + Container.raw: lambda x: -x, + Container.zlib: lambda x: x, + Container.gzip: lambda x: Container.gzip.value + x +} + +# pylint:disable=arguments-differ + + +class VanillaZlib(KaitaiCompressor): + __slots__ = ("compressorParams", "decompressorParams", "dO", "cO") + + def __init__(self, containerType: Container = Container.zlib, log_window_size: int = 15, zdict: typing.Optional[bytes] = None, level: int = -1, mem_level: typing.Union[str, int] = "DEF_MEM_LEVEL", strategy: typing.Union[str, int] = "DEFAULT_STRATEGY", method: typing.Optional[typing.Union[str, int]] = "deflated", *args, **kwargs) -> None: # pylint:disable=too-many-arguments,unused-argument + super().__init__() + # containerType = Container(containerType) + self.compressorParams = {} + self.decompressorParams = {} + if method is not None: + if isinstance(method, str): + method = getattr(zlib, method.upper()) + self.compressorParams["method"] = method + + if mem_level is not None: + if isinstance(mem_level, str): + mem_level = getattr(zlib, mem_level) + self.compressorParams["memLevel"] = mem_level + + if strategy is not None: + if isinstance(strategy, str): + strategy = getattr(zlib, "Z_" + strategy.upper()) + self.compressorParams["strategy"] = strategy + + self.compressorParams["level"] = level + self.decompressorParams["wbits"] = self.compressorParams["wbits"] = containerWLenTransformers[containerType](log_window_size) + + if zdict is not None: + self.decompressorParams["zdict"] = 
self.compressorParams["zdict"] = zdict + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + dO = zlib.decompressobj(**self.decompressorParams) + return ProcessorContextStub(dO.decompress(data) + dO.flush()) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + cO = zlib.compressobj(**self.compressorParams) + return ProcessorContextStub(cO.compress(data) + cO.flush()) + + +Zlib = VanillaZlib + +try: + import zopfli + + containerZopfliRemap = {Container.raw: zopfli.ZOPFLI_FORMAT_DEFLATE, Container.zlib: zopfli.ZOPFLI_FORMAT_ZLIB, Container.gzip: zopfli.ZOPFLI_FORMAT_GZIP} + + class ZopfliZlib(VanillaZlib): + __slots__ = ("zopfliCompressorParams",) + + def __init__(self, containerType: Container = Container.zlib, log_window_size: int = 15, zdict: typing.Optional[bytes] = None, level: int = -1, mem_level: typing.Union[str, int] = "DEF_MEM_LEVEL", strategy: typing.Union[str, int] = "DEFAULT_STRATEGY", method: typing.Optional[typing.Union[str, int]] = "deflated", *args, **kwargs) -> None: # pylint:disable=too-many-arguments,unused-argument + super().__init__(containerType, log_window_size, zdict, level, mem_level, strategy, method, *args, **kwargs) + + self.zopfliCompressorParams = { + "verbose": False, + "iterations": 15, + "block_splitting": True, + "block_splitting_max": 15, + "format": containerZopfliRemap[containerType] + } + + def compressZopfli(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + cO = zopfli.ZopfliCompressor(**self.zopfliCompressorParams) + return ProcessorContextStub(cO.compress(data) + cO.flush()) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + if self.compressorParams.get("zdict", None) is not None and self.compressorParams.get("wbits", None) == 15: + return self.compressZopfli(data) + else: + return super().unprocess(data) + + Zlib = ZopfliZlib + +except ImportError: + pass diff --git 
a/python/kaitai/compress/algorithms/zstd.py b/python/kaitai/compress/algorithms/zstd.py new file mode 100644 index 0000000..6d26ff6 --- /dev/null +++ b/python/kaitai/compress/algorithms/zstd.py @@ -0,0 +1,125 @@ +import typing + +from ..core import KaitaiCompressor, ProcessorContextStub + +default_dict_size = 110 << 10 # 110 KiB + + +def splitDataIntoPredefinedCountOfChunks(data: typing.Iterator[typing.Any], count: int = 7) -> typing.Iterable[bytes]: + data = list(data) + while len(data) < count: + spl = max(range(len(data)), key=lambda i: len(data[i])) + d = data[spl] + sl = len(d) // 2 + d1 = d[:sl] + d2 = d[sl:] + data[spl] = d1 + data.insert(spl + 1, d2) + return data + + +# pylint:disable=arguments-differ + + +class Zstd(KaitaiCompressor): + __slots__ = ("decompressor", "compressor", "dictTrainerParams",) + zstd = None + + def __init__(self, format: typing.Union[int, str] = "zstd1", log_window_size: typing.Optional[int] = None, dictionary: typing.Optional[typing.Union[bytes, int]] = None, level: int = 0, should_write_checksum: bool = True, should_write_uncompressed_size: bool = True, should_write_dict_id: bool = True, strategy: typing.Optional[typing.Union[int, str]] = None, hash_log_size: typing.Optional[int] = None, match_min_size: typing.Optional[int] = None, chain_log_size: typing.Optional[int] = None, search_log_size: typing.Optional[int] = None, overlap_log_size: typing.Optional[int] = None, target_length: typing.Optional[int] = None, ldm: typing.Optional[bool] = None, ldm_hash_log_size: typing.Optional[int] = None, ldm_match_min_size: typing.Optional[int] = None, ldm_bucket_size_log: typing.Optional[int] = None, ldm_hash_rate_log: typing.Optional[int] = None, job_size: typing.Optional[int] = None, force_max_window: typing.Optional[int] = None) -> None: # pylint:disable=redefined-builtin,too-many-arguments,too-many-locals,unused-argument,too-many-branches,too-many-statements + super().__init__() + if self.__class__.zstd is None: + import zstandard as 
zstd # pylint:disable=import-outside-toplevel + + self.__class__.zstd = zstd + if isinstance(format, str): + format = getattr(self.__class__.zstd, "FORMAT_" + format.upper()) + + if level == 0: + level = self.__class__.zstd.MAX_COMPRESSION_LEVEL + + decompressorParams = {"format": format} + compressorParamsDict = {"threads": -1, "format": format} + compressorParams = {} + + if dictionary: + if isinstance(dictionary, int) and dictionary != 0: + raise ValueError("Only 0 is used to indicate there is no data") + dic = self.__class__.zstd.ZstdCompressionDict(dictionary, dict_type=self.__class__.zstd.DICT_TYPE_RAWCONTENT) + dic.precompute_compress(level=level) + decompressorParams["dict_data"] = compressorParams["dict_data"] = dic + + if log_window_size is not None: + decompressorParams["max_window_size"] = 2 ** log_window_size + compressorParamsDict["window_log"] = log_window_size + + self.decompressor = self.__class__.zstd.ZstdDecompressor(**decompressorParams) + + compressorParamsDict["write_checksum"] = should_write_checksum + compressorParamsDict["write_content_size"] = should_write_uncompressed_size + compressorParamsDict["write_dict_id"] = should_write_dict_id + + if strategy is not None: + if isinstance(strategy, str): + strategy = getattr(self.__class__.zstd, "STRATEGY_" + strategy.upper()) + compressorParamsDict["strategy"] = strategy + + if hash_log_size is not None: + compressorParamsDict["hash_log"] = hash_log_size + if match_min_size is not None: + compressorParamsDict["min_match"] = match_min_size + + if chain_log_size is not None: + compressorParamsDict["chain_log"] = chain_log_size + if search_log_size is not None: + compressorParamsDict["search_log"] = search_log_size + if overlap_log_size is not None: + compressorParamsDict["overlap_log"] = overlap_log_size + if target_length is not None: + compressorParamsDict["target_length"] = target_length + if ldm is not None: + compressorParamsDict["enable_ldm"] = ldm + if ldm: + if ldm_hash_log_size is not None: 
+ compressorParamsDict["ldm_hash_log"] = ldm_hash_log_size + if ldm_match_min_size is not None: + compressorParamsDict["ldm_min_match"] = ldm_match_min_size + if ldm_bucket_size_log is not None: + compressorParamsDict["ldm_bucket_size_log"] = ldm_bucket_size_log + if ldm_hash_rate_log is not None: + compressorParamsDict["ldm_hash_rate_log"] = ldm_hash_rate_log + if job_size is not None: + compressorParamsDict["job_size"] = job_size + if force_max_window is not None: + compressorParamsDict["force_max_window"] = force_max_window + + compressorParams["compression_params"] = self.__class__.zstd.ZstdCompressionParameters.from_level(level, **compressorParamsDict) + self.compressor = self.zstd.ZstdCompressor(**compressorParams) + self.dictTrainerParams = { + "threads": 2, + #"threads": compressorParamsDict["threads"], + #"threads": 0, + #"level": level, + "level": 1, + #"dict_size": None, # int + #"k": None, # segment_size, int + #"d": None, # dmer_size,int + "dict_id": 0, # int + #"steps": None, # int + "notifications": 0, + #"notifications": 4, + } + + def process(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.decompressor.decompress(data)) + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> ProcessorContextStub: + return ProcessorContextStub(self.compressor.compress(data)) + + def compute_optimal_dict(self, data: typing.Iterable[typing.Union[bytes, bytearray]], dict_size: typing.Optional[int] = None, *args, **kwargs): + data = splitDataIntoPredefinedCountOfChunks(data, 7) + if dict_size is None: + dict_size = default_dict_size + if dict_size < 256: + dict_size = 256 + dict_data = self.__class__.zstd.train_dictionary(dict_size, samples=data, **self.dictTrainerParams) + return dict_data.as_bytes() diff --git a/python/kaitai/compress/core/__init__.py b/python/kaitai/compress/core/__init__.py new file mode 100644 index 0000000..a171cb4 --- /dev/null +++ b/python/kaitai/compress/core/__init__.py @@ -0,0 
+1,51 @@ +import typing +from abc import ABC, abstractmethod + +# pylint:disable=unused-argument + + +class KaitaiProcessorContext(ABC): + __slots__ = () + + @abstractmethod + def __call__(self, slce: slice, *args, **kwargs) -> bytes: + raise NotImplementedError("Please implement process") + + +class ProcessorContextStub(KaitaiProcessorContext): + """A dummy implementation for non-seekable streams. Just decompresses all the data and saves it.""" + + __slots__ = ("data",) + + def __init__(self, data: typing.Union[bytes, bytearray], *args, **kwargs) -> None: + self.data = data + + def __call__(self, slc: slice, *args, **kwargs) -> bytes: + return self.data[slc] + + +class KaitaiProcessor(ABC): + """The base processor class""" + + __slots__ = () + + def decode(self, data: typing.Union[bytes, bytearray], *args, **kwargs) -> bytes: + """The method implementing compatibility to legacy API. Gonna be removed somewhen.""" + return self.process(data, *args, **kwargs)(slice(None, None, None)) + + @abstractmethod + def process(self, data: typing.Union[bytes, bytearray]) -> KaitaiProcessorContext: + raise NotImplementedError("Please implement process") + + def unprocess(self, data: typing.Union[bytes, bytearray]) -> KaitaiProcessorContext: + raise NotImplementedError(self.__class__.__name__ + " processing is not invertible") + + def extract_args(self, data: typing.Union[bytes, bytearray], *args, **kwargs) -> tuple: + raise NotImplementedError("Cannot get args of " + self.__class__.__name__) + + +class KaitaiCompressor(KaitaiProcessor): + __slots__ = () + + def compute_optimal_dict(self, data: typing.Iterable[typing.Union[bytes, bytearray]], *args, **kwargs): + raise NotImplementedError(self.__class__.__name__ + " has no function for creation of custom dictionaries") diff --git a/python/kaitai/compress/lz4.py b/python/kaitai/compress/lz4.py deleted file mode 100644 index 9d4e30b..0000000 --- a/python/kaitai/compress/lz4.py +++ /dev/null @@ -1,5 +0,0 @@ -import lz4.frame - -class 
Lz4: - def decode(self, data): - return lz4.frame.decompress(data) diff --git a/python/kaitai/compress/lzma_lzma.py b/python/kaitai/compress/lzma_lzma.py deleted file mode 100644 index 2214449..0000000 --- a/python/kaitai/compress/lzma_lzma.py +++ /dev/null @@ -1,8 +0,0 @@ -import lzma - -class LzmaLzma: - def __init__(self): - self.decompressor = lzma.LZMADecompressor(format=lzma.FORMAT_ALONE) - - def decode(self, data): - return self.decompressor.decompress(data) diff --git a/python/kaitai/compress/lzma_raw.py b/python/kaitai/compress/lzma_raw.py deleted file mode 100644 index e83ede0..0000000 --- a/python/kaitai/compress/lzma_raw.py +++ /dev/null @@ -1,8 +0,0 @@ -import lzma - -class LzmaRaw: - def __init__(self): - self.decompressor = lzma.LZMADecompressor(format=lzma.FORMAT_RAW) - - def decode(self, data): - return self.decompressor.decompress(data) diff --git a/python/kaitai/compress/lzma_xz.py b/python/kaitai/compress/lzma_xz.py deleted file mode 100644 index 624a430..0000000 --- a/python/kaitai/compress/lzma_xz.py +++ /dev/null @@ -1,8 +0,0 @@ -import lzma - -class LzmaXz: - def __init__(self): - self.decompressor = lzma.LZMADecompressor(format=lzma.FORMAT_XZ) - - def decode(self, data): - return self.decompressor.decompress(data) diff --git a/python/kaitai/compress/zlib.py b/python/kaitai/compress/zlib.py deleted file mode 100644 index e90197a..0000000 --- a/python/kaitai/compress/zlib.py +++ /dev/null @@ -1,5 +0,0 @@ -import zlib - -class Zlib: - def decode(self, data): - return zlib.decompress(data) diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000..16ca60c --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,5 @@ +[build-system] +requires = ["setuptools>=44", "wheel", "setuptools_scm[toml]>=3.4.3"] +build-backend = "setuptools.build_meta" + +#[tool.setuptools_scm] diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 0000000..e1ee6cb --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,33 @@ 
+[metadata] +name = kaitai.compress +version = 0.1 +# we need to put it into a separate git repo in order to use setuptools_scm +author = Kaitai Project +url = https://github.com/kaitai-io/kaitai_compress +description = A library of wrappers of compressors +long_description = file: README.md +keywords = compression, archive +license = Unlicense +classifiers = + Programming Language :: Python + Programming Language :: Python :: 3 + Development Status :: 4 - Beta + Environment :: Other Environment + Intended Audience :: Developers + License :: Public Domain + License :: OSI Approved :: The Unlicense (Unlicense) + Operating System :: OS Independent + Topic :: Software Development :: Libraries :: Python Modules + Topic :: Scientific/Engineering + +[options] +python_requires = >=3.5 +zip_safe = True +packages = kaitai.compress +setup_requires = setuptools +test_suite = tests.tests + +[options.extras_require] +lz4 = lz4 @ git+https://github.com/python-lz4/python-lz4 +zstd = zstandard @ git+https://github.com/indygreg/python-zstandard +brotli = brotlipy @ git+https://github.com/python-hyper/brotlipy.git