diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index a0f502b..76941b3 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -30,6 +30,29 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} slug: TLCFEM/msglc plugins: pycoverage + # benchmark: + # name: Benchmark + # runs-on: ubuntu-latest + # timeout-minutes: 100 + # steps: + # - name: Clone + # uses: actions/checkout@v4 + # - name: Python + # uses: actions/setup-python@v5 + # with: + # python-version: 3.11 + # - name: Build + # run: pip install .[dev,msgspec,numpy] matplotlib h5py + # - name: Test + # run: | + # python3 h5/generate.py + # python3 h5/read.py + # tar czf benchmark.tar.gz ./h5/*.pdf + # - name: Upload + # uses: actions/upload-artifact@v4 + # with: + # name: msglc-benchmark + # path: benchmark.tar.gz wheels: name: Build runs-on: ubuntu-latest diff --git a/docs/.pages b/docs/.pages index f292765..600147a 100644 --- a/docs/.pages +++ b/docs/.pages @@ -2,4 +2,5 @@ nav: - msglc: index.md - Motivation: motivation.md - Examples: tutorial.md + - Benchmark: benchmark.md - API: api \ No newline at end of file diff --git a/docs/benchmark.md b/docs/benchmark.md new file mode 100644 index 0000000..6a47ee1 --- /dev/null +++ b/docs/benchmark.md @@ -0,0 +1,60 @@ +# Benchmark + +The embedded structure allows fast read without loading the whole archive, which is the main advantage of this package. +In the following, we benchmark the random read performance and compare with the `HDF5` format. + +## Data Generation + +A square matrix of size 5000 with random floating-point numbers is used. +The matrix is dumped onto the disk with different configurations. + +1. For `msglc`, `small_obj_optimization_threshold` varies from 4KB to 4MB, `numpy_encoder` is switched off so the matrix is stored as plain json instead binary blob. +2. For `h5py`, the chunk size is computed so that each block has a size similar to `small_obj_optimization_threshold`. 
Compression is optionally switched on. + +The following code snippets show the relevant functions. + +```py +def generate_msg(mat: np.ndarray, block: int): + configure(small_obj_optimization_threshold=2**block, numpy_encoder=False) # 16KB + dump(f"data-{block}.msg", mat) + +def generate_h5(mat: np.ndarray, block: int, **kwargs): + with h5py.File(h5_name(block, **kwargs), "w") as f: + if block > 0: + chunk_size = int(sqrt(2**block / 128)) + kwargs["chunks"] = (chunk_size, chunk_size) + f.create_dataset("data", data=mat, **kwargs) +``` + +The write time of `msglc` is in general constant, because the packer needs to traverse the whole json object. +Depending on different configurations, `h5py` requires different amounts of time to dump the matrix. + +![write time](./write_time.pdf) + +`msglc` shall be used for data that is written to disk for cold storage and does not require frequent changes. +When compression is on, `h5py` needs to traverse the object just like `msglc`, thus requires a similar amount of time. + +## Read Test + +We mainly test the random read. +To this end, we repeatedly read random locations in the matrix and measure the time required. 
import os
from math import sqrt

import h5py
import matplotlib.pyplot as plt
import numpy as np
from timer import get_color, timeit

from msglc import dump
from msglc.config import configure


@timeit
def generate_msg(mat: np.ndarray, block: int):
    """Dump *mat* with msglc using 2**block bytes as the small-object threshold.

    numpy_encoder is disabled so the matrix is stored as plain json rather
    than a single binary blob, keeping per-block lazy reads comparable with
    HDF5 chunked reads.
    """
    # NOTE: the threshold is 2**block bytes (4KB..4MB over the sweep below),
    # not a fixed 16KB as a stale comment previously claimed.
    configure(small_obj_optimization_threshold=2**block, numpy_encoder=False)
    dump(f"data-{block}.msg", mat)


def h5_name(block: int, **kwargs):
    """Build an output file name that encodes chunking/compression settings.

    Any extra keyword argument (e.g. compression) marks the file as
    "compressed"; a positive *block* marks it as "chunked" and appends the
    block exponent.
    """
    file_name = "data"
    if kwargs:
        file_name += "-compressed"
    elif block > 0:
        file_name += "-chunked"

    if block > 0:
        file_name += f"-{block}"

    return f"{file_name}.h5"


@timeit
def generate_h5(mat: np.ndarray, block: int, **kwargs):
    """Dump *mat* with h5py; chunks are sized to roughly 2**block bytes."""
    with h5py.File(h5_name(block, **kwargs), "w") as f:
        if block > 0:
            # Square chunk whose byte size is comparable to msglc's
            # small_obj_optimization_threshold for the same `block`.
            chunk_size = int(sqrt(2**block / 128))
            kwargs["chunks"] = (chunk_size, chunk_size)
        f.create_dataset("data", data=mat, **kwargs)


def _bar_chart(labels, values, colors, ylabel: str, path: str):
    """Render one bar chart; shared implementation for all plot_* helpers."""
    plt.figure(figsize=(10, 10))
    plt.bar(labels, values, color=colors)
    plt.ylabel(ylabel)
    plt.xlabel("format")
    plt.xticks(rotation=-90)
    plt.tight_layout()
    plt.savefig(path)


def plot_write_time(write_time: dict):
    """Bar chart of write durations keyed by generated file name."""
    keys = sorted(write_time)
    _bar_chart(
        keys,
        [write_time[k] for k in keys],
        [get_color(k) for k in keys],
        "time (s)",
        "write_time.pdf",
    )


def plot_file_size(file_size: dict):
    """Bar chart of on-disk sizes (MB); sizes are read from the filesystem."""
    keys = sorted(file_size.keys())
    _bar_chart(
        keys,
        [os.path.getsize(k) / 2**20 for k in keys],
        [get_color(k) for k in keys],
        "size (MB)",
        "file_size.pdf",
    )


def plot_memory_usage(write_memory: dict):
    """Bar chart of write memory usage (placeholder — currently unused)."""
    keys = sorted(write_memory)
    _bar_chart(
        keys,
        [write_memory[k] for k in keys],
        [get_color(k) for k in keys],
        "write memory usage",
        "write_memory.pdf",
    )


if __name__ == "__main__":
    os.chdir(os.path.dirname(__file__))

    collect = {}

    mat = np.random.rand(5000, 5000)

    # Baseline: contiguous (unchunked, uncompressed) HDF5.
    collect[h5_name(-1)] = generate_h5(mat, -1)

    # Sweep block sizes from 4KB (2**12) to 4MB (2**22).
    for i in range(12, 23):
        collect[h5_name(i)] = generate_h5(mat, i)
        collect[h5_name(i, compression="gzip", compression_opts=9)] = generate_h5(
            mat, i, compression="gzip", compression_opts=9
        )
        collect[f"data-{i}.msg"] = generate_msg(mat, i)

    plot_write_time({k: v[0] for k, v in collect.items()})
    plot_file_size(collect)
    # plot_memory_usage({k: v[1] for k, v in collect.items()})
import os
import random

import h5py
import matplotlib.pyplot as plt
from timer import get_color, timeit

from msglc.reader import LazyReader
from msglc.unpacker import MsgspecUnpacker

# Number of random single-element reads per timed run.
repeat = 1000


@timeit
def read_msg(file: str):
    """Fetch `repeat` random matrix elements through the lazy msglc reader."""
    with LazyReader(file, unpacker=MsgspecUnpacker, cached=False) as reader:
        for _ in range(repeat):
            reader[random.randint(0, 4999)][random.randint(0, 4999)]


@timeit
def read_h5(file: str):
    """Fetch `repeat` random matrix elements from an HDF5 dataset."""
    with h5py.File(file, "r") as f:
        dataset = f["data"]
        for _ in range(repeat):
            # Single-element selection. The previous `dataset[i][j]` form
            # first materialised the whole row i as a numpy array, so it
            # benchmarked row reads rather than random element reads.
            dataset[random.randint(0, 4999), random.randint(0, 4999)]


def plot_read_time(time: dict, logscale=False):
    """Bar chart of read durations; optionally with a log-scale y axis."""
    labels = sorted(time)
    plt.figure(figsize=(10, 10))
    plt.bar(labels, [time[k] for k in labels], color=[get_color(k) for k in labels])
    plt.ylabel("time")
    plt.xlabel("format")
    plt.xticks(rotation=-90)
    if logscale:
        plt.yscale("log")
    plt.tight_layout()
    plt.savefig(f"read_time{'_log' if logscale else ''}.pdf")


def plot_memory_usage(memory: dict):
    """Bar chart of read memory usage (placeholder — currently unused)."""
    labels = sorted(memory)
    plt.figure(figsize=(10, 10))
    plt.bar(labels, [memory[k] for k in labels], color=[get_color(k) for k in labels])
    plt.ylabel("memory usage")
    plt.xlabel("format")
    plt.xticks(rotation=-90)
    plt.tight_layout()
    plt.savefig("read_memory_usage.pdf")


if __name__ == "__main__":
    os.chdir(os.path.dirname(__file__))

    collect = {}
    for file in os.listdir():
        # Only files produced by generate.py are named data*.msg / data*.h5.
        if "data" not in file:
            continue
        if "msg" in file:
            collect[file] = read_msg(file)
        elif "h5" in file:
            collect[file] = read_h5(file)

    time_dict = {k: v[0] for k, v in collect.items()}
    memory_dict = {k: v[1] for k, v in collect.items()}
    plot_read_time(time_dict)
    plot_read_time(time_dict, logscale=True)
    # plot_memory_usage(memory_dict)
import time
from functools import wraps


def timeit(func):
    """Decorator that measures the wall-clock duration of *func*.

    The wrapped function returns a tuple ``(duration_seconds, 0)``; the
    second slot is a placeholder for a future memory-usage measurement
    (callers index ``v[0]``/``v[1]``). The wrapped function's own return
    value is intentionally discarded.
    """

    @wraps(func)  # preserve func.__name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        shown = [arg for arg in args if isinstance(arg, (int, str))]
        print(f"Calling function '{func.__name__}' with arguments: args={shown}.")
        # perf_counter is monotonic: immune to wall-clock adjustments,
        # unlike time.time().
        start_time = time.perf_counter()
        func(*args, **kwargs)
        duration = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' executed in {duration:.6f} seconds.")
        return duration, 0

    return wrapper


def get_color(name: str):
    """Map a benchmark file name to a bar color.

    Priority order matters: "msg" wins over "compressed", which wins over
    plain "h5"; anything unrecognised is black.
    """
    if "msg" in name:
        return "red"
    if "compressed" in name:
        return "blue"
    if "h5" in name:
        return "green"
    return "black"