Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDF5 benchmark #3

Merged
merged 4 commits into from
Mar 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,29 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
slug: TLCFEM/msglc
plugins: pycoverage
# benchmark:
# name: Benchmark
# runs-on: ubuntu-latest
# timeout-minutes: 100
# steps:
# - name: Clone
# uses: actions/checkout@v4
# - name: Python
# uses: actions/setup-python@v5
# with:
# python-version: 3.11
# - name: Build
# run: pip install .[dev,msgspec,numpy] matplotlib h5py
# - name: Test
# run: |
# python3 h5/generate.py
# python3 h5/read.py
# tar czf benchmark.tar.gz ./h5/*.pdf
# - name: Upload
# uses: actions/upload-artifact@v4
# with:
# name: msglc-benchmark
# path: benchmark.tar.gz
wheels:
name: Build
runs-on: ubuntu-latest
Expand Down
1 change: 1 addition & 0 deletions docs/.pages
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ nav:
- msglc: index.md
- Motivation: motivation.md
- Examples: tutorial.md
- Benchmark: benchmark.md
- API: api
60 changes: 60 additions & 0 deletions docs/benchmark.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Benchmark

The embedded structure allows fast read without loading the whole archive, which is the main advantage of this package.
In the following, we benchmark the random read performance and compare with the `HDF5` format.

## Data Generation

A square matrix of size 5000 with random floating-point numbers is used.
The matrix is dumped onto the disk with different configurations.

1. For `msglc`, `small_obj_optimization_threshold` varies from 4KB to 4MB; `numpy_encoder` is switched off so that the matrix is stored as plain JSON instead of a binary blob.
2. For `h5py`, the chunk size is computed so that each block has a size similar to `small_obj_optimization_threshold`. Compression is optionally switched on.

The following code snippets show the relevant functions.

```py
def generate_msg(mat: np.ndarray, block: int):
configure(small_obj_optimization_threshold=2**block, numpy_encoder=False)  # 2**block bytes, i.e. 4KB to 4MB
dump(f"data-{block}.msg", mat)

def generate_h5(mat: np.ndarray, block: int, **kwargs):
with h5py.File(h5_name(block, **kwargs), "w") as f:
if block > 0:
chunk_size = int(sqrt(2**block / 128))
kwargs["chunks"] = (chunk_size, chunk_size)
f.create_dataset("data", data=mat, **kwargs)
```

The write time of `msglc` is in general constant, because the packer needs to traverse the whole json object.
Depending on different configurations, `h5py` requires different amounts of time to dump the matrix.

![write time](./write_time.pdf)

`msglc` shall be used for data that is written to disk for cold storage and does not require frequent changes.
When compression is on, `h5py` needs to traverse the object just like `msglc`, thus requires a similar amount of time.

## Read Test

We mainly test the random read.
To this end, we repeatedly read random locations in the matrix and measure the time required.

```py
@timeit
def read_msg(file: str):
with LazyReader(file, unpacker=MsgspecUnpacker, cached=False) as reader:
for _ in range(repeat):
reader[random.randint(0, 4999)][random.randint(0, 4999)]


@timeit
def read_h5(file: str):
with h5py.File(file, "r") as f:
dataset = f["data"]
for _ in range(repeat):
dataset[random.randint(0, 4999)][random.randint(0, 4999)]
```

![read 1k random elements](./read_time_log_1k.pdf)

![read 10k random elements](./read_time_log_10k.pdf)
Binary file added docs/read_time_log_10k.pdf
Binary file not shown.
Binary file added docs/read_time_log_1k.pdf
Binary file not shown.
Binary file added docs/write_time.pdf
Binary file not shown.
113 changes: 113 additions & 0 deletions tests/h5/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
from math import sqrt

import h5py
import matplotlib.pyplot as plt
import numpy as np
from timer import get_color, timeit

from msglc import dump
from msglc.config import configure


@timeit
def generate_msg(mat: np.ndarray, block: int):
    """Dump *mat* to ``data-{block}.msg`` with a block-dependent threshold.

    Args:
        mat: The matrix to serialise.
        block: Exponent of the small-object threshold; the threshold
            passed to ``configure`` is ``2**block``.
    """
    threshold = 2**block
    # The numpy encoder is turned off for this benchmark.
    configure(small_obj_optimization_threshold=threshold, numpy_encoder=False)
    target = f"data-{block}.msg"
    dump(target, mat)


def h5_name(block: int, **kwargs):
    """Build the HDF5 file name for one benchmark configuration.

    Args:
        block: Block-size exponent; only positive values appear in the name.
        **kwargs: Extra dataset options. Any option at all marks the file
            as "compressed".

    Returns:
        ``data[-compressed|-chunked][-{block}].h5``.
    """
    has_block = block > 0

    # "compressed" takes precedence over "chunked".
    if kwargs:
        tag = "-compressed"
    elif has_block:
        tag = "-chunked"
    else:
        tag = ""

    suffix = f"-{block}" if has_block else ""
    return f"data{tag}{suffix}.h5"


@timeit
def generate_h5(mat: np.ndarray, block: int, **kwargs):
    """Write *mat* as dataset ``"data"`` into the file named by ``h5_name``.

    Args:
        mat: The matrix to store.
        block: Block-size exponent. When positive, the dataset is stored
            with square chunks of side ``int(sqrt(2**block / 128))``.
        **kwargs: Extra options forwarded to ``create_dataset``.
    """
    # The file name is derived before "chunks" is added to the options,
    # so chunking alone never counts as an extra option in the name.
    target = h5_name(block, **kwargs)
    with h5py.File(target, "w") as f:
        options = dict(kwargs)
        if block > 0:
            side = int(sqrt(2**block / 128))
            options["chunks"] = (side, side)
        f.create_dataset("data", data=mat, **options)


def plot_write_time(write_time: dict):
    """Save a bar chart of write times to ``write_time.pdf``.

    Args:
        write_time: Maps a file name to its write duration. Bars are drawn
            in sorted key order and coloured by ``get_color``.
    """
    labels = sorted(write_time)
    durations = [write_time[label] for label in labels]
    colors = [get_color(label) for label in labels]

    plt.figure(figsize=(10, 10))
    plt.bar(labels, durations, color=colors)
    plt.xlabel("format")
    plt.ylabel("time (s)")
    plt.xticks(rotation=-90)
    plt.tight_layout()
    plt.savefig("write_time.pdf")


def plot_file_size(file_size: dict):
    """Save a bar chart of on-disk file sizes to ``file_size.pdf``.

    Args:
        file_size: Mapping whose keys are file paths. Only the keys are
            used; each size is read from disk and converted to MB.
    """
    labels = sorted(file_size)
    sizes_mb = [os.path.getsize(label) / 2**20 for label in labels]
    colors = [get_color(label) for label in labels]

    plt.figure(figsize=(10, 10))
    plt.bar(labels, sizes_mb, color=colors)
    plt.xlabel("format")
    plt.ylabel("size (MB)")
    plt.xticks(rotation=-90)
    plt.tight_layout()
    plt.savefig("file_size.pdf")


def plot_memory_usage(write_memory: dict):
    """Save a bar chart of write memory usage to ``write_memory.pdf``.

    Args:
        write_memory: Maps a file name to its memory figure. Bars are drawn
            in sorted key order and coloured by ``get_color``.
    """
    labels = sorted(write_memory)
    usage = [write_memory[label] for label in labels]
    colors = [get_color(label) for label in labels]

    plt.figure(figsize=(10, 10))
    plt.bar(labels, usage, color=colors)
    plt.xlabel("format")
    plt.ylabel("write memory usage")
    plt.xticks(rotation=-90)
    plt.tight_layout()
    plt.savefig("write_memory.pdf")


if __name__ == "__main__":
    # Run from the script's own directory so data files and plots land there.
    os.chdir(os.path.dirname(__file__))

    mat = np.random.rand(5000, 5000)
    gzip_options = {"compression": "gzip", "compression_opts": 9}

    # Baseline: a single HDF5 file without chunking or compression.
    collect = {h5_name(-1): generate_h5(mat, -1)}

    # One chunked HDF5, one compressed HDF5 and one msglc file per block size.
    for i in range(12, 23):
        collect[h5_name(i)] = generate_h5(mat, i)
        collect[h5_name(i, **gzip_options)] = generate_h5(mat, i, **gzip_options)
        collect[f"data-{i}.msg"] = generate_msg(mat, i)

    plot_write_time({name: result[0] for name, result in collect.items()})
    plot_file_size(collect)
    # Memory plot is currently disabled:
    # plot_memory_usage({k: v[1] for k, v in collect.items()})
83 changes: 83 additions & 0 deletions tests/h5/read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os
import random

import h5py
import matplotlib.pyplot as plt
from timer import get_color, timeit

from msglc.reader import LazyReader
from msglc.unpacker import MsgspecUnpacker

repeat = 1000


@timeit
def read_msg(file: str):
    """Read ``repeat`` random elements from a msglc file.

    Args:
        file: Path of the ``.msg`` file to open with an uncached
            ``LazyReader`` using ``MsgspecUnpacker``.
    """
    with LazyReader(file, unpacker=MsgspecUnpacker, cached=False) as reader:
        for _ in range(repeat):
            # Pick a random row first, then a random column inside it.
            row = reader[random.randint(0, 4999)]
            row[random.randint(0, 4999)]


@timeit
def read_h5(file: str):
    """Read ``repeat`` random elements from the ``"data"`` dataset of a file.

    Args:
        file: Path of the HDF5 file to open read-only.
    """
    with h5py.File(file, "r") as f:
        dataset = f["data"]
        for _ in range(repeat):
            # NOTE(review): indexing the row and then the column presumably
            # loads the whole row before picking one value — confirm this is
            # the access pattern the benchmark intends to measure.
            row = dataset[random.randint(0, 4999)]
            row[random.randint(0, 4999)]


def plot_read_time(time: dict, logscale=False):
    """Save a bar chart of read times.

    Args:
        time: Maps a file name to its read duration. Bars are drawn in
            sorted key order and coloured by ``get_color``.
        logscale: When true, use a logarithmic y axis and write
            ``read_time_log.pdf``; otherwise write ``read_time.pdf``.
    """
    labels = sorted(time)
    durations = [time[label] for label in labels]
    colors = [get_color(label) for label in labels]

    plt.figure(figsize=(10, 10))
    plt.bar(labels, durations, color=colors)
    plt.xlabel("format")
    plt.ylabel("time")
    plt.xticks(rotation=-90)
    if logscale:
        plt.yscale("log")
    plt.tight_layout()

    suffix = "_log" if logscale else ""
    plt.savefig(f"read_time{suffix}.pdf")


def plot_memory_usage(memory: dict):
    """Save a bar chart of read memory usage to ``read_memory_usage.pdf``.

    Args:
        memory: Maps a file name to its memory figure. Bars are drawn in
            sorted key order and coloured by ``get_color``.
    """
    labels = sorted(memory)
    usage = [memory[label] for label in labels]
    colors = [get_color(label) for label in labels]

    plt.figure(figsize=(10, 10))
    plt.bar(labels, usage, color=colors)
    plt.xlabel("format")
    plt.ylabel("memory usage")
    plt.xticks(rotation=-90)
    plt.tight_layout()
    plt.savefig("read_memory_usage.pdf")


if __name__ == "__main__":
    # Run from the script's own directory, where the data files are looked up.
    os.chdir(os.path.dirname(__file__))

    # Checked in order: a name containing "msg" is read as msglc, else "h5".
    readers = (("msg", read_msg), ("h5", read_h5))

    collect = {}
    for file in os.listdir():
        if "data" not in file:
            continue
        reader = next((fn for marker, fn in readers if marker in file), None)
        if reader is not None:
            collect[file] = reader(file)

    time_dict = {name: result[0] for name, result in collect.items()}
    memory_dict = {name: result[1] for name, result in collect.items()}
    plot_read_time(time_dict)
    plot_read_time(time_dict, logscale=True)
    # Memory plot is currently disabled:
    # plot_memory_usage(memory_dict)
26 changes: 26 additions & 0 deletions tests/h5/timer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import time


def timeit(func):
    """Decorate *func* so that each call is timed and reported.

    The wrapper prints the call, runs *func*, prints the elapsed time and
    returns the measurement instead of *func*'s own result.

    Args:
        func: The callable to time. Its return value is discarded.

    Returns:
        A wrapper that returns ``(duration, 0)``: the elapsed time in
        seconds and a constant ``0`` in the second slot.
    """
    # Local import so the decorator does not depend on extra module imports.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        # Only int/str positional arguments are echoed, so large objects
        # passed alongside them are not printed.
        print(
            f"Calling function '{func.__name__}' with arguments: args={[arg for arg in args if isinstance(arg, (int, str))]}."
        )
        # perf_counter is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        start_time = time.perf_counter()
        func(*args, **kwargs)
        duration = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' executed in {duration:.6f} seconds.")
        return duration, 0

    return wrapper


def get_color(input: str):
    """Return the bar colour for a file name.

    Args:
        input: File name to classify by substring.

    Returns:
        ``"red"`` if the name contains ``"msg"``, else ``"blue"`` for
        ``"compressed"``, else ``"green"`` for ``"h5"``, else ``"black"``.
    """
    # Order matters: the first marker found in the name wins.
    palette = (("msg", "red"), ("compressed", "blue"), ("h5", "green"))
    return next((colour for marker, colour in palette if marker in input), "black")