2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -38,6 +38,6 @@ jobs:
reg = BenchmarkRegistry()
reg.discover()
ids = reg.list_ids()
-assert len(ids) == 39, f'Expected 39 benchmarks, got {len(ids)}'
+assert len(ids) == 40, f'Expected 40 benchmarks, got {len(ids)}'
"
python scripts/run_benchmarks.py --list
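The CI step above instantiates `BenchmarkRegistry`, discovers all tasks, and asserts the count is now 40. A self-contained toy sketch of that `discover()`/`list_ids()` pattern (the real `BenchmarkRegistry` lives in the `gdb` package; the IDs registered here are placeholders):

```python
# Toy registry mirroring the discover()/list_ids() interface the CI check
# exercises; NOT the gdb implementation, just the pattern.
class BenchmarkRegistry:
    def __init__(self):
        self._benchmarks = {}

    def register(self, bench_id, task_cls):
        self._benchmarks[bench_id] = task_cls

    def discover(self):
        # The real version imports each task module (layout, svg, ...) so
        # their @benchmark decorators run; placeholders stand in here.
        for bench_id in ("layout-1", "typography-7", "image-6"):
            self.register(bench_id, object)

    def list_ids(self):
        return sorted(self._benchmarks)


reg = BenchmarkRegistry()
reg.discover()
ids = reg.list_ids()
assert len(ids) == 3, f"Expected 3 benchmarks, got {len(ids)}"
```

In this scheme a new benchmark (here, `image-6`) only has to register itself for the CI count assertion to move from 39 to 40.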
13 changes: 7 additions & 6 deletions README.md
@@ -1,6 +1,6 @@
# GDB: GraphicDesignBench

-**GDB** evaluates vision-language models on professional graphic design tasks — layout reasoning, typography, SVG editing, template matching, animation. 39 benchmarks across 7 domains, built on the [Lica dataset](https://github.com/lica-world/lica-dataset) (1,148 real design layouts).
+**GDB** evaluates vision-language models on professional graphic design tasks — layout reasoning, typography, SVG editing, template matching, animation. 40 benchmarks across 7 domains, built on the [Lica dataset](https://github.com/lica-world/lica-dataset) (1,148 real design layouts).

**Paper:** [arXiv:2604.04192](https://arxiv.org/abs/2604.04192)  |  **Dataset:** [HuggingFace](https://huggingface.co/datasets/lica-world/GDB)  |  **Blog:** [lica.world](https://lica.world/blog/gdb-real-world-benchmark-for-graphic-design)

@@ -16,7 +16,7 @@ Each task is either **understanding** or **generation**:
| svg | 8 | 8 | SVG reasoning and editing (perceptual and semantic Q/A, bug fixing, optimization, style editing) and generation (text-to-SVG, image-to-SVG, combined input) |
| template | 5 | 5 | Template matching, retrieval, clustering, and generation (style completion, color transfer) |
| temporal | 8 | 6 | Keyframe ordering; motion type classification; video/component duration and start-time estimation; generation (animation parameters, motion trajectory, short-form video) |
-| typography | 12 | 8 | Font family, color, size/weight/alignment/letter spacing/line height, style ranges, curvature, rotation, and generation (styled text element, styled text rendering to layout) |
+| typography | 13 | 9 | Font family, color, size/weight/alignment/letter spacing/line height, style ranges, curvature, rotation, and generation (styled text element, styled text rendering to layout, text removal/background inpainting as `image-6`) |

## Setup

@@ -92,7 +92,7 @@ python scripts/run_benchmarks.py --benchmarks svg-1 \
--provider hf --device auto \
--dataset-root data/gdb-dataset

-# Diffusion / image generation (defaults to FLUX.2 klein 4B)
+# Diffusion / image generation (defaults to FLUX.2 klein 9B)
python scripts/run_benchmarks.py --benchmarks layout-1 \
--provider diffusion \
--dataset-root data/gdb-dataset
@@ -109,7 +109,7 @@ python -m pip install --no-deps --ignore-requires-python \
python scripts/run_benchmarks.py --benchmarks layout-1 layout-3 layout-8 typography-7 typography-8 \
--provider custom \
--custom-entry gdb.models.local_models:Flux2Model \
-  --custom-init-kwargs '{"model_name":"flux.2-klein-4b"}' \
+  --custom-init-kwargs '{"model_name":"flux.2-klein-9b"}' \
--custom-modality image_generation \
--dataset-root data/gdb-dataset
```
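For `--provider custom`, the entry point is a `module:Class` path whose `__init__` receives the parsed `--custom-init-kwargs` JSON. A hypothetical sketch of such a class (the `generate_image` method name and the exact interface `gdb` expects from a custom model are assumptions, not the documented contract):

```python
import json


class MyImageModel:
    """Hypothetical custom entry point class for --provider custom."""

    def __init__(self, model_name: str = "flux.2-klein-9b"):
        # Keys from --custom-init-kwargs become __init__ keyword arguments.
        self.model_name = model_name

    def generate_image(self, prompt: str) -> bytes:
        # A real implementation would load a diffusion pipeline for
        # self.model_name and render the prompt; stubbed out here.
        return f"[{self.model_name}] {prompt}".encode()


# Mirrors how the CLI would parse --custom-init-kwargs:
kwargs = json.loads('{"model_name":"flux.2-klein-9b"}')
model = MyImageModel(**kwargs)
```

Passing the kwargs as a JSON string keeps the CLI generic: any constructor argument of the custom class can be set without new flags.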
@@ -132,7 +132,7 @@ helm-summarize --suite gdb-eval
helm-server --suite gdb-eval
```

-All 39 benchmarks are available. See [integrations/helm/](integrations/helm/) for details.
+All 40 benchmarks are available. See [integrations/helm/](integrations/helm/) for details.

### API keys

@@ -186,12 +186,13 @@ GDB/
├── src/gdb/
│ ├── tasks/ # @benchmark classes — one file per domain
│ │ ├── category.py # category-1, category-2
+│ │ ├── image.py # compatibility shim (re-exports image-6)
│ │ ├── layout.py # layout-1 … layout-8
│ │ ├── lottie.py # lottie-1, lottie-2
│ │ ├── svg.py # svg-1 … svg-8
│ │ ├── template.py # template-1 … template-5
│ │ ├── temporal.py # temporal-1 … temporal-6
-│ │ └── typography.py # typography-1 … typography-8
+│ │ └── typography.py # typography-1 … typography-8 + image-6 implementation
│ ├── models/ # Provider wrappers (OpenAI, Anthropic, Gemini, HF, vLLM)
│ ├── metrics/ # Reusable metric functions (IoU, FID, SSIM, LPIPS, edit distance)
│ ├── evaluation/
6 changes: 3 additions & 3 deletions integrations/helm/README.md
@@ -1,6 +1,6 @@
# lica-gdb-helm

-HELM integration for [GDB (GraphicDesignBench)](https://github.com/lica-world/GDB) — run all 39 GDB benchmarks through Stanford CRFM's [HELM](https://github.com/stanford-crfm/helm) framework.
+HELM integration for [GDB (GraphicDesignBench)](https://github.com/lica-world/GDB) — run all 40 GDB benchmarks through Stanford CRFM's [HELM](https://github.com/stanford-crfm/helm) framework.

## Install

@@ -35,7 +35,7 @@ helm-server --suite gdb-eval

## Available benchmarks

-All 39 GDB benchmarks are available. Pass any benchmark ID:
+All 40 GDB benchmarks are available. Pass any benchmark ID:

| Domain | Benchmark IDs |
|--------|--------------|
@@ -44,7 +44,7 @@ All 39 GDB benchmarks are available. Pass any benchmark ID:
| SVG | `svg-1` through `svg-8` |
| Template | `template-1` through `template-5` |
| Temporal | `temporal-1` through `temporal-6` |
-| Typography | `typography-1` through `typography-8` |
+| Typography | `typography-1` through `typography-8`, `image-6` |
| Lottie | `lottie-1`, `lottie-2` |

## Options
1 change: 1 addition & 0 deletions integrations/helm/src/gdb_helm/_benchmark_info.py
@@ -73,6 +73,7 @@ class BenchmarkInfo:
"typography-6": BenchmarkInfo(method="generation_multimodal", max_tokens=256, has_images=True),

# -- typography: generation --
+"image-6": BenchmarkInfo(method="generation", max_tokens=0, has_images=True, image_gen=True),
"typography-7": BenchmarkInfo(method="generation", max_tokens=0, has_images=True, image_gen=True),
"typography-8": BenchmarkInfo(method="generation", max_tokens=0, image_gen=True),

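The mapping above routes each benchmark ID to its runner configuration, so registering `image-6` as an image-generation task is a one-line change. A toy reconstruction of the pattern (the field names come from the diff; the dataclass defaults are assumptions):

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class BenchmarkInfo:
    method: str
    max_tokens: int = 0
    has_images: bool = False
    image_gen: bool = False


BENCHMARK_INFO = {
    "image-6": BenchmarkInfo(method="generation", max_tokens=0,
                             has_images=True, image_gen=True),
    "typography-8": BenchmarkInfo(method="generation", image_gen=True),
}

# A runner can branch on these flags, e.g. to pick an image-generation client:
info = BENCHMARK_INFO["image-6"]
needs_diffusion_backend = info.image_gen
```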
2 changes: 1 addition & 1 deletion integrations/helm/src/gdb_helm/scenarios.py
@@ -1,6 +1,6 @@
"""HELM Scenario that wraps any GDB benchmark.
-One parameterized class handles all 39 benchmarks by delegating data loading
+One parameterized class handles all 40 benchmarks by delegating data loading
and prompt construction to the ``gdb`` package.
"""

6 changes: 3 additions & 3 deletions scripts/README.md
@@ -46,7 +46,7 @@ python scripts/run_benchmarks.py --benchmarks svg-6 \
--provider vllm --model-id Qwen/Qwen3-VL-4B-Instruct --top-k 20 --top-p 0.8 \
--dataset-root data/gdb-dataset

-# Diffusion / image generation (defaults to FLUX.2 klein 4B)
+# Diffusion / image generation (defaults to FLUX.2 klein 9B)
python scripts/run_benchmarks.py --benchmarks layout-1 \
--provider diffusion \
--dataset-root data/gdb-dataset
@@ -69,7 +69,7 @@ python -m pip install --no-deps --ignore-requires-python \
python scripts/run_benchmarks.py --benchmarks layout-1 layout-3 layout-8 typography-7 typography-8 \
--provider custom \
--custom-entry gdb.models.local_models:Flux2Model \
-  --custom-init-kwargs '{"model_name":"flux.2-klein-4b"}' \
+  --custom-init-kwargs '{"model_name":"flux.2-klein-9b"}' \
--custom-modality image_generation \
--dataset-root data/gdb-dataset

@@ -100,7 +100,7 @@ from Hugging Face and can use either environment tokens (`HF_TOKEN`,
`HF_HUB_TOKEN`) or an existing cached login/token file.

The default local text/VLM model ID is now `Qwen/Qwen3-VL-4B-Instruct` for both
-`hf` and `vllm`, and the default `diffusion` model ID is `flux.2-klein-4b`.
+`hf` and `vllm`, and the default `diffusion` model ID is `flux.2-klein-9b`.

### Batch submit/collect (~50% cheaper)

2 changes: 1 addition & 1 deletion scripts/run_benchmarks.py
@@ -84,7 +84,7 @@
"anthropic": "claude-sonnet-4-20250514",
"hf": "Qwen/Qwen3-VL-4B-Instruct",
"vllm": "Qwen/Qwen3-VL-4B-Instruct",
-"diffusion": "flux.2-klein-4b",
+"diffusion": "flux.2-klein-9b",
"custom": "custom-entrypoint",
}

2 changes: 1 addition & 1 deletion scripts/upload_to_hf.py
@@ -442,7 +442,7 @@ def generate_dataset_card(config_names: Optional[List[str]] = None) -> str:

# GDB: GraphicDesignBench

-39 benchmarks for evaluating vision-language models on graphic design tasks — layout, typography, SVG, template matching, animation. Built on 1,148 real design layouts from the [Lica dataset](https://lica.world).
+40 benchmarks for evaluating vision-language models on graphic design tasks — layout, typography, SVG, template matching, animation. Built on 1,148 real design layouts from the [Lica dataset](https://lica.world).

**Paper:** [arXiv:2604.04192](https://arxiv.org/abs/2604.04192)  |  **Code:** [github.com/lica-world/GDB](https://github.com/lica-world/GDB)  |  **Blog:** [lica.world](https://lica.world/blog/gdb-real-world-benchmark-for-graphic-design)

3 changes: 2 additions & 1 deletion src/gdb/metrics/__init__.py
@@ -1,13 +1,14 @@
"""Shared metric implementations for GDB benchmarks."""

-from .core import edit_distance, fid, iou, lpips_score, ssim
+from .core import edit_distance, fid, iou, lpips_score, psnr, ssim
from .text import normalize_font_name

__all__ = [
"edit_distance",
"fid",
"iou",
"lpips_score",
+"psnr",
"normalize_font_name",
"ssim",
]
21 changes: 21 additions & 0 deletions src/gdb/metrics/core.py
@@ -76,6 +76,27 @@ def edit_distance(source: str, target: str) -> float:
# ---------------------------------------------------------------------------


+def psnr(pred: Any, gt: Any) -> float:
+    """Peak signal-to-noise ratio.
+
+    If the optional third-party ``evaluation.image`` module is available,
+    delegates to it; otherwise uses ``scikit-image``.
+    """
+    try:
+        from evaluation.image import psnr as _psnr
+
+        return _psnr(pred, gt)
+    except ImportError:
+        pass
+
+    try:
+        from skimage.metrics import peak_signal_noise_ratio
+    except ImportError:
+        raise _missing_extra("scikit-image", "metrics")
+
+    return float(peak_signal_noise_ratio(gt, pred))
+
+
def ssim(pred: Any, gt: Any) -> float:
    """Structural similarity index.

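The new `psnr` helper prefers an optional `evaluation.image` backend and falls back to scikit-image's `peak_signal_noise_ratio`. For intuition, PSNR for 8-bit images is just `10 * log10(255**2 / MSE)`; a NumPy-only sketch of the formula (an illustration, not the gdb implementation):

```python
import numpy as np


def psnr_manual(pred, gt, max_val=255.0):
    """PSNR in decibels; higher means pred is closer to gt."""
    mse = np.mean((pred.astype(np.float64) - gt.astype(np.float64)) ** 2)
    if mse == 0:
        return float("inf")  # identical images
    return 10.0 * np.log10(max_val ** 2 / mse)


gt = np.zeros((8, 8), dtype=np.uint8)
pred = gt.copy()
pred[0, 0] = 16  # perturb a single pixel: MSE = 16**2 / 64 = 4
print(round(psnr_manual(pred, gt), 2))  # → 42.11
```

Note the argument order in the fallback, `peak_signal_noise_ratio(gt, pred)`: scikit-image treats the first argument as the reference image, which is what determines the data range.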