Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
266 changes: 266 additions & 0 deletions records/track_non_record_16mb/2026-04-11_MercuryInRetrograde/README.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
{
"metadata": {
"baseline_script": "/workspace/parameter-golf/records/track_10min_16mb/2026-03-17_NaiveBaseline/train_gpt.py",
"baseline_checkpoint": "/workspace/parameter-golf/records/track_non_record_16mb/2026-04-09_MercuryStyleCompactTextDiffusion/benchmarks/real_8x_seed2026_vs_baseline_20260410_140157/ar_baseline/final_model.pt",
"mercury_script": "/workspace/parameter-golf/records/track_non_record_16mb/2026-04-09_MercuryStyleCompactTextDiffusion/train_mercury_compact_diffusion.py",
"mercury_checkpoint": "/workspace/parameter-golf/records/track_non_record_16mb/2026-04-09_MercuryStyleCompactTextDiffusion/final_model.pt",
"data_path": "/tmp/parameter-golf-data/datasets/fineweb10B_sp1024",
"tokenizer_path": "/tmp/parameter-golf-data/tokenizers/fineweb_1024_bpe.model",
"device": "cuda",
"num_examples": 32,
"prefix_len": 128,
"continuation_len": 64,
"infill_span_len": 64,
"infill_suffix_len": 64,
"benchmark_steps": [
1,
2,
4,
8
],
"showcase_indices": [
192664,
34663,
165861,
152617
]
},
"ar_continuation": {
"token_acc": 0.021484375,
"exact_seq_acc": 0.0,
"tok_per_sec": 1518.7874378194003,
"batch_latency_ms": 1348.444126546383,
"single_latency_ms": 510.45899260789156,
"tokens": 2048.0
},
"mercury_continuation": {
"1": {
"token_acc": 0.0400390625,
"exact_seq_acc": 0.0,
"tok_per_sec": 33315.36467238215,
"batch_latency_ms": 61.47313769906759,
"single_latency_ms": 7.682473957538605,
"tokens": 2048.0
},
"2": {
"token_acc": 0.0400390625,
"exact_seq_acc": 0.0,
"tok_per_sec": 52423.930452321954,
"batch_latency_ms": 39.0661284327507,
"single_latency_ms": 15.342708677053452,
"tokens": 2048.0
},
"4": {
"token_acc": 0.03955078125,
"exact_seq_acc": 0.0,
"tok_per_sec": 43382.67441657036,
"batch_latency_ms": 47.20778577029705,
"single_latency_ms": 30.572386272251606,
"tokens": 2048.0
},
"8": {
"token_acc": 0.03759765625,
"exact_seq_acc": 0.0,
"tok_per_sec": 21693.84538256514,
"batch_latency_ms": 94.40465550869703,
"single_latency_ms": 61.32346969097853,
"tokens": 2048.0
}
},
"mercury_infill": {
"1": {
"token_acc": 0.0400390625,
"exact_seq_acc": 0.0,
"tok_per_sec": 147729.23657320702,
"batch_latency_ms": 13.863200321793556,
"single_latency_ms": 7.631411962211132,
"tokens": 2048.0
},
"2": {
"token_acc": 0.0400390625,
"exact_seq_acc": 0.0,
"tok_per_sec": 86371.16188730345,
"batch_latency_ms": 23.711618036031723,
"single_latency_ms": 15.290173701941967,
"tokens": 2048.0
},
"4": {
"token_acc": 0.03955078125,
"exact_seq_acc": 0.0,
"tok_per_sec": 43263.386592025425,
"batch_latency_ms": 47.33794927597046,
"single_latency_ms": 30.280465446412563,
"tokens": 2048.0
},
"8": {
"token_acc": 0.03759765625,
"exact_seq_acc": 0.0,
"tok_per_sec": 21754.902379810777,
"batch_latency_ms": 94.13970075547695,
"single_latency_ms": 44.69598866999149,
"tokens": 2048.0
}
},
"comparisons": {
"continuation_speedup_vs_ar": {
"1": 21.935501863390904,
"2": 34.516963432084765,
"4": 28.564019780712073,
"8": 14.283661322424479
},
"infill_speedup_vs_ar_continuation": {
"1": 97.26788153140727,
"2": 56.868498998984926,
"4": 28.485478293223736,
"8": 14.323862469553598
}
},
"examples": [
{
"prefix": "customers and building brand awareness through social media initiatives *Performing website analyses utilizing Google Analytics *Participating in planning, executing and analyzing email marketing campaigns - Web Design *Designing web imagery used on Mason Companies websites, display advert",
"continuation_target": "ising and landing pages *Designing imagery and HTML for email marketing campaigns *Participating in various projects such as launching new mobile sit",
"ar_continuation": "ising, and marketing - Web Design *Producing web design ideas for websites, websites, and websites - Web Design *Producing web design ideas for we",
"mercury_continuation_step1": "isment,,,,,,,,,,,,,,inginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginging,inginging",
"infill_target": "ising and landing pages *Designing imagery and HTML for email marketing campaigns *Participating in various projects such as launching new mobile sit",
"infill_suffix": "es and websites *Researching and implementing emerging web design technologies and processes - IT Developer *Java J2EE development *Ecommerce Development on Mason",
"mercury_infill_step1": "isment,,,,,,,,,,,,,,inginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginging,inginging"
},
{
"prefix": "and are responsible for compliance with all applicable laws. You may not access, download, use or export the Information on this web site in violation of U.S. export laws or regulations, or in violation of any applicable local laws or regulations. Webroot Inc. (\"Webroot\") is committed to protecting the intellectual property rights of third parties, and",
"continuation_target": "Webroot requests that its customers and community members do the same. Webroot has no responsibility for content on other websites that you may find or access when using Webroot's produ",
"ar_continuation": "to protecting the privacy of third parties. Webroot is not responsible for the privacy of third parties. Webroot is not responsible for the privacy of third part",
"mercury_continuation_step1": "theinging the the,,,ss,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,",
"infill_target": "Webroot requests that its customers and community members do the same. Webroot has no responsibility for content on other websites that you may find or access when using Webroot's produ",
"infill_suffix": "cts or services, and such content may be protected by copyright and the intellectual property laws of the United States and/or other countries. Without prior notice and at any time, We",
"mercury_infill_step1": "theinging the the,,,ss,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"
},
{
"prefix": "os predicted will emerge without deliberate and intentional actions to support them. And the extent to which they can be shaped to further societal goals will depend on constructive dialogue between governments and citizens themselves. Ultimately, this new publication aims to contribute to this dialogue, so that both developing and developed countries",
"continuation_target": "are more likely to leap into better futures. Text co-authored with Tom Steinberg, originally cross-posted from the World Bank\u2019s Governance for Development",
"ar_continuation": "can be more effectively dialogued and delivered to the public. The publication is aimed at the public and private sectors, and is based on the publication\u2019s publication d",
"mercury_continuation_step1": "will the the to the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the",
"infill_target": "are more likely to leap into better futures. Text co-authored with Tom Steinberg, originally cross-posted from the World Bank\u2019s Governance for Development",
"infill_suffix": "blog. You can also read another article about this report in Apolitical here. While I\u2019m at it: if you work in public service and care about making government work better, I highly recommend",
"mercury_infill_step1": "will the the to the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# AR vs Mercury Decode Benchmark

## Setup
- Baseline checkpoint: `/workspace/parameter-golf/records/track_non_record_16mb/2026-04-09_MercuryStyleCompactTextDiffusion/benchmarks/real_8x_seed2026_vs_baseline_20260410_140157/ar_baseline/final_model.pt`
- Mercury checkpoint: `/workspace/parameter-golf/records/track_non_record_16mb/2026-04-09_MercuryStyleCompactTextDiffusion/final_model.pt`
- Device: `cuda`
- Examples: `32`
- Continuation task: prefix `128` tokens, predict next `64` tokens
- Infill task: prefix `128` tokens, infill `64` tokens, with a `64`-token suffix visible

## Highlights

- AR continuation throughput: `1518.79` tok/s for `64`-token greedy decode.
- Best Mercury continuation accuracy in this run: `0.0400` at `1` refinement step (tied with `2` steps).
- Fastest Mercury continuation setting: `52423.93` tok/s at `2` steps, which is `34.52x` the AR continuation throughput.
- Fastest Mercury infill setting: `147729.24` tok/s at `1` step, which is `97.27x` the AR continuation throughput on the same hardware.

## Continuation

| Model | Mode | Token Acc | Exact Seq | Tok/s | Speedup vs AR | Batch Latency ms | Single-example Latency ms |
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
| AR baseline | greedy | 0.0215 | 0.0000 | 1518.79 | 1.00x | 1348.44 | 510.46 |
| Mercury | 1 refinement step(s) | 0.0400 | 0.0000 | 33315.36 | 21.94x | 61.47 | 7.68 |
| Mercury | 2 refinement step(s) | 0.0400 | 0.0000 | 52423.93 | 34.52x | 39.07 | 15.34 |
| Mercury | 4 refinement step(s) | 0.0396 | 0.0000 | 43382.67 | 28.56x | 47.21 | 30.57 |
| Mercury | 8 refinement step(s) | 0.0376 | 0.0000 | 21693.85 | 14.28x | 94.40 | 61.32 |

## Infill

| Model | Mode | Token Acc | Exact Seq | Tok/s | Speedup vs AR Continuation | Batch Latency ms | Single-example Latency ms |
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
| Mercury | 1 refinement step(s) | 0.0400 | 0.0000 | 147729.24 | 97.27x | 13.86 | 7.63 |
| Mercury | 2 refinement step(s) | 0.0400 | 0.0000 | 86371.16 | 56.87x | 23.71 | 15.29 |
| Mercury | 4 refinement step(s) | 0.0396 | 0.0000 | 43263.39 | 28.49x | 47.34 | 30.28 |
| Mercury | 8 refinement step(s) | 0.0376 | 0.0000 | 21754.90 | 14.32x | 94.14 | 44.70 |

## Example Outputs

### Example 1
- Prefix: customers and building brand awareness through social media initiatives *Performing website analyses utilizing Google Analytics *Participating in planning, executing and analyzing email marketing campaigns - Web Design *Designing web imagery used on Mason Companies websites, display advert
- Continuation target: ising and landing pages *Designing imagery and HTML for email marketing campaigns *Participating in various projects such as launching new mobile sit
- AR continuation: ising, and marketing - Web Design *Producing web design ideas for websites, websites, and websites - Web Design *Producing web design ideas for we
- Mercury continuation (1 step): isment,,,,,,,,,,,,,,inginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginging,inginging
- Infill target: ising and landing pages *Designing imagery and HTML for email marketing campaigns *Participating in various projects such as launching new mobile sit
- Infill suffix: es and websites *Researching and implementing emerging web design technologies and processes - IT Developer *Java J2EE development *Ecommerce Development on Mason
- Mercury infill (1 step): isment,,,,,,,,,,,,,,inginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginginging,inginging

### Example 2
- Prefix: and are responsible for compliance with all applicable laws. You may not access, download, use or export the Information on this web site in violation of U.S. export laws or regulations, or in violation of any applicable local laws or regulations. Webroot Inc. ("Webroot") is committed to protecting the intellectual property rights of third parties, and
- Continuation target: Webroot requests that its customers and community members do the same. Webroot has no responsibility for content on other websites that you may find or access when using Webroot's produ
- AR continuation: to protecting the privacy of third parties. Webroot is not responsible for the privacy of third parties. Webroot is not responsible for the privacy of third part
- Mercury continuation (1 step): theinging the the,,,ss,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
- Infill target: Webroot requests that its customers and community members do the same. Webroot has no responsibility for content on other websites that you may find or access when using Webroot's produ
- Infill suffix: cts or services, and such content may be protected by copyright and the intellectual property laws of the United States and/or other countries. Without prior notice and at any time, We
- Mercury infill (1 step): theinging the the,,,ss,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

### Example 3
- Prefix: os predicted will emerge without deliberate and intentional actions to support them. And the extent to which they can be shaped to further societal goals will depend on constructive dialogue between governments and citizens themselves. Ultimately, this new publication aims to contribute to this dialogue, so that both developing and developed countries
- Continuation target: are more likely to leap into better futures. Text co-authored with Tom Steinberg, originally cross-posted from the World Bank’s Governance for Development
- AR continuation: can be more effectively dialogued and delivered to the public. The publication is aimed at the public and private sectors, and is based on the publication’s publication d
- Mercury continuation (1 step): will the the to the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
- Infill target: are more likely to leap into better futures. Text co-authored with Tom Steinberg, originally cross-posted from the World Bank’s Governance for Development
- Infill suffix: blog. You can also read another article about this report in Apolitical here. While I’m at it: if you work in public service and care about making government work better, I highly recommend
- Mercury infill (1 step): will the the to the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{
"author": "Simon Marcus",
"github_id": "simon-marcus",
"name": "Mercury in Retrograde",
"blurb": "Non-record text-diffusion submission: a compact Mercury-style denoising model with hybrid corruption, self-conditioning, continuation/infill parallel refinement metrics, and an intentionally documented negative result.",
"date": "2026-04-11",
"track": "non_record_16mb",
"non_record": true,
"val_loss": 2.45910030,
"val_bpb": 1.45641771,
"val_loss_std": 0.00620926,
"val_bpb_std": 0.00367747,
"bytes_total": 15677283,
"bytes_code": 83831,
"bytes_model_int8_zlib_max": 15593452,
"hardware": "8xH100 80GB SXM",
"seeds": [1337, 42, 2026],
"seed_results": {
"1337": {
"val_loss": 2.45349105,
"val_bpb": 1.45309560,
"artifact_bytes": 15677283,
"model_int8_zlib_bytes": 15593452,
"steps": 4628,
"step_avg_ms": 129.64
},
"42": {
"val_loss": 2.46577237,
"val_bpb": 1.46036929,
"artifact_bytes": 15531183,
"model_int8_zlib_bytes": 15447352,
"steps": 4912,
"step_avg_ms": 122.14
},
"2026": {
"val_loss": 2.45803749,
"val_bpb": 1.45578825,
"artifact_bytes": 15500938,
"model_int8_zlib_bytes": 15417107,
"steps": 4926,
"step_avg_ms": 121.82
}
},
"technique_summary": "Mercury-style compact text diffusion: progressive 25-35% hybrid corruption, mixed continuation/infill denoising, self-conditioning, small clean CE prior, and parallel refinement evaluation.",
"diffusion_recipe": {
"DIFFUSION_TRAIN_MODE": "mercury",
"DIFFUSION_CORRUPTION_MODE": "hybrid",
"DIFFUSION_HYBRID_MASK_PROB": 0.5,
"DIFFUSION_LOSS_WEIGHT": 1.0,
"DIFFUSION_PROGRESSIVE": 1,
"DIFFUSION_MIN_MASK_RATIO": 0.25,
"DIFFUSION_MAX_MASK_RATIO": 0.35,
"DIFFUSION_SPAN_LEN": 64,
"DIFFUSION_SPANS_PER_SEQ": 8,
"MERCURY_TASK_MODE": "mixed",
"MERCURY_CONTINUATION_PROB": 0.75,
"MERCURY_CLEAN_LOSS_WEIGHT": 0.10,
"MERCURY_SELF_CONDITION_PROB": 1.0,
"MERCURY_SELF_CONDITION_FRACTION": 0.75
},
"parallel_eval_mean": {
"continuation": {
"steps_1": {"token_acc": 0.0348, "tok_per_sec": 25123.81},
"steps_2": {"token_acc": 0.0345, "tok_per_sec": 36784.34},
"steps_4": {"token_acc": 0.0348, "tok_per_sec": 25990.89},
"steps_8": {"token_acc": 0.0355, "tok_per_sec": 13005.42}
},
"infill": {
"steps_1": {"token_acc": 0.0377, "tok_per_sec": 104971.51},
"steps_2": {"token_acc": 0.0384, "tok_per_sec": 52357.51},
"steps_4": {"token_acc": 0.0410, "tok_per_sec": 26075.30},
"steps_8": {"token_acc": 0.0404, "tok_per_sec": 12922.12}
}
},
"matched_decode_benchmark": {
"hardware": "1xH100",
"checkpoint_seed": 2026,
"examples": 32,
"prefix_tokens": 128,
"target_tokens": 64,
"ar_continuation_tok_per_sec": 1518.79,
"mercury_continuation_step1_token_acc": 0.0400,
"mercury_continuation_step1_tok_per_sec": 33315.36,
"mercury_continuation_step1_speedup_vs_ar": 21.94,
"mercury_infill_step1_token_acc": 0.0400,
"mercury_infill_step1_tok_per_sec": 147729.24,
"mercury_infill_step1_speedup_vs_ar_continuation": 97.27
}
}
Loading