diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 86dc197..91cb844 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -342,7 +342,7 @@ EOF
     --embedding models/Qwen3-Embedding-0.6B-Q8_0.gguf \
     --dit models/acestep-v15-base-Q8_0.gguf \
     --vae models/vae-BF16.gguf \
-    --wav
+    --output wav
 ```
 
 Available track names for lego, extract, and complete: `vocals`, `backing_vocals`,
@@ -711,8 +711,14 @@ LoRA:
 
 Output:
   Default: MP3 at 128 kbps. input.json -> input0.mp3, input1.mp3, ...
+  --output <format>       Output audio file format (default: mp3)
+                            Supported values: mp3, wav, wav16, wav24, wav32
+                              mp3: MPEG-1 Audio Layer III encoded audio
+                              wav/wav16: 16-bit signed-integer WAVE audio
+                              wav24: 24-bit signed-integer WAVE audio
+                              wav32: 32-bit IEEE floating-point WAVE audio
+                                (wav32 disables normalization & peak clip)
   --mp3-bitrate <kbps>    MP3 bitrate (default: 128)
-  --wav                   Output WAV instead of MP3
 
 Memory control:
   --vae-chunk <N>         Latent frames per tile (default: 256)
@@ -923,6 +929,13 @@ Output:
   -o <path>               Output file (auto-named if omitted)
   --q8                    Quantize latent to int8 (~13 kbit/s)
   --q4                    Quantize latent to int4 (~6.8 kbit/s)
+  --wav-format <fmt>      WAV audio format (default: wav16)
+                            Requires use of --decode
+                            Supported values: wav, wav16, wav24, wav32
+                              wav/wav16: 16-bit signed-integer WAVE audio
+                              wav24: 24-bit signed-integer WAVE audio
+                              wav32: 32-bit IEEE floating-point WAVE audio
+                                (wav32 disables normalization & peak clip)
 
 Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)
                song.latent -> song.wav
@@ -979,9 +992,16 @@ from output extension).
 ```
 Usage: mp3-codec -i <input> -o <o> [options]
 
-  -i <path>   Input file (WAV or MP3)
-  -o <path>   Output file (WAV or MP3)
-  -b <kbps>   Bitrate for MP3 encoding (default: 128)
+  -i <path>          Input file (WAV or MP3)
+  -o <path>          Output file (WAV or MP3)
+  -b <kbps>          Bitrate for MP3 encoding (default: 128)
+  --wav-format <fmt> WAV audio format (default: wav16)
+                       Requires use of -o with a .wav extension
+                       Supported values: wav, wav16, wav24, wav32
+                         wav/wav16: 16-bit signed-integer WAVE audio
+                         wav24: 24-bit signed-integer WAVE audio
+                         wav32: 32-bit IEEE floating-point WAVE audio
+                           (wav32 disables normalization & peak clip)
 
 Mode is auto-detected from output extension.
 
@@ -989,6 +1009,7 @@ Examples:
   mp3-codec -i song.wav -o song.mp3
   mp3-codec -i song.wav -o song.mp3 -b 192
   mp3-codec -i song.mp3 -o song.wav
+  mp3-codec -i song.mp3 -o song.wav --wav-format wav32
 ```
 
 ## ace-understand reference
diff --git a/examples/lego.sh b/examples/lego.sh
index 491820c..98c7a63 100755
--- a/examples/lego.sh
+++ b/examples/lego.sh
@@ -21,7 +21,7 @@ set -eu
     --embedding ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
     --dit ../models/acestep-v15-turbo-Q8_0.gguf \
     --vae ../models/vae-BF16.gguf \
-    --wav
+    --output wav
 
 # Phase 2: lego guitar on the generated track (base model required)
 ../build/ace-synth \
@@ -30,4 +30,4 @@ set -eu
     --embedding ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
     --dit ../models/acestep-v15-base-Q8_0.gguf \
     --vae ../models/vae-BF16.gguf \
-    --wav
+    --output wav
diff --git a/src/audio-io.h b/src/audio-io.h
index 96b6e95..af0cb6c 100644
--- a/src/audio-io.h
+++ b/src/audio-io.h
@@ -6,11 +6,14 @@
 // Part of acestep.cpp. MIT license.
 
 #include <algorithm>
+#include <cassert>
 #include <chrono>
 #include <cmath>
+#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <limits>
 #include <string>
 #include <thread>
 #include <vector>
@@ -355,70 +358,392 @@ static float * audio_planar_to_interleaved(const float * planar, int T) {
     return out;
 }
 
-// audio_encode_wav is the core: encode planar stereo to WAV 16-bit PCM in memory.
-// 44-byte RIFF header + interleaved int16 samples.
-// audio is planar [L0..LN, R0..RN], pre-normalized by caller.
-// Does NOT normalize - caller is responsible (audio_write does it).
-// Returns empty string on failure.
-static std::string audio_encode_wav(const float * audio, int T_audio, int sr) {
-    int n_channels = 2, bits = 16;
-    int byte_rate   = sr * n_channels * (bits / 8);
-    int block_align = n_channels * (bits / 8);
-    int data_size   = T_audio * n_channels * (bits / 8);
-    int file_size   = 36 + data_size;
+static_assert(sizeof(float) == 4, "requires 32-bit float");
+static_assert(std::numeric_limits<float>::is_iec559, "requires IEEE-754 float");
 
-    std::string out;
-    out.resize(44 + (size_t) data_size);
-    char * p = &out[0];
+inline void wav_write_u16le(char *& p, std::uint16_t x) {
+    // Store x least-significant byte first and leave p just past the two bytes written.
+    for (unsigned shift = 0; shift < 16; shift += 8) { *p++ = static_cast<char>((x >> shift) & 0xffu); }
+}
+
+inline void wav_write_u24le(char *& p, std::uint32_t x) {
+    // Emit only the low 24 bits of x, least-significant byte first;
+    // bits 24..31 are dropped and p ends up three bytes further on.
+    for (unsigned shift = 0; shift < 24; shift += 8) { *p++ = static_cast<char>((x >> shift) & 0xffu); }
+}
+
+inline void wav_write_u32le(char *& p, std::uint32_t x) {
+    // Serialize x as four little-endian bytes regardless of host byte order.
+    for (unsigned shift = 0; shift < 32; shift += 8) {
+        *p++ = static_cast<char>((x >> shift) & 0xffu);  // p advances one byte per iteration
+    }
+}
+
+inline float wav_sanitize(float x) {
+    return (std::isnan(x) || std::isinf(x)) ? 0.0f : x;  // NaN and +/-Inf become 0.0f; finite values pass through
+}
+
+inline void wav_write_f32le(char *& p, float x) {
+    const float   finite = wav_sanitize(x);  // NaN / +-Inf are written as 0.0f
+    std::uint32_t bits   = 0;
+    std::memcpy(&bits, &finite, sizeof(bits));  // copy the float's bit pattern into an integer
+    wav_write_u32le(p, bits);                   // emit those 32 bits little-endian, advancing p
+}
+ 
+inline float wav_clamp1(float x) {
+    return x > 1.0f ? 1.0f : (x < -1.0f ? -1.0f : x);  // limit to [-1, 1]; NaN fails both tests and passes through
+}
+
+inline std::int16_t wav_quantize_s16(float x) {
+    // Sanitize, clamp to [-1, 1], then scale by 32767; the cast truncates toward zero.
+    return static_cast<std::int16_t>(wav_clamp1(wav_sanitize(x)) * 32767.0f);
+}
+
+inline std::int32_t wav_quantize_s24(float x) {
+    // Sanitize, clamp to [-1, 1], then scale by 8388607 (2^23 - 1); the cast truncates toward zero.
+    return static_cast<std::int32_t>(wav_clamp1(wav_sanitize(x)) * 8388607.0f);
+}
+
+inline void wav_write_s16le(char *& p, std::int16_t x) {
+    wav_write_u16le(p, static_cast<std::uint16_t>(x));  // same bits, reinterpreted as unsigned, written little-endian
+}
+
+inline void wav_write_s24le(char *& p, std::int32_t x) {
+    wav_write_u24le(p, static_cast<std::uint32_t>(x));  // wav_write_u24le keeps only the low three bytes
+}
+
+inline void wav_write_guid_pcm(char *& p) {
+    // Writes the 16-byte GUID 00000001-0000-0010-8000-00AA00389B71 at p:
+    // the first three fields go out little-endian, the last eight bytes verbatim.
+    static const unsigned char guid_tail[8] = { 0x80, 0x00, 0x00, 0xAA, 0x00, 0x38, 0x9B, 0x71 };
+
+    wav_write_u32le(p, 0x00000001u);
+    wav_write_u16le(p, 0x0000u);
+    wav_write_u16le(p, 0x0010u);
+
+    for (unsigned char byte : guid_tail) {
+        *p++ = (char) byte;
+    }
+}
+
+inline void wav_write_header_basic(  // emits the 44-byte classic RIFF/WAVE header (16-byte fmt chunk) at p
+        char *& p,                // write cursor; left just past the header
+        int T_audio,              // samples per channel
+        int sr,                   // sample rate written to the header
+        int n_channels,
+        int bits,                 // bits per sample; only 16 and 32 pass the assert below
+        std::uint16_t fmt_tag) {  // format tag stored verbatim (this file passes 1 or 3)
+    assert(T_audio >= 0);
+    assert(sr > 0);
+    assert(n_channels > 0);
+    assert(bits == 16 || bits == 32);
+
+    const std::uint32_t bytes_per_sample = (std::uint32_t) bits / 8;
+    const std::uint32_t byte_rate        = (std::uint32_t) sr * (std::uint32_t) n_channels * bytes_per_sample;
+    const std::uint16_t block_align      = (std::uint16_t) (n_channels * (int) bytes_per_sample);  // bytes per frame
+    const std::uint32_t data_size        = (std::uint32_t) T_audio * (std::uint32_t) n_channels * bytes_per_sample;
+    const std::uint32_t file_size        = 36 + data_size;  // RIFF size field: everything after the first 8 bytes
 
-    // RIFF header
     memcpy(p, "RIFF", 4);
     p += 4;
-    memcpy(p, &file_size, 4);
-    p += 4;
+    wav_write_u32le(p, file_size);
     memcpy(p, "WAVE", 4);
     p += 4;
+
     memcpy(p, "fmt ", 4);
     p += 4;
-    int   fmt_size = 16;
-    short fmt_tag  = 1;
-    short nc       = (short) n_channels;
-    short ba       = (short) block_align;
-    short bp       = (short) bits;
-    memcpy(p, &fmt_size, 4);
+    wav_write_u32le(p, 16);  // fmt chunk payload size
+    wav_write_u16le(p, fmt_tag);
+    wav_write_u16le(p, (std::uint16_t) n_channels);
+    wav_write_u32le(p, (std::uint32_t) sr);
+    wav_write_u32le(p, byte_rate);
+    wav_write_u16le(p, block_align);
+    wav_write_u16le(p, (std::uint16_t) bits);
+
+    memcpy(p, "data", 4);
     p += 4;
-    memcpy(p, &fmt_tag, 2);
-    p += 2;
-    memcpy(p, &nc, 2);
-    p += 2;
-    memcpy(p, &sr, 4);
+    wav_write_u32le(p, data_size);  // size of the sample bytes the caller appends after this header
+}
+
+inline void wav_write_header_extensible_pcm_s24(  // emits the 68-byte extensible-format header for 24-bit stereo at p
+        char *& p,         // write cursor; left just past the header
+        int T_audio,       // samples per channel
+        int sr,            // sample rate written to the header
+        int n_channels) {  // must be 2 (asserted); the channel mask written below is fixed
+    assert(T_audio >= 0);
+    assert(sr > 0);
+    assert(n_channels == 2);
+
+    const int bits = 24;
+    const std::uint32_t bytes_per_sample = 3;
+    const std::uint32_t byte_rate        = (std::uint32_t) sr * (std::uint32_t) n_channels * bytes_per_sample;
+    const std::uint16_t block_align      = (std::uint16_t) (n_channels * (int) bytes_per_sample);
+    const std::uint32_t data_size        = (std::uint32_t) T_audio * (std::uint32_t) n_channels * bytes_per_sample;
+    const std::uint32_t file_size        = 60 + data_size;  // RIFF size field: 68-byte header minus its first 8 bytes, plus samples
+
+    memcpy(p, "RIFF", 4);
     p += 4;
-    memcpy(p, &byte_rate, 4);
+    wav_write_u32le(p, file_size);
+    memcpy(p, "WAVE", 4);
     p += 4;
-    memcpy(p, &ba, 2);
-    p += 2;
-    memcpy(p, &bp, 2);
-    p += 2;
-    memcpy(p, "data", 4);
+
+    memcpy(p, "fmt ", 4);
     p += 4;
-    memcpy(p, &data_size, 4);
+    wav_write_u32le(p, 40);      // fmt chunk payload size: 16 basic bytes + 2 + 22 extension bytes
+    wav_write_u16le(p, 0xfffe);  // format tag for the extensible layout (read_wav_buf checks the same value)
+    wav_write_u16le(p, (std::uint16_t) n_channels);
+    wav_write_u32le(p, (std::uint32_t) sr);
+    wav_write_u32le(p, byte_rate);
+    wav_write_u16le(p, block_align);
+    wav_write_u16le(p, (std::uint16_t) bits);
+    wav_write_u16le(p, 22);                    // cbSize: bytes of extension that follow
+    wav_write_u16le(p, (std::uint16_t) bits);  // wValidBitsPerSample
+    wav_write_u32le(p, 0x00000003u);           // dwChannelMask: the two lowest speaker bits
+    wav_write_guid_pcm(p);                     // 16-byte sub-format GUID
+
+    memcpy(p, "data", 4);
     p += 4;
+    wav_write_u32le(p, data_size);
+}
+
+inline std::string audio_encode_wav_pcm_s16(const float * audio, int T_audio, int sr) {  // planar stereo float -> 16-bit integer WAV image
+    const int n_channels = 2;
+    const int bits       = 16;
 
-    // interleave planar float to PCM int16
-    const float * L   = audio;
-    const float * R   = audio + T_audio;
-    short *       pcm = (short *) p;
-    for (int t = 0; t < T_audio; t++) {
-        pcm[t * 2 + 0] = (short) (L[t] * 32767.0f);
-        pcm[t * 2 + 1] = (short) (R[t] * 32767.0f);
+    assert(audio != nullptr);
+    assert(T_audio >= 0);
+    assert(sr > 0);
+
+    const std::uint32_t data_size = (std::uint32_t) T_audio * n_channels * (bits / 8);
+
+    std::string out;
+    out.resize(44 + (size_t) data_size);  // 44-byte header followed by the sample bytes
+
+    char * p = out.data();  // write cursor; every helper below advances it
+    wav_write_header_basic(p, T_audio, sr, n_channels, bits, 1);  // format tag 1 = integer PCM
+
+    const float * L = audio;            // left plane: first T_audio samples
+    const float * R = audio + T_audio;  // right plane follows immediately
+
+    for (int t = 0; t < T_audio; ++t) {
+        wav_write_s16le(p, wav_quantize_s16(L[t]));  // interleave: left sample of frame t
+        wav_write_s16le(p, wav_quantize_s16(R[t]));  // then its right sample
     }
 
     return out;
 }
 
+inline std::string audio_encode_wav_pcm_s24(const float * audio, int T_audio, int sr) {
+    // Encodes planar stereo float audio ([L0..LN, R0..RN]) as a 24-bit
+    // signed-integer WAV image: 68-byte extensible header + interleaved frames.
+    assert(audio != nullptr);
+    assert(T_audio >= 0);
+    assert(sr > 0);
+
+    const int           channel_count   = 2;
+    const int           bytes_per_frame = channel_count * (24 / 8);
+    const std::uint32_t payload_bytes   = (std::uint32_t) T_audio * bytes_per_frame;
+
+    std::string wav(68 + (size_t) payload_bytes, '\0');
+    char *      cursor = &wav[0];
+    wav_write_header_extensible_pcm_s24(cursor, T_audio, sr, channel_count);
+
+    const float * const right_plane = audio + T_audio;  // right channel follows the left plane
+    for (int frame = 0; frame < T_audio; ++frame) {
+        const std::int32_t left_q  = wav_quantize_s24(audio[frame]);
+        const std::int32_t right_q = wav_quantize_s24(right_plane[frame]);
+
+        wav_write_s24le(cursor, left_q);
+        wav_write_s24le(cursor, right_q);
+    }
+
+    return wav;
+}
+
+inline std::string audio_encode_wav_ieee_f32(const float * audio, int T_audio, int sr) {
+    // Encodes planar stereo float audio ([L0..LN, R0..RN]) as a 32-bit float
+    // WAV image: 44-byte header (format tag 3) + interleaved, unclamped frames.
+    assert(audio != nullptr);
+    assert(T_audio >= 0);
+    assert(sr > 0);
+
+    const int           channel_count = 2;
+    const int           sample_bits   = 32;
+    const std::uint32_t payload_bytes = (std::uint32_t) T_audio * (channel_count * (sample_bits / 8));
+
+    std::string wav(44 + (size_t) payload_bytes, '\0');
+    char *      cursor = &wav[0];
+    wav_write_header_basic(cursor, T_audio, sr, channel_count, sample_bits, 3);
+
+    const float * const right_plane = audio + T_audio;  // right channel follows the left plane
+    for (int frame = 0; frame < T_audio; ++frame) {
+        const float left  = audio[frame];
+        const float right = right_plane[frame];
+
+        wav_write_f32le(cursor, left);
+        wav_write_f32le(cursor, right);
+    }
+
+    return wav;
+}
+
+enum AudioFileKind {  // container family of an output file, independent of sample encoding
+    AUDIO_FILE_KIND_MP3,
+    AUDIO_FILE_KIND_WAV,  // covers every WAV sample format in AudioFileFormat
+};
+
+enum AudioFileFormat {  // concrete output encoding; parse_audio_file_format maps option strings to these
+    AUDIO_FILE_FORMAT_MP3,           // "mp3"
+    AUDIO_FILE_FORMAT_WAV_S16,       // "wav" / "wav16": 16-bit signed-integer WAV
+    AUDIO_FILE_FORMAT_WAV_S24,       // "wav24": 24-bit signed-integer WAV
+    AUDIO_FILE_FORMAT_WAV_IEEE_F32,  // "wav32": 32-bit float WAV; the only format should_normalize_audio rejects
+};
+
+enum WavFormat {  // sample encoding selector for audio_encode_wav / audio_write_wav
+    WAV_FORMAT_S16,       // 16-bit signed integer
+    WAV_FORMAT_S24,       // 24-bit signed integer
+    WAV_FORMAT_IEEE_F32,  // 32-bit IEEE-754 float
+};
+
+inline AudioFileKind convert_audio_file_format_to_kind(AudioFileFormat audio_file_format) {
+    // Collapses the concrete encoding to its container family (MP3 vs WAV).
+    switch (audio_file_format) {
+        case AUDIO_FILE_FORMAT_MP3:
+            return AUDIO_FILE_KIND_MP3;
+        case AUDIO_FILE_FORMAT_WAV_S16:
+        case AUDIO_FILE_FORMAT_WAV_S24:
+        case AUDIO_FILE_FORMAT_WAV_IEEE_F32:
+            return AUDIO_FILE_KIND_WAV;
+    }
+
+    // Only reachable with a value outside the enumerators (e.g. a bad cast).
+    assert(false && "unsupported AudioFileFormat");
+    std::terminate();
+}
+
+inline bool convert_audio_file_format_to_wav_format(AudioFileFormat audio_file_format, WavFormat& out) {
+    // Translates a WAV-flavoured AudioFileFormat into the matching WavFormat.
+    // Returns true and stores the result in 'out' on success. MP3 has no WAV
+    // counterpart: it returns false and leaves 'out' untouched, which lets
+    // callers pre-load 'out' with a fallback.
+    switch (audio_file_format) {
+        case AUDIO_FILE_FORMAT_MP3:
+            return false;
+        case AUDIO_FILE_FORMAT_WAV_S16:      out = WAV_FORMAT_S16;      return true;
+        case AUDIO_FILE_FORMAT_WAV_S24:      out = WAV_FORMAT_S24;      return true;
+        case AUDIO_FILE_FORMAT_WAV_IEEE_F32: out = WAV_FORMAT_IEEE_F32; return true;
+    }
+
+    // Falling out of the switch means the argument is not a valid enumerator;
+    // the assert reports it in debug builds and terminate stops NDEBUG builds.
+    assert(false && "unsupported AudioFileFormat");
+    std::terminate();
+}
+
+inline AudioFileFormat convert_wav_format_to_audio_file_format(WavFormat wav_format) {
+    // Widens a WavFormat to the AudioFileFormat naming the same sample encoding.
+    switch (wav_format) {
+        case WAV_FORMAT_S16:      return AUDIO_FILE_FORMAT_WAV_S16;
+        case WAV_FORMAT_S24:      return AUDIO_FILE_FORMAT_WAV_S24;
+        case WAV_FORMAT_IEEE_F32: return AUDIO_FILE_FORMAT_WAV_IEEE_F32;
+    }
+
+    // Falling out of the switch means wav_format is not a valid enumerator:
+    // debug builds trip the assert, builds with NDEBUG go straight to terminate.
+    assert(false && "unsupported WavFormat");
+    std::terminate();
+}
+
+static bool parse_audio_file_format(const char* s, AudioFileFormat& out) {
+    // Parses a case-sensitive format name (as accepted by --output) into 'out'.
+    // Returns false, leaving 'out' unchanged, when s is null or not recognised.
+    static const struct { const char * name; AudioFileFormat format; } known_formats[] = {
+        { "mp3",   AUDIO_FILE_FORMAT_MP3          },
+        { "wav",   AUDIO_FILE_FORMAT_WAV_S16      },  // alias of wav16
+        { "wav16", AUDIO_FILE_FORMAT_WAV_S16      },
+        { "wav24", AUDIO_FILE_FORMAT_WAV_S24      },
+        { "wav32", AUDIO_FILE_FORMAT_WAV_IEEE_F32 },
+    };
+
+    if (s == nullptr) {
+        return false;
+    }
+
+    for (const auto & entry : known_formats) {
+        if (std::strcmp(s, entry.name) == 0) {
+            out = entry.format;
+            return true;
+        }
+    }
+    return false;
+}
+
+static bool parse_wav_format(const char* s, WavFormat& out) {
+    // Parses a case-sensitive WAV format name into 'out'; unknown names give false.
+    // A null 's' returns true WITHOUT writing 'out' (caller's preset survives).
+    // NOTE(review): parse_audio_file_format rejects null instead - confirm intent.
+    static const struct { const char * name; WavFormat format; } known_formats[] = {
+        { "wav",   WAV_FORMAT_S16      },  // alias of wav16
+        { "wav16", WAV_FORMAT_S16      },
+        { "wav24", WAV_FORMAT_S24      },
+        { "wav32", WAV_FORMAT_IEEE_F32 },
+    };
+    if (!s) {
+        return true;
+    }
+    for (const auto & entry : known_formats) {
+        if (std::strcmp(s, entry.name) == 0) {
+            out = entry.format;
+            return true;
+        }
+    }
+    return false;
+}
+
+inline bool should_normalize_audio(AudioFileFormat audio_file_format) {
+    // Tells callers whether to run audio_normalize before encoding: every
+    // format except 32-bit float WAV is normalized.
+    switch (audio_file_format) {
+        case AUDIO_FILE_FORMAT_MP3:
+        case AUDIO_FILE_FORMAT_WAV_S16:
+        case AUDIO_FILE_FORMAT_WAV_S24:
+            return true;
+        case AUDIO_FILE_FORMAT_WAV_IEEE_F32:
+            return false;
+    }
+
+    assert(false && "unsupported AudioFileFormat");
+    std::terminate();
+}
+
+// audio_encode_wav is the core: it hands planar stereo float audio to the
+//   in-memory WAV encoder selected by wav_format.
+//   WAV_FORMAT_S16      - classic RIFF header, little-endian 16-bit integer samples
+//   WAV_FORMAT_S24      - extensible header, little-endian 24-bit integer samples
+//   WAV_FORMAT_IEEE_F32 - classic RIFF header, little-endian IEEE-754 float samples
+// audio is planar [L0..LN, R0..RN], pre-normalized by caller.
+// T_audio is the number of samples per channel; sr is the sample rate.
+// Does NOT normalize - caller is responsible (audio_write does it).
+// NaN, -Inf, and +Inf are automatically coerced to zero.
+// 16-bit and 24-bit signed-integer output is automatically clamped to -1/+1 range.
+// 32-bit floating-point output may exceed -1/+1 range.
+// The result always holds at least the header, so it is never empty.
+// An out-of-range wav_format does not return: assert, then std::terminate.
+inline std::string audio_encode_wav(const float * audio, int T_audio, int sr, WavFormat wav_format) {
+    switch (wav_format) {
+        case WAV_FORMAT_S16:      return audio_encode_wav_pcm_s16(audio, T_audio, sr);
+        case WAV_FORMAT_S24:      return audio_encode_wav_pcm_s24(audio, T_audio, sr);
+        case WAV_FORMAT_IEEE_F32: return audio_encode_wav_ieee_f32(audio, T_audio, sr);
+    }
+
+    // Reached only when wav_format is not one of the enumerators above.
+    assert(false && "unsupported WavFormat");
+    std::terminate();
+}
+
 // Write planar stereo audio to WAV file. Thin wrapper around audio_encode_wav.
-static bool audio_write_wav(const char * path, const float * audio, int T_audio, int sr) {
-    std::string wav = audio_encode_wav(audio, T_audio, sr);
+static bool audio_write_wav(const char * path, const float * audio, int T_audio, int sr, WavFormat wav_format) {
+    std::string wav = audio_encode_wav(audio, T_audio, sr, wav_format);
     if (wav.empty()) {
         return false;
     }
@@ -651,11 +976,20 @@ static bool audio_write_mp3(const char * path, const float * audio, int T_audio,
 // .mp3 -> MP3 encoding at the given kbps (default 128).
 // .wav (or anything else) -> WAV 16-bit PCM.
 // Normalizes in place before writing (single normalization point).
-static bool audio_write(const char * path, float * audio, int T_audio, int sr, int kbps, int peak_clip = 10) {
-    audio_normalize(audio, T_audio * 2, peak_clip);
+static bool audio_write(const char * path, float * audio, int T_audio, int sr, int kbps, AudioFileFormat audio_file_format, int peak_clip = 10) {
+    const bool write_mp3 = audio_io_ends_with(path, ".mp3");  // the path extension, not audio_file_format, selects the MP3 encoder
 
-    if (audio_io_ends_with(path, ".mp3")) {
+    if (write_mp3 || should_normalize_audio(audio_file_format)) {  // MP3 is always normalized; WAV only when its format asks for it
+        audio_normalize(audio, T_audio * 2, peak_clip);
+    }
+
+    if (write_mp3) {
         return audio_write_mp3(path, audio, T_audio, sr, (kbps > 0) ? kbps : 128);
     }
-    return audio_write_wav(path, audio, T_audio, sr);
+
+    WavFormat wav_format = WAV_FORMAT_S16;  // kept as-is when the conversion below fails (MP3 format with a non-.mp3 path)
+    if (!convert_audio_file_format_to_wav_format(audio_file_format, wav_format)) {
+        // TODO Serveurperso - implement preferred conversion failure handling
+    }
+    return audio_write_wav(path, audio, T_audio, sr, wav_format);
 }
diff --git a/src/wav.h b/src/wav.h
index 4898559..ba4be84 100644
--- a/src/wav.h
+++ b/src/wav.h
@@ -10,6 +10,36 @@
 #include <cstring>
 #include <vector>
 
+static uint16_t wav_read_u16le(const uint8_t * p) {
+    return static_cast<uint16_t>((unsigned) p[0] | ((unsigned) p[1] << 8));  // little-endian: p[0] is the low byte
+}
+
+static uint32_t wav_read_u32le(const uint8_t * p) {
+    // Assemble four bytes, least-significant first, independent of host byte order.
+    uint32_t value = 0;
+    for (int i = 3; i >= 0; i--) { value = (value << 8) | p[i]; }
+    return value;
+}
+
+static int32_t wav_read_s24le(const uint8_t * p) {
+    // Decode a little-endian 24-bit two's-complement sample into int32_t.
+    uint32_t raw = 0;
+    for (int i = 2; i >= 0; i--) {
+        raw = (raw << 8) | p[i];
+    }
+
+    // Bit 23 is the sign bit: when set, fill bits 24..31 so the value stays negative.
+    const uint32_t sign_fill = (raw & 0x00800000u) ? 0xff000000u : 0u;
+    return (int32_t) (raw | sign_fill);
+}
+
+static float wav_read_f32le(const uint8_t * p) {
+    const uint32_t bits   = wav_read_u32le(p);  // four little-endian bytes -> host-order integer
+    float          sample = 0.0f;
+    memcpy(&sample, &bits, sizeof(bits));       // copy the bit pattern into a float
+    return sample;
+}
+
 // Read WAV from memory buffer.
 // Returns interleaved float [T, 2]. Sets *T_audio, *sr. Caller frees.
 static float * read_wav_buf(const uint8_t * data, size_t size, int * T_audio, int * sr) {
@@ -21,71 +51,110 @@ static float * read_wav_buf(const uint8_t * data, size_t size, int * T_audio, in
         return NULL;
     }
 
-    int     n_channels = 0, sample_rate = 0, bits_per_sample = 0;
-    short   audio_format = 0;
-    float * audio        = NULL;
-    int     n_samples    = 0;
-    size_t  pos          = 12;
+    int      n_channels = 0;
+    int      sample_rate = 0;
+    int      bits_per_sample = 0;
+    uint16_t audio_format = 0;
+    uint16_t extensible_subformat = 0;
+    float *  audio = NULL;
+    int      n_samples = 0;
+    size_t   pos = 12;
 
     while (pos + 8 <= size) {
         const uint8_t * chunk_id   = data + pos;
-        int             chunk_size = 0;
-        memcpy(&chunk_size, data + pos + 4, 4);
+        uint32_t        chunk_size = wav_read_u32le(data + pos + 4);
         pos += 8;
 
-        if (memcmp(chunk_id, "fmt ", 4) == 0 && pos + 16 <= size) {
-            memcpy(&audio_format, data + pos, 2);
-            short nc;
-            memcpy(&nc, data + pos + 2, 2);
-            n_channels = nc;
-            memcpy(&sample_rate, data + pos + 4, 4);
-            // skip byte_rate(4) + block_align(2)
-            short bps;
-            memcpy(&bps, data + pos + 14, 2);
-            bits_per_sample = bps;
+        if (pos + (size_t) chunk_size > size) {
+            chunk_size = (uint32_t) (size - pos);
+        }
+
+        if (memcmp(chunk_id, "fmt ", 4) == 0 && chunk_size >= 16) {
+            audio_format    = wav_read_u16le(data + pos + 0);
+            n_channels      = (int) wav_read_u16le(data + pos + 2);
+            sample_rate     = (int) wav_read_u32le(data + pos + 4);
+            bits_per_sample = (int) wav_read_u16le(data + pos + 14);
+
+            extensible_subformat = 0;
+            if (audio_format == 0xfffe && chunk_size >= 40) {
+                extensible_subformat = wav_read_u16le(data + pos + 24);
+            }
+
             pos += (size_t) chunk_size;
 
         } else if (memcmp(chunk_id, "data", 4) == 0 && n_channels > 0) {
             size_t data_bytes = (size_t) chunk_size;
-            if (pos + data_bytes > size) {
-                data_bytes = size - pos;
-            }
 
             if (audio_format == 1 && bits_per_sample == 16) {
                 n_samples         = (int) (data_bytes / ((size_t) n_channels * 2));
                 audio             = (float *) malloc((size_t) n_samples * 2 * sizeof(float));
-                const short * pcm = (const short *) (data + pos);
+                const uint8_t * p = data + pos;
+
                 for (int t = 0; t < n_samples; t++) {
                     if (n_channels == 1) {
-                        float s          = (float) pcm[t] / 32768.0f;
-                        audio[t * 2 + 0] = s;
-                        audio[t * 2 + 1] = s;
+                        int16_t s = (int16_t) wav_read_u16le(p + t * 2);
+                        float   f = (float) s / 32768.0f;
+                        audio[t * 2 + 0] = f;
+                        audio[t * 2 + 1] = f;
                     } else {
-                        audio[t * 2 + 0] = (float) pcm[t * n_channels + 0] / 32768.0f;
-                        audio[t * 2 + 1] = (float) pcm[t * n_channels + 1] / 32768.0f;
+                        const uint8_t * frame = p + (size_t) t * n_channels * 2;
+                        int16_t         l     = (int16_t) wav_read_u16le(frame + 0);
+                        int16_t         r     = (int16_t) wav_read_u16le(frame + 2);
+                        audio[t * 2 + 0] = (float) l / 32768.0f;
+                        audio[t * 2 + 1] = (float) r / 32768.0f;
+                    }
+                }
+            } else if (audio_format == 0xfffe && bits_per_sample == 24 && extensible_subformat == 1) {
+                n_samples         = (int) (data_bytes / ((size_t) n_channels * 3));
+                audio             = (float *) malloc((size_t) n_samples * 2 * sizeof(float));
+                const uint8_t * p = data + pos;
+
+                for (int t = 0; t < n_samples; t++) {
+                    if (n_channels == 1) {
+                        int32_t s = wav_read_s24le(p + t * 3);
+                        float   f = (float) s / 8388608.0f;
+                        audio[t * 2 + 0] = f;
+                        audio[t * 2 + 1] = f;
+                    } else {
+                        const uint8_t * frame = p + (size_t) t * n_channels * 3;
+                        int32_t         l     = wav_read_s24le(frame + 0);
+                        int32_t         r     = wav_read_s24le(frame + 3);
+                        audio[t * 2 + 0] = (float) l / 8388608.0f;
+                        audio[t * 2 + 1] = (float) r / 8388608.0f;
                     }
                 }
             } else if (audio_format == 3 && bits_per_sample == 32) {
-                n_samples          = (int) (data_bytes / ((size_t) n_channels * 4));
-                audio              = (float *) malloc((size_t) n_samples * 2 * sizeof(float));
-                const float * fbuf = (const float *) (data + pos);
+                n_samples         = (int) (data_bytes / ((size_t) n_channels * 4));
+                audio             = (float *) malloc((size_t) n_samples * 2 * sizeof(float));
+                const uint8_t * p = data + pos;
+
                 for (int t = 0; t < n_samples; t++) {
                     if (n_channels == 1) {
-                        audio[t * 2 + 0] = fbuf[t];
-                        audio[t * 2 + 1] = fbuf[t];
+                        float s = wav_read_f32le(p + t * 4);
+                        audio[t * 2 + 0] = s;
+                        audio[t * 2 + 1] = s;
                     } else {
-                        audio[t * 2 + 0] = fbuf[t * n_channels + 0];
-                        audio[t * 2 + 1] = fbuf[t * n_channels + 1];
+                        const uint8_t * frame = p + (size_t) t * n_channels * 4;
+                        float           l     = wav_read_f32le(frame + 0);
+                        float           r     = wav_read_f32le(frame + 4);
+                        audio[t * 2 + 0] = l;
+                        audio[t * 2 + 1] = r;
                     }
                 }
             } else {
-                fprintf(stderr, "[WAV] Unsupported: format=%d bits=%d\n", audio_format, bits_per_sample);
+                fprintf(stderr, "[WAV] Unsupported: format=%u bits=%d subformat=%u\n",
+                        (unsigned) audio_format, bits_per_sample, (unsigned) extensible_subformat);
                 return NULL;
             }
+
             break;
         } else {
             pos += (size_t) chunk_size;
         }
+
+        if (chunk_size & 1) {
+            pos += 1;
+        }
     }
 
     if (!audio) {
@@ -95,7 +164,7 @@ static float * read_wav_buf(const uint8_t * data, size_t size, int * T_audio, in
 
     *T_audio = n_samples;
     *sr      = sample_rate;
-    fprintf(stderr, "[WAV] Read buffer: %d samples, %d Hz, %d ch, %d bit\n", n_samples, sample_rate, n_channels,
-            bits_per_sample);
+    fprintf(stderr, "[WAV] Read buffer: %d samples, %d Hz, %d ch, %d bit\n",
+            n_samples, sample_rate, n_channels, bits_per_sample);
     return audio;
 }
diff --git a/tools/ace-server.cpp b/tools/ace-server.cpp
index efd6cfd..9993f95 100644
--- a/tools/ace-server.cpp
+++ b/tools/ace-server.cpp
@@ -190,9 +190,9 @@ static AceSynthParams      g_synth_params;
 static AceUnderstandParams g_und_params;
 
 // limits
-static int  g_max_batch   = 1;
-static int  g_mp3_kbps    = 128;
-static bool g_keep_loaded = false;
+static int       g_max_batch   = 1;
+static int       g_mp3_kbps    = 128;
+static bool      g_keep_loaded = false;
 
 // job system: all compute endpoints create a job and return its ID
 // immediately. the worker thread processes jobs in FIFO order, stores
@@ -836,9 +836,22 @@ static void synth_worker(std::shared_ptr<Job>    job,
         if (!audio[b].samples) {
             continue;
         }
-        audio_normalize(audio[b].samples, audio[b].n_samples * 2, peak_clip);
+
+        // TODO Serveurperso - wire AudioFileFormat into http/json
+        const AudioFileFormat audio_file_format = output_wav
+            ? AUDIO_FILE_FORMAT_WAV_S16
+            : AUDIO_FILE_FORMAT_MP3;
+
+        if (should_normalize_audio(audio_file_format)) {
+            audio_normalize(audio[b].samples, audio[b].n_samples * 2, peak_clip);
+        }
+
         if (output_wav) {
-            encoded[b] = audio_encode_wav(audio[b].samples, audio[b].n_samples, 48000);
+            WavFormat wav_format = WAV_FORMAT_S16;
+            if (!convert_audio_file_format_to_wav_format(audio_file_format, wav_format)) {
+                // TODO Serveurperso - implement preferred conversion failure handling
+            }
+            encoded[b] = audio_encode_wav(audio[b].samples, audio[b].n_samples, 48000, wav_format);
         } else {
             encoded[b] = audio_encode_mp3(audio[b].samples, audio[b].n_samples, 48000, g_mp3_kbps, server_cancel_job,
                                           (void *) &job->cancel);
@@ -1240,10 +1253,10 @@ int main(int argc, char ** argv) {
     ace_lm_default_params(&g_lm_params);
     ace_synth_default_params(&g_synth_params);
 
-    const char * host       = "127.0.0.1";
-    int          port       = 8080;
-    const char * models_dir = nullptr;
-    const char * loras_dir  = nullptr;
+    const char * host           = "127.0.0.1";
+    int          port           = 8080;
+    const char * models_dir     = nullptr;
+    const char * loras_dir      = nullptr;
 
     if (argc < 2) {
         usage(argv[0]);
diff --git a/tools/ace-synth.cpp b/tools/ace-synth.cpp
index e83d848..ee9c8e4 100644
--- a/tools/ace-synth.cpp
+++ b/tools/ace-synth.cpp
@@ -30,8 +30,9 @@ static void usage(const char * prog) {
             "  --lora-scale <float>    LoRA scaling factor (default: 1.0)\n\n"
             "Output:\n"
             "  Default: MP3 at 128 kbps. input.json -> input0.mp3, input1.mp3, ...\n"
+            "  --output <format>       Output audio file format (default: mp3)\n"
             "  --mp3-bitrate <kbps>    MP3 bitrate (default: 128)\n"
-            "  --wav                   Output WAV instead of MP3\n\n"
+            "\n"
             "Memory control:\n"
             "  --vae-chunk <N>         Latent frames per tile (default: 256)\n"
             "  --vae-overlap <N>       Overlap frames per side (default: 64)\n\n"
@@ -50,21 +51,22 @@ int main(int argc, char ** argv) {
     }
 
     std::vector<const char *> request_paths;
-    const char *              text_enc_gguf  = NULL;
-    const char *              dit_gguf       = NULL;
-    const char *              vae_gguf       = NULL;
-    const char *              src_audio_path = NULL;
-    const char *              ref_audio_path = NULL;
-    const char *              dump_dir       = NULL;
-    const char *              lora_path      = NULL;
-    float                     lora_scale     = 1.0f;
-    bool                      use_fa         = true;
-    bool                      use_batch_cfg  = true;
-    bool                      clamp_fp16     = false;
-    int                       vae_chunk      = 256;
-    int                       vae_overlap    = 64;
-    bool                      output_wav     = false;  // default MP3, --wav forces WAV
-    int                       mp3_kbps       = 128;
+    const char *              text_enc_gguf         = NULL;
+    const char *              dit_gguf              = NULL;
+    const char *              vae_gguf              = NULL;
+    const char *              src_audio_path        = NULL;
+    const char *              ref_audio_path        = NULL;
+    const char *              dump_dir              = NULL;
+    const char *              lora_path             = NULL;
+    float                     lora_scale            = 1.0f;
+    bool                      use_fa                = true;
+    bool                      use_batch_cfg         = true;
+    bool                      clamp_fp16            = false;
+    int                       vae_chunk             = 256;
+    int                       vae_overlap           = 64;
+    const char *              audio_file_format_str = nullptr;
+    AudioFileFormat           audio_file_format     = AUDIO_FILE_FORMAT_MP3;
+    int                       mp3_kbps              = 128;
 
     for (int i = 1; i < argc; i++) {
         if (!strcmp(argv[i], "--request")) {
@@ -98,8 +100,8 @@ int main(int argc, char ** argv) {
             vae_chunk = atoi(argv[++i]);
         } else if (!strcmp(argv[i], "--vae-overlap") && i + 1 < argc) {
             vae_overlap = atoi(argv[++i]);
-        } else if (!strcmp(argv[i], "--wav")) {
-            output_wav = true;
+        } else if (!strcmp(argv[i], "--output") && i + 1 < argc) {
+            audio_file_format_str = argv[++i];
         } else if (!strcmp(argv[i], "--mp3-bitrate") && i + 1 < argc) {
             mp3_kbps = atoi(argv[++i]);
         } else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) {
@@ -127,6 +129,11 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
         return 1;
     }
+    if (audio_file_format_str != nullptr && !parse_audio_file_format(audio_file_format_str, audio_file_format)) {
+        fprintf(stderr, "[CLI] ERROR: --output requires a supported value\n");
+        usage(argv[0]);
+        return 1;
+    }
 
     // Load models
     AceSynthParams params;
@@ -284,10 +291,11 @@ int main(int argc, char ** argv) {
         if (!all_audio[b].samples) {
             continue;
         }
-        const char * ext = output_wav ? ".wav" : ".mp3";
+        const AudioFileKind audio_file_kind = convert_audio_file_format_to_kind(audio_file_format);
+        const char * ext = audio_file_kind == AUDIO_FILE_KIND_WAV ? ".wav" : ".mp3";
         char         out_path[1024];
         snprintf(out_path, sizeof(out_path), "%s%d%s", all_basenames[b].c_str(), all_synth_indices[b], ext);
-        if (!audio_write(out_path, all_audio[b].samples, all_audio[b].n_samples, 48000, mp3_kbps)) {
+        if (!audio_write(out_path, all_audio[b].samples, all_audio[b].n_samples, 48000, mp3_kbps, audio_file_format)) {
             fprintf(stderr, "[Ace-Synth Batch%d] FATAL: failed to write %s\n", b, out_path);
         }
         ace_audio_free(&all_audio[b]);
diff --git a/tools/mp3-codec.cpp b/tools/mp3-codec.cpp
index c673a2f..0ef024d 100644
--- a/tools/mp3-codec.cpp
+++ b/tools/mp3-codec.cpp
@@ -41,9 +41,10 @@ int main(int argc, char ** argv) {
         fprintf(stderr,
                 "Usage: %s -i <input> -o <output> [options]\n"
                 "\n"
-                "  -i <path>   Input file (WAV or MP3)\n"
-                "  -o <path>   Output file (WAV or MP3)\n"
-                "  -b <kbps>   Bitrate for MP3 encoding (default: 128)\n"
+                "  -i <path>          Input file (WAV or MP3)\n"
+                "  -o <path>          Output file (WAV or MP3)\n"
+                "  -b <kbps>          Bitrate for MP3 encoding (default: 128)\n"
+                "  --wav-format <fmt> WAV audio format (default: wav16)\n"
                 "\n"
                 "Mode is auto-detected from output extension.\n"
                 "\n"
@@ -58,6 +59,8 @@ int main(int argc, char ** argv) {
     const char * input   = NULL;
     const char * output  = NULL;
     int          bitrate = 128;
+    const char * wav_format_str = nullptr;
+    WavFormat    wav_format     = WAV_FORMAT_S16;
 
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) {
@@ -66,6 +69,8 @@ int main(int argc, char ** argv) {
             output = argv[++i];
         } else if (strcmp(argv[i], "-b") == 0 && i + 1 < argc) {
             bitrate = atoi(argv[++i]);
+        } else if (strcmp(argv[i], "--wav-format") == 0 && i + 1 < argc) {
+            wav_format_str = argv[++i];
         } else {
             fprintf(stderr, "[MP3-Codec] Unknown option: %s\n", argv[i]);
             return 1;
@@ -76,6 +81,14 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "[MP3-Codec] Both -i and -o are required\n");
         return 1;
     }
+    if (!ends_with(output, ".wav") && wav_format_str != nullptr) {
+        fprintf(stderr, "[MP3-Codec] ERROR: --wav-format requires .wav output file\n");
+        return 1;
+    }
+    if (wav_format_str != nullptr && !parse_wav_format(wav_format_str, wav_format)) {  // flag is optional
+        fprintf(stderr, "[MP3-Codec] ERROR: --wav-format requires a supported value\n");
+        return 1;
+    }
 
     // read input (WAV or MP3, auto-detected)
     int     T = 0, sr = 0;
@@ -89,7 +102,7 @@ int main(int argc, char ** argv) {
     if (ends_with(output, ".mp3")) {
         ok = audio_write_mp3(output, audio, T, sr, bitrate);
     } else if (ends_with(output, ".wav")) {
-        ok = audio_write_wav(output, audio, T, sr);
+        ok = audio_write_wav(output, audio, T, sr, wav_format);
     } else {
         fprintf(stderr, "[MP3-Codec] Cannot determine format from output extension\n");
         fprintf(stderr, "  use .mp3 for encoding, .wav for decoding\n");
diff --git a/tools/neural-codec.cpp b/tools/neural-codec.cpp
index 698b3da..c6e9387 100644
--- a/tools/neural-codec.cpp
+++ b/tools/neural-codec.cpp
@@ -316,6 +316,7 @@ static void print_usage(const char * prog) {
             "  -o <path>               Output file (auto-named if omitted)\n"
             "  --q8                    Quantize latent to int8 (~13 kbit/s)\n"
-            "  --q4                    Quantize latent to int4 (~6.8 kbit/s)\n\n"
+            "  --q4                    Quantize latent to int4 (~6.8 kbit/s)\n"
+            "  --wav-format <fmt>      WAV audio format (default: wav16)\n\n"
             "Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)\n"
             "               song.latent -> song.wav\n\n"
             "Memory control:\n"
@@ -338,13 +339,15 @@ static std::string auto_output(const char * input, const char * ext) {
 }
 
 int main(int argc, char ** argv) {
-    const char * vae_path    = NULL;
-    const char * input_path  = NULL;
-    const char * output_path = NULL;
-    int          chunk_size  = 256;
-    int          overlap     = 64;
-    int          mode        = -1;  // 0 = encode, 1 = decode
-    int          quant       = 0;   // 0 = f32, 8 = q8, 4 = q4
+    const char * vae_path       = NULL;
+    const char * input_path     = NULL;
+    const char * output_path    = NULL;
+    const char * wav_format_str = nullptr;
+    WavFormat    wav_format     = WAV_FORMAT_S16;
+    int          chunk_size     = 256;
+    int          overlap        = 64;
+    int          mode           = -1;  // 0 = encode, 1 = decode
+    int          quant          = 0;   // 0 = f32, 8 = q8, 4 = q4
 
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "--vae") == 0 && i + 1 < argc) {
@@ -357,6 +360,8 @@ int main(int argc, char ** argv) {
             output_path = argv[++i];
         } else if (strcmp(argv[i], "--output") == 0 && i + 1 < argc) {
             output_path = argv[++i];
+        } else if (strcmp(argv[i], "--wav-format") == 0 && i + 1 < argc) {
+            wav_format_str = argv[++i];
         } else if (strcmp(argv[i], "--vae-chunk") == 0 && i + 1 < argc) {
             chunk_size = atoi(argv[++i]);
         } else if (strcmp(argv[i], "--vae-overlap") == 0 && i + 1 < argc) {
@@ -383,6 +388,16 @@ int main(int argc, char ** argv) {
         print_usage(argv[0]);
         return 1;
     }
+    if (mode != 1 && wav_format_str != nullptr) {
+        fprintf(stderr, "[CLI] ERROR: --wav-format requires usage of --decode\n");
+        print_usage(argv[0]);
+        return 1;
+    }
+    if (wav_format_str != nullptr && !parse_wav_format(wav_format_str, wav_format)) {  // flag is optional
+        fprintf(stderr, "[CLI] ERROR: --wav-format requires a supported value\n");
+        print_usage(argv[0]);
+        return 1;
+    }
 
     // Auto output names
     std::string out_str;
@@ -473,7 +488,8 @@ int main(int argc, char ** argv) {
             return 1;
         }
 
-        if (audio_write(output_path, audio.data(), T_audio, 48000, 0)) {
+        const AudioFileFormat audio_file_format = convert_wav_format_to_audio_file_format(wav_format);
+        if (audio_write(output_path, audio.data(), T_audio, 48000, 0, audio_file_format)) {
             fprintf(stderr, "\n[VAE] Output: %s (%d samples, %.2fs @ 48kHz)\n", output_path, T_audio,
                     (float) T_audio / 48000.0f);
         } else {