diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 86dc197..91cb844 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -342,7 +342,7 @@ EOF --embedding models/Qwen3-Embedding-0.6B-Q8_0.gguf \ --dit models/acestep-v15-base-Q8_0.gguf \ --vae models/vae-BF16.gguf \ - --wav + --output wav ``` Available track names for lego, extract, and complete: `vocals`, `backing_vocals`, @@ -711,8 +711,14 @@ LoRA: Output: Default: MP3 at 128 kbps. input.json -> input0.mp3, input1.mp3, ... + --output Output audio file format (default: mp3) + Supported values: mp3, wav, wav16, wav24, wav32 + mp3: MPEG-1 Audio Layer III encoded audio + wav/wav16: 16-bit signed-integer WAVE audio + wav24: 24-bit signed-integer WAVE audio + wav32: 32-bit IEEE floating-point WAVE audio + (wav32 disables normalization & peak clip) --mp3-bitrate MP3 bitrate (default: 128) - --wav Output WAV instead of MP3 Memory control: --vae-chunk Latent frames per tile (default: 256) @@ -923,6 +929,13 @@ Output: -o Output file (auto-named if omitted) --q8 Quantize latent to int8 (~13 kbit/s) --q4 Quantize latent to int4 (~6.8 kbit/s) + --wav-format WAV audio format (default: wav16) + Requires use of --decode + Supported values: wav, wav16, wav24, wav32 + wav/wav16: 16-bit signed-integer WAVE audio + wav24: 24-bit signed-integer WAVE audio + wav32: 32-bit IEEE floating-point WAVE audio + (wav32 disables normalization & peak clip) Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4) song.latent -> song.wav @@ -979,9 +992,16 @@ from output extension). 
``` Usage: mp3-codec -i -o [options] - -i Input file (WAV or MP3) - -o Output file (WAV or MP3) - -b Bitrate for MP3 encoding (default: 128) + -i Input file (WAV or MP3) + -o Output file (WAV or MP3) + -b Bitrate for MP3 encoding (default: 128) + --wav-format WAV audio format (default: wav16) + Requires use of -o with a .wav extension + Supported values: wav, wav16, wav24, wav32 + wav/wav16: 16-bit signed-integer WAVE audio + wav24: 24-bit signed-integer WAVE audio + wav32: 32-bit IEEE floating-point WAVE audio + (wav32 disables normalization & peak clip) Mode is auto-detected from output extension. @@ -989,6 +1009,7 @@ Examples: mp3-codec -i song.wav -o song.mp3 mp3-codec -i song.wav -o song.mp3 -b 192 mp3-codec -i song.mp3 -o song.wav + mp3-codec -i song.mp3 -o song.wav --wav-format wav32 ``` ## ace-understand reference diff --git a/examples/lego.sh b/examples/lego.sh index 491820c..98c7a63 100755 --- a/examples/lego.sh +++ b/examples/lego.sh @@ -21,7 +21,7 @@ set -eu --embedding ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ --dit ../models/acestep-v15-turbo-Q8_0.gguf \ --vae ../models/vae-BF16.gguf \ - --wav + --output wav # Phase 2: lego guitar on the generated track (base model required) ../build/ace-synth \ @@ -30,4 +30,4 @@ set -eu --embedding ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ --dit ../models/acestep-v15-base-Q8_0.gguf \ --vae ../models/vae-BF16.gguf \ - --wav + --output wav diff --git a/src/audio-io.h b/src/audio-io.h index 96b6e95..af0cb6c 100644 --- a/src/audio-io.h +++ b/src/audio-io.h @@ -6,11 +6,14 @@ // Part of acestep.cpp. MIT license. #include +#include #include #include +#include #include #include #include +#include #include #include #include @@ -355,70 +358,392 @@ static float * audio_planar_to_interleaved(const float * planar, int T) { return out; } -// audio_encode_wav is the core: encode planar stereo to WAV 16-bit PCM in memory. -// 44-byte RIFF header + interleaved int16 samples. 
-// audio is planar [L0..LN, R0..RN], pre-normalized by caller. -// Does NOT normalize - caller is responsible (audio_write does it). -// Returns empty string on failure. -static std::string audio_encode_wav(const float * audio, int T_audio, int sr) { - int n_channels = 2, bits = 16; - int byte_rate = sr * n_channels * (bits / 8); - int block_align = n_channels * (bits / 8); - int data_size = T_audio * n_channels * (bits / 8); - int file_size = 36 + data_size; +static_assert(sizeof(float) == 4, "requires 32-bit float"); +static_assert(std::numeric_limits<float>::is_iec559, "requires IEEE-754 float"); - std::string out; - out.resize(44 + (size_t) data_size); - char * p = &out[0]; +inline void wav_write_u16le(char *& p, std::uint16_t x) { + *p++ = (char) (x & 0xff); + *p++ = (char) ((x >> 8) & 0xff); +} + +inline void wav_write_u24le(char *& p, std::uint32_t x) { + *p++ = (char) (x & 0xff); + *p++ = (char) ((x >> 8) & 0xff); + *p++ = (char) ((x >> 16) & 0xff); +} + +inline void wav_write_u32le(char *& p, std::uint32_t x) { + *p++ = (char) (x & 0xff); + *p++ = (char) ((x >> 8) & 0xff); + *p++ = (char) ((x >> 16) & 0xff); + *p++ = (char) ((x >> 24) & 0xff); +} + +inline float wav_sanitize(float x) { + return std::isfinite(x) ? x : 0.0f; +} + +inline void wav_write_f32le(char *& p, float x) { + x = wav_sanitize(x); + std::uint32_t o; + std::memcpy(&o, &x, 4); + wav_write_u32le(p, o); +} + +inline float wav_clamp1(float x) { + return x < -1.0f ? -1.0f : (x > 1.0f ? 
1.0f : x); +} + +inline std::int16_t wav_quantize_s16(float x) { + x = wav_clamp1(wav_sanitize(x)); + return (std::int16_t) (x * 32767.0f); +} + +inline std::int32_t wav_quantize_s24(float x) { + x = wav_clamp1(wav_sanitize(x)); + return (std::int32_t) (x * 8388607.0f); +} + +inline void wav_write_s16le(char *& p, std::int16_t x) { + wav_write_u16le(p, (std::uint16_t) x); +} + +inline void wav_write_s24le(char *& p, std::int32_t x) { + wav_write_u24le(p, (std::uint32_t) x); +} + +inline void wav_write_guid_pcm(char *& p) { + wav_write_u32le(p, 0x00000001u); + wav_write_u16le(p, 0x0000u); + wav_write_u16le(p, 0x0010u); + *p++ = (char) 0x80; + *p++ = (char) 0x00; + *p++ = (char) 0x00; + *p++ = (char) 0xAA; + *p++ = (char) 0x00; + *p++ = (char) 0x38; + *p++ = (char) 0x9B; + *p++ = (char) 0x71; +} + +inline void wav_write_header_basic( + char *& p, + int T_audio, + int sr, + int n_channels, + int bits, + std::uint16_t fmt_tag) { + assert(T_audio >= 0); + assert(sr > 0); + assert(n_channels > 0); + assert(bits == 16 || bits == 32); + + const std::uint32_t bytes_per_sample = (std::uint32_t) bits / 8; + const std::uint32_t byte_rate = (std::uint32_t) sr * (std::uint32_t) n_channels * bytes_per_sample; + const std::uint16_t block_align = (std::uint16_t) (n_channels * (int) bytes_per_sample); + const std::uint32_t data_size = (std::uint32_t) T_audio * (std::uint32_t) n_channels * bytes_per_sample; + const std::uint32_t file_size = 36 + data_size; - // RIFF header memcpy(p, "RIFF", 4); p += 4; - memcpy(p, &file_size, 4); - p += 4; + wav_write_u32le(p, file_size); memcpy(p, "WAVE", 4); p += 4; + memcpy(p, "fmt ", 4); p += 4; - int fmt_size = 16; - short fmt_tag = 1; - short nc = (short) n_channels; - short ba = (short) block_align; - short bp = (short) bits; - memcpy(p, &fmt_size, 4); + wav_write_u32le(p, 16); + wav_write_u16le(p, fmt_tag); + wav_write_u16le(p, (std::uint16_t) n_channels); + wav_write_u32le(p, (std::uint32_t) sr); + wav_write_u32le(p, byte_rate); + 
wav_write_u16le(p, block_align); + wav_write_u16le(p, (std::uint16_t) bits); + + memcpy(p, "data", 4); p += 4; - memcpy(p, &fmt_tag, 2); - p += 2; - memcpy(p, &nc, 2); - p += 2; - memcpy(p, &sr, 4); + wav_write_u32le(p, data_size); +} + +inline void wav_write_header_extensible_pcm_s24( + char *& p, + int T_audio, + int sr, + int n_channels) { + assert(T_audio >= 0); + assert(sr > 0); + assert(n_channels == 2); + + const int bits = 24; + const std::uint32_t bytes_per_sample = 3; + const std::uint32_t byte_rate = (std::uint32_t) sr * (std::uint32_t) n_channels * bytes_per_sample; + const std::uint16_t block_align = (std::uint16_t) (n_channels * (int) bytes_per_sample); + const std::uint32_t data_size = (std::uint32_t) T_audio * (std::uint32_t) n_channels * bytes_per_sample; + const std::uint32_t file_size = 60 + data_size; + + memcpy(p, "RIFF", 4); p += 4; - memcpy(p, &byte_rate, 4); + wav_write_u32le(p, file_size); + memcpy(p, "WAVE", 4); p += 4; - memcpy(p, &ba, 2); - p += 2; - memcpy(p, &bp, 2); - p += 2; - memcpy(p, "data", 4); + + memcpy(p, "fmt ", 4); p += 4; - memcpy(p, &data_size, 4); + wav_write_u32le(p, 40); + wav_write_u16le(p, 0xfffe); + wav_write_u16le(p, (std::uint16_t) n_channels); + wav_write_u32le(p, (std::uint32_t) sr); + wav_write_u32le(p, byte_rate); + wav_write_u16le(p, block_align); + wav_write_u16le(p, (std::uint16_t) bits); + wav_write_u16le(p, 22); + wav_write_u16le(p, (std::uint16_t) bits); + wav_write_u32le(p, 0x00000003u); + wav_write_guid_pcm(p); + + memcpy(p, "data", 4); p += 4; + wav_write_u32le(p, data_size); +} + +inline std::string audio_encode_wav_pcm_s16(const float * audio, int T_audio, int sr) { + const int n_channels = 2; + const int bits = 16; - // interleave planar float to PCM int16 - const float * L = audio; - const float * R = audio + T_audio; - short * pcm = (short *) p; - for (int t = 0; t < T_audio; t++) { - pcm[t * 2 + 0] = (short) (L[t] * 32767.0f); - pcm[t * 2 + 1] = (short) (R[t] * 32767.0f); + assert(audio != 
nullptr); + assert(T_audio >= 0); + assert(sr > 0); + + const std::uint32_t data_size = (std::uint32_t) T_audio * n_channels * (bits / 8); + + std::string out; + out.resize(44 + (size_t) data_size); + + char * p = out.data(); + wav_write_header_basic(p, T_audio, sr, n_channels, bits, 1); + + const float * L = audio; + const float * R = audio + T_audio; + + for (int t = 0; t < T_audio; ++t) { + wav_write_s16le(p, wav_quantize_s16(L[t])); + wav_write_s16le(p, wav_quantize_s16(R[t])); } return out; } +inline std::string audio_encode_wav_pcm_s24(const float * audio, int T_audio, int sr) { + const int n_channels = 2; + const int bits = 24; + + assert(audio != nullptr); + assert(T_audio >= 0); + assert(sr > 0); + + const std::uint32_t data_size = (std::uint32_t) T_audio * n_channels * (bits / 8); + + std::string out; + out.resize(68 + (size_t) data_size); + + char * p = out.data(); + wav_write_header_extensible_pcm_s24(p, T_audio, sr, n_channels); + + const float * L = audio; + const float * R = audio + T_audio; + + for (int t = 0; t < T_audio; ++t) { + wav_write_s24le(p, wav_quantize_s24(L[t])); + wav_write_s24le(p, wav_quantize_s24(R[t])); + } + + return out; +} + +inline std::string audio_encode_wav_ieee_f32(const float * audio, int T_audio, int sr) { + const int n_channels = 2; + const int bits = 32; + + assert(audio != nullptr); + assert(T_audio >= 0); + assert(sr > 0); + + const std::uint32_t data_size = (std::uint32_t) T_audio * n_channels * (bits / 8); + + std::string out; + out.resize(44 + (size_t) data_size); + + char * p = out.data(); + wav_write_header_basic(p, T_audio, sr, n_channels, bits, 3); + + const float * L = audio; + const float * R = audio + T_audio; + + for (int t = 0; t < T_audio; ++t) { + wav_write_f32le(p, L[t]); + wav_write_f32le(p, R[t]); + } + + return out; +} + +enum AudioFileKind { + AUDIO_FILE_KIND_MP3, + AUDIO_FILE_KIND_WAV, +}; + +enum AudioFileFormat { + AUDIO_FILE_FORMAT_MP3, + AUDIO_FILE_FORMAT_WAV_S16, + AUDIO_FILE_FORMAT_WAV_S24, + 
AUDIO_FILE_FORMAT_WAV_IEEE_F32, +}; + +enum WavFormat { + WAV_FORMAT_S16, + WAV_FORMAT_S24, + WAV_FORMAT_IEEE_F32, +}; + +inline AudioFileKind convert_audio_file_format_to_kind(AudioFileFormat audio_file_format) { + switch (audio_file_format) { + case AUDIO_FILE_FORMAT_MP3: + return AUDIO_FILE_KIND_MP3; + case AUDIO_FILE_FORMAT_WAV_S16: + return AUDIO_FILE_KIND_WAV; + case AUDIO_FILE_FORMAT_WAV_S24: + return AUDIO_FILE_KIND_WAV; + case AUDIO_FILE_FORMAT_WAV_IEEE_F32: + return AUDIO_FILE_KIND_WAV; + } + + assert(false && "unsupported AudioFileFormat"); + std::terminate(); +} + +inline bool convert_audio_file_format_to_wav_format(AudioFileFormat audio_file_format, WavFormat& out) { + switch (audio_file_format) { + case AUDIO_FILE_FORMAT_MP3: + return false; + case AUDIO_FILE_FORMAT_WAV_S16: + out = WAV_FORMAT_S16; + return true; + case AUDIO_FILE_FORMAT_WAV_S24: + out = WAV_FORMAT_S24; + return true; + case AUDIO_FILE_FORMAT_WAV_IEEE_F32: + out = WAV_FORMAT_IEEE_F32; + return true; + } + + assert(false && "unsupported AudioFileFormat"); + std::terminate(); +} + +inline AudioFileFormat convert_wav_format_to_audio_file_format(WavFormat wav_format) { + switch (wav_format) { + case WAV_FORMAT_S16: + return AUDIO_FILE_FORMAT_WAV_S16; + case WAV_FORMAT_S24: + return AUDIO_FILE_FORMAT_WAV_S24; + case WAV_FORMAT_IEEE_F32: + return AUDIO_FILE_FORMAT_WAV_IEEE_F32; + } + + assert(false && "unsupported WavFormat"); + std::terminate(); +} + +static bool parse_audio_file_format(const char* s, AudioFileFormat& out) { + if (!s) { + return false; + } + + AudioFileFormat parsed; + if (std::strcmp(s, "mp3") == 0) { + parsed = AUDIO_FILE_FORMAT_MP3; + } else if (std::strcmp(s, "wav") == 0) { + parsed = AUDIO_FILE_FORMAT_WAV_S16; + } else if (std::strcmp(s, "wav16") == 0) { + parsed = AUDIO_FILE_FORMAT_WAV_S16; + } else if (std::strcmp(s, "wav24") == 0) { + parsed = AUDIO_FILE_FORMAT_WAV_S24; + } else if (std::strcmp(s, "wav32") == 0) { + parsed = AUDIO_FILE_FORMAT_WAV_IEEE_F32; + } else 
{ + return false; + } + + out = parsed; + return true; +} + +static bool parse_wav_format(const char* s, WavFormat& out) { + if (!s) { + return true; /* option omitted: succeed and leave the caller's default in out */ + } + + WavFormat parsed; + if (std::strcmp(s, "wav") == 0) { + parsed = WAV_FORMAT_S16; + } else if (std::strcmp(s, "wav16") == 0) { + parsed = WAV_FORMAT_S16; + } else if (std::strcmp(s, "wav24") == 0) { + parsed = WAV_FORMAT_S24; + } else if (std::strcmp(s, "wav32") == 0) { + parsed = WAV_FORMAT_IEEE_F32; + } else { + return false; + } + + out = parsed; + return true; +} + +inline bool should_normalize_audio(AudioFileFormat audio_file_format) { + switch (audio_file_format) { + case AUDIO_FILE_FORMAT_MP3: + return true; + case AUDIO_FILE_FORMAT_WAV_S16: + return true; + case AUDIO_FILE_FORMAT_WAV_S24: + return true; + case AUDIO_FILE_FORMAT_WAV_IEEE_F32: + return false; /* float WAV keeps the full sample range: no normalization or peak clip */ + } + + assert(false && "unsupported AudioFileFormat"); + std::terminate(); +} + +// audio_encode_wav is the core: encode planar stereo to WAV 16-bit signed-integer, +// 24-bit signed-integer, or 32-bit floating point PCM in memory. +// 16-bit outputs classic RIFF header and little-endian integer samples. +// 24-bit outputs extensible header and little-endian integer samples. +// 32-bit floating-point outputs classic RIFF header and little-endian IEEE-754 floating-point samples. +// audio is planar [L0..LN, R0..RN], pre-normalized by caller. +// Does NOT normalize - caller is responsible (audio_write does it). +// NaN, -Inf, and +Inf are automatically coerced to zero. +// 16-bit and 24-bit signed-integer output is automatically clamped to -1/+1 range. +// 32-bit floating-point output may exceed -1/+1 range. +// Always returns a complete WAV image; an unsupported wav_format asserts and terminates. 
+inline std::string audio_encode_wav(const float * audio, int T_audio, int sr, WavFormat wav_format) { + switch (wav_format) { + case WAV_FORMAT_S16: + return audio_encode_wav_pcm_s16(audio, T_audio, sr); + case WAV_FORMAT_S24: + return audio_encode_wav_pcm_s24(audio, T_audio, sr); + case WAV_FORMAT_IEEE_F32: + return audio_encode_wav_ieee_f32(audio, T_audio, sr); + } + + assert(false && "unsupported WavFormat"); + std::terminate(); +} + // Write planar stereo audio to WAV file. Thin wrapper around audio_encode_wav. -static bool audio_write_wav(const char * path, const float * audio, int T_audio, int sr) { - std::string wav = audio_encode_wav(audio, T_audio, sr); +static bool audio_write_wav(const char * path, const float * audio, int T_audio, int sr, WavFormat wav_format) { + std::string wav = audio_encode_wav(audio, T_audio, sr, wav_format); if (wav.empty()) { return false; } @@ -651,11 +976,20 @@ static bool audio_write_mp3(const char * path, const float * audio, int T_audio, // .mp3 -> MP3 encoding at the given kbps (default 128). -// .wav (or anything else) -> WAV 16-bit PCM. -// Normalizes in place before writing (single normalization point). +// .wav (or anything else) -> WAV in the sample format named by audio_file_format (16-bit PCM if it names MP3). +// Normalizes in place before writing, except for 32-bit float WAV output. -static bool audio_write(const char * path, float * audio, int T_audio, int sr, int kbps, int peak_clip = 10) { - audio_normalize(audio, T_audio * 2, peak_clip); +static bool audio_write(const char * path, float * audio, int T_audio, int sr, int kbps, AudioFileFormat audio_file_format, int peak_clip = 10) { + const bool write_mp3 = audio_io_ends_with(path, ".mp3"); - if (audio_io_ends_with(path, ".mp3")) { + if (write_mp3 || should_normalize_audio(audio_file_format)) { + audio_normalize(audio, T_audio * 2, peak_clip); + } + + if (write_mp3) { return audio_write_mp3(path, audio, T_audio, sr, (kbps > 0) ? 
kbps : 128); } - return audio_write_wav(path, audio, T_audio, sr); + + WavFormat wav_format = WAV_FORMAT_S16; + if (!convert_audio_file_format_to_wav_format(audio_file_format, wav_format)) { + // TODO Serveurperso - implement preferred conversion failure handling + } + return audio_write_wav(path, audio, T_audio, sr, wav_format); } diff --git a/src/wav.h b/src/wav.h index 4898559..ba4be84 100644 --- a/src/wav.h +++ b/src/wav.h @@ -10,6 +10,36 @@ #include #include +static uint16_t wav_read_u16le(const uint8_t * p) { + return (uint16_t) (p[0] | (p[1] << 8)); +} + +static uint32_t wav_read_u32le(const uint8_t * p) { + return (uint32_t) p[0] | + ((uint32_t) p[1] << 8) | + ((uint32_t) p[2] << 16) | + ((uint32_t) p[3] << 24); +} + +static int32_t wav_read_s24le(const uint8_t * p) { + uint32_t u = (uint32_t) p[0] | + ((uint32_t) p[1] << 8) | + ((uint32_t) p[2] << 16); + + if (u & 0x00800000u) { + u |= 0xff000000u; + } + + return (int32_t) u; +} + +static float wav_read_f32le(const uint8_t * p) { + uint32_t u = wav_read_u32le(p); + float f; + memcpy(&f, &u, 4); + return f; +} + // Read WAV from memory buffer. // Returns interleaved float [T, 2]. Sets *T_audio, *sr. Caller frees. 
static float * read_wav_buf(const uint8_t * data, size_t size, int * T_audio, int * sr) { @@ -21,71 +51,110 @@ static float * read_wav_buf(const uint8_t * data, size_t size, int * T_audio, in return NULL; } - int n_channels = 0, sample_rate = 0, bits_per_sample = 0; - short audio_format = 0; - float * audio = NULL; - int n_samples = 0; - size_t pos = 12; + int n_channels = 0; + int sample_rate = 0; + int bits_per_sample = 0; + uint16_t audio_format = 0; + uint16_t extensible_subformat = 0; + float * audio = NULL; + int n_samples = 0; + size_t pos = 12; while (pos + 8 <= size) { const uint8_t * chunk_id = data + pos; - int chunk_size = 0; - memcpy(&chunk_size, data + pos + 4, 4); + uint32_t chunk_size = wav_read_u32le(data + pos + 4); pos += 8; - if (memcmp(chunk_id, "fmt ", 4) == 0 && pos + 16 <= size) { - memcpy(&audio_format, data + pos, 2); - short nc; - memcpy(&nc, data + pos + 2, 2); - n_channels = nc; - memcpy(&sample_rate, data + pos + 4, 4); - // skip byte_rate(4) + block_align(2) - short bps; - memcpy(&bps, data + pos + 14, 2); - bits_per_sample = bps; + if (pos + (size_t) chunk_size > size) { + chunk_size = (uint32_t) (size - pos); + } + + if (memcmp(chunk_id, "fmt ", 4) == 0 && chunk_size >= 16) { + audio_format = wav_read_u16le(data + pos + 0); + n_channels = (int) wav_read_u16le(data + pos + 2); + sample_rate = (int) wav_read_u32le(data + pos + 4); + bits_per_sample = (int) wav_read_u16le(data + pos + 14); + + extensible_subformat = 0; + if (audio_format == 0xfffe && chunk_size >= 40) { + extensible_subformat = wav_read_u16le(data + pos + 24); + } + pos += (size_t) chunk_size; } else if (memcmp(chunk_id, "data", 4) == 0 && n_channels > 0) { size_t data_bytes = (size_t) chunk_size; - if (pos + data_bytes > size) { - data_bytes = size - pos; - } - if (audio_format == 1 && bits_per_sample == 16) { + if (bits_per_sample == 16 && (audio_format == 1 || (audio_format == 0xfffe && extensible_subformat == 1))) { /* PCM tag, or WAVE_FORMAT_EXTENSIBLE carrying the PCM subformat */ n_samples = (int) (data_bytes / ((size_t) n_channels * 2)); audio = (float *) malloc((size_t) n_samples * 2 * sizeof(float)); - const short * pcm = (const short 
*) (data + pos); + const uint8_t * p = data + pos; + for (int t = 0; t < n_samples; t++) { if (n_channels == 1) { - float s = (float) pcm[t] / 32768.0f; - audio[t * 2 + 0] = s; - audio[t * 2 + 1] = s; + int16_t s = (int16_t) wav_read_u16le(p + t * 2); + float f = (float) s / 32768.0f; + audio[t * 2 + 0] = f; + audio[t * 2 + 1] = f; } else { - audio[t * 2 + 0] = (float) pcm[t * n_channels + 0] / 32768.0f; - audio[t * 2 + 1] = (float) pcm[t * n_channels + 1] / 32768.0f; + const uint8_t * frame = p + (size_t) t * n_channels * 2; + int16_t l = (int16_t) wav_read_u16le(frame + 0); + int16_t r = (int16_t) wav_read_u16le(frame + 2); + audio[t * 2 + 0] = (float) l / 32768.0f; + audio[t * 2 + 1] = (float) r / 32768.0f; + } + } + } else if (bits_per_sample == 24 && (audio_format == 1 || (audio_format == 0xfffe && extensible_subformat == 1))) { /* 24-bit PCM occurs both with tag 1 and as WAVE_FORMAT_EXTENSIBLE */ + n_samples = (int) (data_bytes / ((size_t) n_channels * 3)); + audio = (float *) malloc((size_t) n_samples * 2 * sizeof(float)); + const uint8_t * p = data + pos; + + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + int32_t s = wav_read_s24le(p + t * 3); + float f = (float) s / 8388608.0f; + audio[t * 2 + 0] = f; + audio[t * 2 + 1] = f; + } else { + const uint8_t * frame = p + (size_t) t * n_channels * 3; + int32_t l = wav_read_s24le(frame + 0); + int32_t r = wav_read_s24le(frame + 3); + audio[t * 2 + 0] = (float) l / 8388608.0f; + audio[t * 2 + 1] = (float) r / 8388608.0f; } } - } else if (audio_format == 3 && bits_per_sample == 32) { + } else if (bits_per_sample == 32 && (audio_format == 3 || (audio_format == 0xfffe && extensible_subformat == 3))) { /* IEEE float tag, or WAVE_FORMAT_EXTENSIBLE carrying the float subformat */ - n_samples = (int) (data_bytes / ((size_t) n_channels * 4)); - audio = (float *) malloc((size_t) n_samples * 2 * sizeof(float)); - const float * fbuf = (const float *) (data + pos); + n_samples = (int) (data_bytes / ((size_t) n_channels * 4)); + audio = (float *) malloc((size_t) n_samples * 2 * sizeof(float)); + const uint8_t * p = data + pos; + for (int t = 0; t < n_samples; t++) { if (n_channels == 1) { - audio[t * 2 + 0] = fbuf[t]; - audio[t * 2 + 1] = fbuf[t]; + float s = wav_read_f32le(p + t * 4); + audio[t 
* 2 + 0] = s; + audio[t * 2 + 1] = s; } else { - audio[t * 2 + 0] = fbuf[t * n_channels + 0]; - audio[t * 2 + 1] = fbuf[t * n_channels + 1]; + const uint8_t * frame = p + (size_t) t * n_channels * 4; + float l = wav_read_f32le(frame + 0); + float r = wav_read_f32le(frame + 4); + audio[t * 2 + 0] = l; + audio[t * 2 + 1] = r; } } } else { - fprintf(stderr, "[WAV] Unsupported: format=%d bits=%d\n", audio_format, bits_per_sample); + fprintf(stderr, "[WAV] Unsupported: format=%u bits=%d subformat=%u\n", + (unsigned) audio_format, bits_per_sample, (unsigned) extensible_subformat); return NULL; } + break; } else { pos += (size_t) chunk_size; } + + if (chunk_size & 1) { + pos += 1; + } } if (!audio) { @@ -95,7 +164,7 @@ static float * read_wav_buf(const uint8_t * data, size_t size, int * T_audio, in *T_audio = n_samples; *sr = sample_rate; - fprintf(stderr, "[WAV] Read buffer: %d samples, %d Hz, %d ch, %d bit\n", n_samples, sample_rate, n_channels, - bits_per_sample); + fprintf(stderr, "[WAV] Read buffer: %d samples, %d Hz, %d ch, %d bit\n", + n_samples, sample_rate, n_channels, bits_per_sample); return audio; } diff --git a/tools/ace-server.cpp b/tools/ace-server.cpp index efd6cfd..9993f95 100644 --- a/tools/ace-server.cpp +++ b/tools/ace-server.cpp @@ -190,9 +190,9 @@ static AceSynthParams g_synth_params; static AceUnderstandParams g_und_params; // limits -static int g_max_batch = 1; -static int g_mp3_kbps = 128; -static bool g_keep_loaded = false; +static int g_max_batch = 1; +static int g_mp3_kbps = 128; +static bool g_keep_loaded = false; // job system: all compute endpoints create a job and return its ID // immediately. the worker thread processes jobs in FIFO order, stores @@ -836,9 +836,22 @@ static void synth_worker(std::shared_ptr job, if (!audio[b].samples) { continue; } - audio_normalize(audio[b].samples, audio[b].n_samples * 2, peak_clip); + + // TODO Serveurperso - wire AudioFileFormat into http/json + const AudioFileFormat audio_file_format = output_wav + ? 
AUDIO_FILE_FORMAT_WAV_S16 + : AUDIO_FILE_FORMAT_MP3; + + if (should_normalize_audio(audio_file_format)) { + audio_normalize(audio[b].samples, audio[b].n_samples * 2, peak_clip); + } + if (output_wav) { - encoded[b] = audio_encode_wav(audio[b].samples, audio[b].n_samples, 48000); + WavFormat wav_format = WAV_FORMAT_S16; + if (!convert_audio_file_format_to_wav_format(audio_file_format, wav_format)) { + // TODO Serveurperso - implement preferred conversion failure handling + } + encoded[b] = audio_encode_wav(audio[b].samples, audio[b].n_samples, 48000, wav_format); } else { encoded[b] = audio_encode_mp3(audio[b].samples, audio[b].n_samples, 48000, g_mp3_kbps, server_cancel_job, (void *) &job->cancel); @@ -1240,10 +1253,10 @@ int main(int argc, char ** argv) { ace_lm_default_params(&g_lm_params); ace_synth_default_params(&g_synth_params); - const char * host = "127.0.0.1"; - int port = 8080; - const char * models_dir = nullptr; - const char * loras_dir = nullptr; + const char * host = "127.0.0.1"; + int port = 8080; + const char * models_dir = nullptr; + const char * loras_dir = nullptr; if (argc < 2) { usage(argv[0]); diff --git a/tools/ace-synth.cpp b/tools/ace-synth.cpp index e83d848..ee9c8e4 100644 --- a/tools/ace-synth.cpp +++ b/tools/ace-synth.cpp @@ -30,8 +30,9 @@ static void usage(const char * prog) { " --lora-scale LoRA scaling factor (default: 1.0)\n\n" "Output:\n" " Default: MP3 at 128 kbps. 
input.json -> input0.mp3, input1.mp3, ...\n" + " --output Output audio file format (default: mp3)\n" " --mp3-bitrate MP3 bitrate (default: 128)\n" - " --wav Output WAV instead of MP3\n\n" + "\n" "Memory control:\n" " --vae-chunk Latent frames per tile (default: 256)\n" " --vae-overlap Overlap frames per side (default: 64)\n\n" @@ -50,21 +51,22 @@ int main(int argc, char ** argv) { } std::vector request_paths; - const char * text_enc_gguf = NULL; - const char * dit_gguf = NULL; - const char * vae_gguf = NULL; - const char * src_audio_path = NULL; - const char * ref_audio_path = NULL; - const char * dump_dir = NULL; - const char * lora_path = NULL; - float lora_scale = 1.0f; - bool use_fa = true; - bool use_batch_cfg = true; - bool clamp_fp16 = false; - int vae_chunk = 256; - int vae_overlap = 64; - bool output_wav = false; // default MP3, --wav forces WAV - int mp3_kbps = 128; + const char * text_enc_gguf = NULL; + const char * dit_gguf = NULL; + const char * vae_gguf = NULL; + const char * src_audio_path = NULL; + const char * ref_audio_path = NULL; + const char * dump_dir = NULL; + const char * lora_path = NULL; + float lora_scale = 1.0f; + bool use_fa = true; + bool use_batch_cfg = true; + bool clamp_fp16 = false; + int vae_chunk = 256; + int vae_overlap = 64; + const char * audio_file_format_str = nullptr; + AudioFileFormat audio_file_format = AUDIO_FILE_FORMAT_MP3; + int mp3_kbps = 128; for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--request")) { @@ -98,8 +100,8 @@ int main(int argc, char ** argv) { vae_chunk = atoi(argv[++i]); } else if (!strcmp(argv[i], "--vae-overlap") && i + 1 < argc) { vae_overlap = atoi(argv[++i]); - } else if (!strcmp(argv[i], "--wav")) { - output_wav = true; + } else if (!strcmp(argv[i], "--output") && i + 1 < argc) { + audio_file_format_str = argv[++i]; } else if (!strcmp(argv[i], "--mp3-bitrate") && i + 1 < argc) { mp3_kbps = atoi(argv[++i]); } else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) { @@ -127,6 +129,11 
@@ int main(int argc, char ** argv) { usage(argv[0]); return 1; } + if (audio_file_format_str != nullptr && !parse_audio_file_format(audio_file_format_str, audio_file_format)) { + fprintf(stderr, "[CLI] ERROR: --output requires a supported value\n"); + usage(argv[0]); + return 1; + } // Load models AceSynthParams params; @@ -284,10 +291,11 @@ int main(int argc, char ** argv) { if (!all_audio[b].samples) { continue; } - const char * ext = output_wav ? ".wav" : ".mp3"; + const AudioFileKind audio_file_kind = convert_audio_file_format_to_kind(audio_file_format); + const char * ext = audio_file_kind == AUDIO_FILE_KIND_WAV ? ".wav" : ".mp3"; char out_path[1024]; snprintf(out_path, sizeof(out_path), "%s%d%s", all_basenames[b].c_str(), all_synth_indices[b], ext); - if (!audio_write(out_path, all_audio[b].samples, all_audio[b].n_samples, 48000, mp3_kbps)) { + if (!audio_write(out_path, all_audio[b].samples, all_audio[b].n_samples, 48000, mp3_kbps, audio_file_format)) { fprintf(stderr, "[Ace-Synth Batch%d] FATAL: failed to write %s\n", b, out_path); } ace_audio_free(&all_audio[b]); diff --git a/tools/mp3-codec.cpp b/tools/mp3-codec.cpp index c673a2f..0ef024d 100644 --- a/tools/mp3-codec.cpp +++ b/tools/mp3-codec.cpp @@ -41,9 +41,10 @@ int main(int argc, char ** argv) { fprintf(stderr, "Usage: %s -i -o [options]\n" "\n" - " -i Input file (WAV or MP3)\n" - " -o Output file (WAV or MP3)\n" - " -b Bitrate for MP3 encoding (default: 128)\n" + " -i Input file (WAV or MP3)\n" + " -o Output file (WAV or MP3)\n" + " -b Bitrate for MP3 encoding (default: 128)\n" + " --wav-format WAV audio format (default: wav16)\n" "\n" "Mode is auto-detected from output extension.\n" "\n" @@ -58,6 +59,8 @@ int main(int argc, char ** argv) { const char * input = NULL; const char * output = NULL; int bitrate = 128; + const char * wav_format_str = nullptr; + WavFormat wav_format = WAV_FORMAT_S16; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) { @@ -66,6 +69,8 @@ int 
main(int argc, char ** argv) { output = argv[++i]; } else if (strcmp(argv[i], "-b") == 0 && i + 1 < argc) { bitrate = atoi(argv[++i]); + } else if (strcmp(argv[i], "--wav-format") == 0 && i + 1 < argc) { + wav_format_str = argv[++i]; } else { fprintf(stderr, "[MP3-Codec] Unknown option: %s\n", argv[i]); return 1; @@ -76,6 +81,14 @@ int main(int argc, char ** argv) { fprintf(stderr, "[MP3-Codec] Both -i and -o are required\n"); return 1; } + if (!ends_with(output, ".wav") && wav_format_str != nullptr) { + fprintf(stderr, "[MP3-Codec] ERROR: --wav-format requires .wav output file\n"); + return 1; + } + if (!parse_wav_format(wav_format_str, wav_format)) { + fprintf(stderr, "[MP3-Codec] ERROR: --wav-format requires a supported value\n"); + return 1; + } // read input (WAV or MP3, auto-detected) int T = 0, sr = 0; @@ -89,7 +102,7 @@ int main(int argc, char ** argv) { if (ends_with(output, ".mp3")) { ok = audio_write_mp3(output, audio, T, sr, bitrate); } else if (ends_with(output, ".wav")) { - ok = audio_write_wav(output, audio, T, sr); + ok = audio_write_wav(output, audio, T, sr, wav_format); } else { fprintf(stderr, "[MP3-Codec] Cannot determine format from output extension\n"); fprintf(stderr, " use .mp3 for encoding, .wav for decoding\n"); diff --git a/tools/neural-codec.cpp b/tools/neural-codec.cpp index 698b3da..c6e9387 100644 --- a/tools/neural-codec.cpp +++ b/tools/neural-codec.cpp @@ -316,6 +316,7 @@ static void print_usage(const char * prog) { " -o Output file (auto-named if omitted)\n" " --q8 Quantize latent to int8 (~13 kbit/s)\n" " --q4 Quantize latent to int4 (~6.8 kbit/s)\n\n" + " --wav-format WAV audio format (default: wav16)\n\n" "Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)\n" " song.latent -> song.wav\n\n" "Memory control:\n" @@ -338,13 +339,15 @@ static std::string auto_output(const char * input, const char * ext) { } int main(int argc, char ** argv) { - const char * vae_path = NULL; - const char * input_path = 
NULL; - const char * output_path = NULL; - int chunk_size = 256; - int overlap = 64; - int mode = -1; // 0 = encode, 1 = decode - int quant = 0; // 0 = f32, 8 = q8, 4 = q4 + const char * vae_path = NULL; + const char * input_path = NULL; + const char * output_path = NULL; + const char * wav_format_str = nullptr; + WavFormat wav_format = WAV_FORMAT_S16; + int chunk_size = 256; + int overlap = 64; + int mode = -1; // 0 = encode, 1 = decode + int quant = 0; // 0 = f32, 8 = q8, 4 = q4 for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "--vae") == 0 && i + 1 < argc) { @@ -357,6 +360,8 @@ int main(int argc, char ** argv) { output_path = argv[++i]; } else if (strcmp(argv[i], "--output") == 0 && i + 1 < argc) { output_path = argv[++i]; + } else if (strcmp(argv[i], "--wav-format") == 0 && i + 1 < argc) { + wav_format_str = argv[++i]; } else if (strcmp(argv[i], "--vae-chunk") == 0 && i + 1 < argc) { chunk_size = atoi(argv[++i]); } else if (strcmp(argv[i], "--vae-overlap") == 0 && i + 1 < argc) { @@ -383,6 +388,16 @@ int main(int argc, char ** argv) { print_usage(argv[0]); return 1; } + if (mode != 1 && wav_format_str != nullptr) { + fprintf(stderr, "[CLI] ERROR: --wav-format requires usage of --decode\n"); + print_usage(argv[0]); + return 1; + } + if (!parse_wav_format(wav_format_str, wav_format)) { + fprintf(stderr, "[CLI] ERROR: --wav-format requires a supported value\n"); + print_usage(argv[0]); + return 1; + } // Auto output names std::string out_str; @@ -473,7 +488,8 @@ int main(int argc, char ** argv) { return 1; } - if (audio_write(output_path, audio.data(), T_audio, 48000, 0)) { + const AudioFileFormat audio_file_format = convert_wav_format_to_audio_file_format(wav_format); + if (audio_write(output_path, audio.data(), T_audio, 48000, 0, audio_file_format)) { fprintf(stderr, "\n[VAE] Output: %s (%d samples, %.2fs @ 48kHz)\n", output_path, T_audio, (float) T_audio / 48000.0f); } else {