Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions src/voxkit/storage/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,7 @@ def validate_dataset(dataset_path: Path) -> Tuple[bool, str]:
- Each speaker directory contains audio files (.wav, .flac, .mp3, .ogg, .m4a)
- Each speaker directory contains label files (.lab, .txt)
- Number of audio files matches number of label files per speaker
- Each audio file has a matching label file with the same stem name

Expected structure:

Expand Down Expand Up @@ -562,15 +563,9 @@ def validate_dataset(dataset_path: Path) -> Tuple[bool, str]:
audio_files = [
f
for f in os.listdir(speaker_path)
if f.endswith(".wav")
or f.endswith(".flac")
or f.endswith(".mp3")
or f.endswith(".ogg")
or f.endswith(".m4a")
]
label_files = [
f for f in os.listdir(speaker_path) if f.endswith(".lab") or f.endswith(".txt")
if f.endswith((".wav", ".flac", ".mp3", ".ogg", ".m4a"))
]
label_files = [f for f in os.listdir(speaker_path) if f.endswith((".lab", ".txt"))]

if not audio_files:
return False, f"No audio files found in speaker directory '{speaker_path}'."
Expand All @@ -585,4 +580,14 @@ def validate_dataset(dataset_path: Path) -> Tuple[bool, str]:
f"directory '{speaker_path}'.",
)

audio_stems = {Path(f).stem for f in audio_files}
label_stems = {Path(f).stem for f in label_files}
unmatched = audio_stems.symmetric_difference(label_stems)
if unmatched:
return (
False,
f"Unpaired audio/label files in speaker directory '{speaker_path}': "
f"{', '.join(sorted(unmatched))}.",
)

return True, "Dataset is valid."
16 changes: 16 additions & 0 deletions tests/storage/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,3 +813,19 @@ def test_validate_dataset_mismatched_counts(self, monkeypatch):

assert is_valid is False
assert "Mismatch" in msg

def test_validate_dataset_unpaired_stems(self, monkeypatch):
    """Reject a dataset whose audio and label counts agree but whose stems differ."""
    from voxkit.storage.datasets import validate_dataset

    # A single speaker holding one audio file and one label file: the
    # per-speaker counts match, yet the two files do not share a stem
    # (recording_A.wav sits next to recording_B.lab).
    dataset_root = mock_get_storage_root() / "fake_datasets" / "unpaired_stems"
    speaker_dir = dataset_root / "speaker_1"
    speaker_dir.mkdir(parents=True, exist_ok=True)
    for filename in ("recording_A.wav", "recording_B.lab"):
        (speaker_dir / filename).touch()

    outcome = validate_dataset(dataset_root)
    is_valid, msg = outcome

    # Validation must fail, and the message must name the pairing problem.
    assert is_valid is False
    assert "Unpaired" in msg
Loading