
Commit 35c4dba

Merge pull request #139 from juanmc2005/develop
Version 0.7

2 parents: e81917c + 8982d4f

24 files changed: +1090 -347 lines
README.md: +67 -26

@@ -1,7 +1,7 @@
 <br/>
 
 <p align="center">
-  <img src="/logo.png" title="Logo" />
+  <img width="40%" src="/logo.jpg" title="Logo" />
 </p>
 
 <p align="center">
@@ -67,11 +67,10 @@ conda create -n diart python=3.8
 conda activate diart
 ```
 
-2) Install `PortAudio` and `soundfile`:
+2) Install audio libraries:
 
 ```shell
-conda install portaudio
-conda install pysoundfile -c conda-forge
+conda install portaudio pysoundfile ffmpeg -c conda-forge
 ```
 
 3) Install diart:
@@ -101,6 +100,8 @@ diart.stream /path/to/audio.wav
 A live conversation:
 
 ```shell
+# Use "microphone:ID" to select a non-default device
+# See `python -m sounddevice` for available devices
 diart.stream microphone
 ```
 
@@ -127,29 +128,49 @@ For inference and evaluation on a dataset we recommend to use `Benchmark` (see n
 
 ## Custom models
 
-Third-party models can be integrated seamlessly by subclassing `SegmentationModel` and `EmbeddingModel`:
+Third-party models can be integrated seamlessly by subclassing `SegmentationModel` and `EmbeddingModel` (which are PyTorch `Module` subclasses):
 
 ```python
-import torch
-from typing import Optional
 from diart import OnlineSpeakerDiarization, PipelineConfig
-from diart.models import EmbeddingModel
+from diart.models import EmbeddingModel, SegmentationModel
 from diart.sources import MicrophoneAudioSource
 from diart.inference import RealTimeInference
 
+
+def model_loader():
+    return load_pretrained_model("my_model.ckpt")
+
+
+class MySegmentationModel(SegmentationModel):
+    def __init__(self):
+        super().__init__(model_loader)
+
+    @property
+    def sample_rate(self) -> int:
+        return 16000
+
+    @property
+    def duration(self) -> float:
+        return 2  # seconds
+
+    def forward(self, waveform):
+        # self.model is created lazily
+        return self.model(waveform)
+
+
 class MyEmbeddingModel(EmbeddingModel):
     def __init__(self):
-        super().__init__()
-        self.my_pretrained_model = load("my_model.ckpt")
+        super().__init__(model_loader)
+
+    def forward(self, waveform, weights):
+        # self.model is created lazily
+        return self.model(waveform, weights)
+
 
-    def __call__(
-        self,
-        waveform: torch.Tensor,
-        weights: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
-        return self.my_pretrained_model(waveform, weights)
-
-config = PipelineConfig(embedding=MyEmbeddingModel())
+config = PipelineConfig(
+    segmentation=MySegmentationModel(),
+    embedding=MyEmbeddingModel()
+)
 pipeline = OnlineSpeakerDiarization(config)
 mic = MicrophoneAudioSource(config.sample_rate)
 inference = RealTimeInference(pipeline, mic)
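
Note that `load_pretrained_model` in the snippet above is a placeholder, not a diart function. A minimal sketch of such a loader, under the assumption that the checkpoint is a pyannote.audio model (an assumption, not something this commit prescribes):

```python
# Hypothetical loader for the custom-model example above.
# Assumes pyannote.audio is installed and "my_model.ckpt" is a valid checkpoint path.
from pyannote.audio import Model


def model_loader():
    return Model.from_pretrained("my_model.ckpt")
```

Any callable returning a PyTorch module should work the same way, since the wrapper classes only invoke it lazily.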
@@ -225,7 +246,7 @@ from diart.blocks import SpeakerSegmentation, OverlapAwareSpeakerEmbedding
 
 segmentation = SpeakerSegmentation.from_pyannote("pyannote/segmentation")
 embedding = OverlapAwareSpeakerEmbedding.from_pyannote("pyannote/embedding")
-sample_rate = segmentation.model.get_sample_rate()
+sample_rate = segmentation.model.sample_rate
 mic = MicrophoneAudioSource(sample_rate)
 
 stream = mic.stream.pipe(
@@ -252,7 +273,20 @@ torch.Size([1, 3, 512])
 
 Diart is also compatible with the WebSocket protocol to serve pipelines on the web.
 
-In the following example we build a minimal server that receives audio chunks and sends back predictions in RTTM format:
+### From the command line
+
+```commandline
+diart.serve --host 0.0.0.0 --port 7007
+diart.client microphone --host <server-address> --port 7007
+```
+
+**Note:** please make sure that the client uses the same `step` and `sample_rate` than the server with `--step` and `-sr`.
+
+See `-h` for more options.
+
+### From python
+
+For customized solutions, a server can also be created in python using the `WebSocketAudioSource`:
 
 ```python
 from diart import OnlineSpeakerDiarization
@@ -261,7 +295,7 @@ from diart.inference import RealTimeInference
 
 pipeline = OnlineSpeakerDiarization()
 source = WebSocketAudioSource(pipeline.config.sample_rate, "localhost", 7007)
-inference = RealTimeInference(pipeline, source, do_plot=True)
+inference = RealTimeInference(pipeline, source)
 inference.attach_hooks(lambda ann_wav: source.send(ann_wav[0].to_rttm()))
 prediction = inference()
 ```
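
Since `attach_hooks` takes a callable over the `(annotation, waveform)` pair, additional hooks can be attached besides the WebSocket reply. A hedged sketch, with an illustrative output path that is not part of this commit:

```python
# Hypothetical extra hook: also append each prediction to a local RTTM file.
# ann_wav is the (pyannote Annotation, waveform) pair passed to every hook.
def save_prediction(ann_wav):
    annotation, _ = ann_wav
    with open("predictions.rttm", "a") as rttm_file:
        annotation.write_rttm(rttm_file)


inference.attach_hooks(save_prediction)
```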
@@ -318,22 +352,29 @@ diart.benchmark /wav/dir --reference /rttm/dir --tau=0.555 --rho=0.422 --delta=1
 or using the inference API:
 
 ```python
-from diart.inference import Benchmark
+from diart.inference import Benchmark, Parallelize
 from diart import OnlineSpeakerDiarization, PipelineConfig
 from diart.models import SegmentationModel
 
+benchmark = Benchmark("/wav/dir", "/rttm/dir")
+
+name = "pyannote/segmentation@Interspeech2021"
+segmentation = SegmentationModel.from_pyannote(name)
 config = PipelineConfig(
     # Set the model used in the paper
-    segmentation=SegmentationModel.from_pyannote("pyannote/segmentation@Interspeech2021"),
+    segmentation=segmentation,
     step=0.5,
     latency=0.5,
     tau_active=0.555,
     rho_update=0.422,
     delta_new=1.517
 )
-pipeline = OnlineSpeakerDiarization(config)
-benchmark = Benchmark("/wav/dir", "/rttm/dir")
-benchmark(pipeline)
+benchmark(OnlineSpeakerDiarization, config)
+
+# Run the same benchmark in parallel
+p_benchmark = Parallelize(benchmark, num_workers=4)
+if __name__ == "__main__":  # Needed for multiprocessing
+    p_benchmark(OnlineSpeakerDiarization, config)
 ```
 
 This pre-calculates model outputs in batches, so it runs a lot faster.

demo.gif (5.48 MB)

logo.jpg (45.9 KB)

logo.png (-6.1 KB): binary file not shown

requirements.txt: +4 -1

@@ -7,10 +7,13 @@ einops>=0.3.0
 tqdm>=4.64.0
 pandas>=1.4.2
 torch>=1.12.1
+torchvision>=0.14.0
 torchaudio>=0.12.1,<1.0
 pyannote.audio>=2.1.1
 pyannote.core>=4.5
 pyannote.database>=4.1.1
 pyannote.metrics>=3.2
 optuna>=2.10
-websockets>=10.3
+websocket-server>=0.6.4
+websocket-client>=0.58.0
+rich>=12.5.1

setup.cfg: +11 -6

@@ -1,10 +1,10 @@
 [metadata]
 name=diart
-version=0.6.0
+version=0.7.0
 author=Juan Manuel Coria
 description=Speaker diarization in real time
 long_description=file: README.md
-long_description_content_type = text/markdown
+long_description_content_type=text/markdown
 keywords=speaker diarization, streaming, online, real time, rxpy
 url=https://github.com/juanmc2005/StreamingSpeakerDiarization
 license=MIT
@@ -29,19 +29,24 @@ install_requires=
     tqdm>=4.64.0
     pandas>=1.4.2
     torch>=1.12.1
+    torchvision>=0.14.0
     torchaudio>=0.12.1,<1.0
     pyannote.audio>=2.1.1
     pyannote.core>=4.5
     pyannote.database>=4.1.1
     pyannote.metrics>=3.2
     optuna>=2.10
-    websockets>=10.3
+    websocket-server>=0.6.4
+    websocket-client>=0.58.0
+    rich>=12.5.1
 
 [options.packages.find]
 where=src
 
 [options.entry_points]
 console_scripts=
-    diart.stream=diart.stream:run
-    diart.benchmark=diart.benchmark:run
-    diart.tune=diart.tune:run
+    diart.stream=diart.console.stream:run
+    diart.benchmark=diart.console.benchmark:run
+    diart.tune=diart.console.tune:run
+    diart.serve=diart.console.serve:run
+    diart.client=diart.console.client:run

src/diart/__init__.py: +6 -1

@@ -1 +1,6 @@
-from .blocks import OnlineSpeakerDiarization, PipelineConfig
+from .blocks import (
+    OnlineSpeakerDiarization,
+    BasePipeline,
+    PipelineConfig,
+    BasePipelineConfig,
+)
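
With this change the base classes are re-exported at the package root, so downstream code can refer to them without reaching into `diart.blocks`. A small illustrative sketch (the helper function is hypothetical, not part of this commit):

```python
# Illustrative only: using the newly exported base classes for type hints.
from diart import BasePipeline, BasePipelineConfig


def describe(pipeline: BasePipeline, config: BasePipelineConfig) -> None:
    print(type(pipeline).__name__, type(config).__name__)
```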

src/diart/argdoc.py: +2

@@ -10,5 +10,7 @@
 MAX_SPEAKERS = "Maximum number of speakers"
 CPU = "Force models to run on CPU"
 BATCH_SIZE = "For segmentation and embedding pre-calculation. If BATCH_SIZE < 2, run fully online and estimate real-time latency"
+NUM_WORKERS = "Number of parallel workers"
 OUTPUT = "Directory to store the system's output in RTTM format"
 HF_TOKEN = "Huggingface authentication token for hosted models ('true' | 'false' | <token>). If 'true', it will use the token from huggingface-cli login"
+SAMPLE_RATE = "Sample rate of the audio stream"
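
These constants are shared help strings for the console scripts. A hedged sketch of how the two new entries would typically be wired into an argparse parser (flag spellings and defaults here are assumptions; see `diart.console.*` for the actual parsers):

```python
# Illustrative only: attaching the new argdoc strings to an argument parser.
# Flag names and default values are assumptions, not taken from this commit.
import argparse

from diart import argdoc

parser = argparse.ArgumentParser()
parser.add_argument("--num_workers", type=int, default=0, help=argdoc.NUM_WORKERS)
parser.add_argument("-sr", "--sample_rate", type=int, default=16000, help=argdoc.SAMPLE_RATE)
args = parser.parse_args()
```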

src/diart/blocks/__init__.py: +2 -1

@@ -13,5 +13,6 @@
     OverlapAwareSpeakerEmbedding,
 )
 from .segmentation import SpeakerSegmentation
-from .diarization import OnlineSpeakerDiarization, PipelineConfig
+from .diarization import OnlineSpeakerDiarization, BasePipeline
+from .config import BasePipelineConfig, PipelineConfig
 from .utils import Binarize, Resample, AdjustVolume
