Real-time speech-to-text with voice activity detection (VAD) | Apache 2.0 License
Endpoint: wss://voxtral.dudoxx.com/asr?language=auto
Audio Format: 16-bit signed integer PCM (Int16), 16 kHz sample rate, mono
Send: binary WebSocket messages, each an ArrayBuffer of Int16 samples (Int16Array.buffer)
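Receive: JSON messages with enhanced metrics, shaped like the example below ("type" is either "partial" or "final"; all values shown are illustrative):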
{
"type": "partial|final",
"text": "current chunk text",
"full_transcript": "complete transcript",
"audio": {
"energy_rms": 1234.5,
"snr_db": 25.3,
"zero_crossing_rate": 0.12,
"duration_sec": 1.5
},
"processing": {
"latency_ms": 180.5,
"rtf": 0.12,
"chunk_index": 5
},
"language": {
"requested": "auto",
"detected": "en",
"confidence": 0.85
},
"stats": {
"word_count": 12,
"wpm": 145.2,
"total_audio_sec": 5.2
}
}