fciannella commited on
Commit
7f30e56
·
1 Parent(s): 00467eb

Switch from NVCF gRPC to Triton Inference Server

Browse files

- Replace NVCF gRPC client with Triton client (tritonclient[grpc])
- Update environment variables: NGC_API_KEY, FUNCTION_ID, VERSION_ID
- Add new triton_client.py for async streaming ASR
- Remove old proto files and grpc_client.py
- Simplify Dockerfile (no proto generation needed)
- Remove attention context UI (not supported by Triton model)
- Add proto directory with Riva-compatible definitions
- Add test_triton_asr.py for testing
- Update README with new configuration

Dockerfile CHANGED
@@ -1,8 +1,13 @@
1
  # =============================================================================
2
- # Multi-stage Dockerfile for Streaming ASR Client
3
  #
4
  # Stage 1: Build React frontend
5
  # Stage 2: Python runtime with static files
 
 
 
 
 
6
  # =============================================================================
7
 
8
  # -----------------------------------------------------------------------------
@@ -42,22 +47,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
42
  COPY bridge/requirements.txt ./requirements.txt
43
  RUN pip install --no-cache-dir -r requirements.txt
44
 
45
- # Copy Python application first
46
  COPY bridge/ ./bridge/
47
 
48
- # Generate proto files AFTER copying (to ensure we use the latest proto definition)
49
- # Remove any old generated files first
50
- RUN rm -f ./bridge/proto/streaming_asr_pb2.py ./bridge/proto/streaming_asr_pb2_grpc.py && \
51
- python -m grpc_tools.protoc \
52
- -I./bridge/proto \
53
- --python_out=./bridge/proto \
54
- --grpc_python_out=./bridge/proto \
55
- ./bridge/proto/streaming_asr.proto
56
-
57
- # Fix proto imports (grpc generates with wrong import path)
58
- RUN sed -i 's/import streaming_asr_pb2/from . import streaming_asr_pb2/' \
59
- ./bridge/proto/streaming_asr_pb2_grpc.py
60
-
61
  # Copy built frontend from stage 1
62
  COPY --from=frontend-builder /app/web/dist ./static/
63
 
@@ -75,4 +67,3 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
75
 
76
  # Run the application
77
  CMD ["python", "-m", "bridge.main"]
78
-
 
1
  # =============================================================================
2
+ # Multi-stage Dockerfile for Streaming ASR Client with Triton
3
  #
4
  # Stage 1: Build React frontend
5
  # Stage 2: Python runtime with static files
6
+ #
7
+ # Required environment variables:
8
+ # - NGC_API_KEY: NVIDIA NGC API key for authentication
9
+ # - FUNCTION_ID: NVCF function ID
10
+ # - VERSION_ID: (optional) NVCF function version ID
11
  # =============================================================================
12
 
13
  # -----------------------------------------------------------------------------
 
47
  COPY bridge/requirements.txt ./requirements.txt
48
  RUN pip install --no-cache-dir -r requirements.txt
49
 
50
+ # Copy Python application
51
  COPY bridge/ ./bridge/
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  # Copy built frontend from stage 1
54
  COPY --from=frontend-builder /app/web/dist ./static/
55
 
 
67
 
68
  # Run the application
69
  CMD ["python", "-m", "bridge.main"]
 
README.md CHANGED
@@ -1,11 +1,73 @@
1
  ---
2
- title: Nemotron Speech En
3
- emoji: 🏢
4
- colorFrom: red
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
- short_description: Preview Nemotron speech english model
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Nemotron Speech Streaming
3
+ emoji: 🎤
4
+ colorFrom: green
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
8
+ short_description: Real-time speech recognition with NVIDIA Triton
9
  ---
10
 
11
+ # Nemotron Speech Streaming
12
+
13
+ Real-time speech recognition powered by NVIDIA Triton Inference Server.
14
+
15
+ ## Features
16
+
17
+ - **Real-time streaming ASR**: Bidirectional streaming for live transcription
18
+ - **File upload support**: Transcribe WAV, MP3, OGG, WebM files
19
+ - **Beautiful UI**: Modern React interface with NVIDIA branding
20
+ - **WebSocket bridge**: FastAPI server bridging browser to Triton
21
+
22
+ ## Environment Variables
23
+
24
+ | Variable | Required | Description |
25
+ |----------|----------|-------------|
26
+ | `NGC_API_KEY` | Yes | NVIDIA NGC API key for authentication |
27
+ | `FUNCTION_ID` | Yes | NVCF function ID for the ASR model |
28
+ | `VERSION_ID` | No | NVCF function version ID |
29
+ | `TRITON_URL` | No | Triton server URL (default: `grpc.nvcf.nvidia.com:443`) |
30
+ | `MODEL_NAME` | No | Model name in Triton (default: `nemotron_asr`) |
31
+ | `PORT` | No | Server port (default: `8080`) |
32
+
33
+ ## Local Development
34
+
35
+ ```bash
36
+ # Install Python dependencies
37
+ cd bridge
38
+ pip install -r requirements.txt
39
+
40
+ # Build React frontend
41
+ cd web
42
+ npm install
43
+ npm run build
44
+
45
+ # Run the server
46
+ NGC_API_KEY=your_key FUNCTION_ID=your_function_id python -m bridge.main
47
+ ```
48
+
49
+ ## Docker
50
+
51
+ ```bash
52
+ # Build
53
+ docker build -t nemotron-speech .
54
+
55
+ # Run
56
+ docker run -p 8080:8080 \
57
+ -e NGC_API_KEY=your_key \
58
+ -e FUNCTION_ID=your_function_id \
59
+ nemotron-speech
60
+ ```
61
+
62
+ ## Architecture
63
+
64
+ ```
65
+ ┌─────────────┐ WebSocket ┌─────────────┐ gRPC ┌─────────────┐
66
+ │ Browser │ ◄──────────────► │ FastAPI │ ◄───────────► │ Triton │
67
+ │ (React UI) │ │ Bridge │ │ Server │
68
+ └─────────────┘ └─────────────┘ └─────────────┘
69
+ ```
70
+
71
+ ## License
72
+
73
+ Apache 2.0 - See LICENSE file for details.
bridge/config.py CHANGED
@@ -12,7 +12,7 @@
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
 
15
- """Configuration settings for the WS-to-gRPC bridge."""
16
 
17
  import os
18
  from dataclasses import dataclass
@@ -20,12 +20,17 @@ from typing import Optional
20
 
21
 
22
  @dataclass
23
- class NVCFConfig:
24
- """NVCF connection configuration."""
25
- api_key: str
26
  function_id: str
27
- function_version_id: Optional[str] = None
28
- grpc_url: str = "grpc.nvcf.nvidia.com:443"
 
 
 
 
 
29
 
30
 
31
  @dataclass
@@ -39,26 +44,28 @@ class ServerConfig:
39
  @dataclass
40
  class Settings:
41
  """Application settings."""
42
- nvcf: NVCFConfig
43
  server: ServerConfig
44
 
45
 
46
  def load_settings() -> Settings:
47
  """Load settings from environment variables."""
48
- api_key = os.getenv("NVCF_API_KEY")
49
- function_id = os.getenv("NVCF_FUNCTION_ID")
50
 
51
- if not api_key:
52
- raise ValueError("NVCF_API_KEY environment variable is required")
53
  if not function_id:
54
- raise ValueError("NVCF_FUNCTION_ID environment variable is required")
55
 
56
  return Settings(
57
- nvcf=NVCFConfig(
58
- api_key=api_key,
59
  function_id=function_id,
60
- function_version_id=os.getenv("NVCF_FUNCTION_VERSION_ID"),
61
- grpc_url=os.getenv("NVCF_GRPC_URL", "grpc.nvcf.nvidia.com:443"),
 
 
62
  ),
63
  server=ServerConfig(
64
  host=os.getenv("HOST", "0.0.0.0"),
@@ -66,4 +73,3 @@ def load_settings() -> Settings:
66
  log_level=os.getenv("LOG_LEVEL", "INFO"),
67
  ),
68
  )
69
-
 
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
 
15
+ """Configuration settings for the WS-to-Triton bridge."""
16
 
17
  import os
18
  from dataclasses import dataclass
 
20
 
21
 
22
  @dataclass
23
+ class TritonConfig:
24
+ """Triton/NVCF connection configuration."""
25
+ ngc_api_key: str
26
  function_id: str
27
+ version_id: Optional[str] = None
28
+ # Triton server URL (for local) or NVCF gRPC endpoint (for cloud)
29
+ server_url: str = "grpc.nvcf.nvidia.com:443"
30
+ # Model name in Triton
31
+ model_name: str = "nemotron_asr"
32
+ # Whether to use SSL (required for NVCF)
33
+ use_ssl: bool = True
34
 
35
 
36
  @dataclass
 
44
  @dataclass
45
  class Settings:
46
  """Application settings."""
47
+ triton: TritonConfig
48
  server: ServerConfig
49
 
50
 
51
  def load_settings() -> Settings:
52
  """Load settings from environment variables."""
53
+ ngc_api_key = os.getenv("NGC_API_KEY")
54
+ function_id = os.getenv("FUNCTION_ID")
55
 
56
+ if not ngc_api_key:
57
+ raise ValueError("NGC_API_KEY environment variable is required")
58
  if not function_id:
59
+ raise ValueError("FUNCTION_ID environment variable is required")
60
 
61
  return Settings(
62
+ triton=TritonConfig(
63
+ ngc_api_key=ngc_api_key,
64
  function_id=function_id,
65
+ version_id=os.getenv("VERSION_ID"),
66
+ server_url=os.getenv("TRITON_URL", "grpc.nvcf.nvidia.com:443"),
67
+ model_name=os.getenv("MODEL_NAME", "nemotron_asr"),
68
+ use_ssl=os.getenv("USE_SSL", "true").lower() in ("true", "1", "yes"),
69
  ),
70
  server=ServerConfig(
71
  host=os.getenv("HOST", "0.0.0.0"),
 
73
  log_level=os.getenv("LOG_LEVEL", "INFO"),
74
  ),
75
  )
 
bridge/grpc_client.py DELETED
@@ -1,289 +0,0 @@
1
- # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- """Async gRPC client for connecting to NVCF streaming ASR service."""
16
-
17
- import asyncio
18
- from typing import AsyncIterator, Optional, Callable, Any
19
- from dataclasses import dataclass
20
-
21
- import grpc
22
- from grpc import aio
23
- from loguru import logger
24
-
25
- from .proto import streaming_asr_pb2
26
- from .proto import streaming_asr_pb2_grpc
27
- from .config import NVCFConfig
28
-
29
-
30
- @dataclass
31
- class TranscriptResult:
32
- """Transcription result from the ASR service."""
33
- text: str
34
- is_final: bool
35
- confidence: float = 0.0
36
- latency_ms: float = 0.0
37
- stability: float = 0.0
38
- session_id: str = ""
39
-
40
-
41
- class NVCFStreamingClient:
42
- """
43
- Async gRPC client for NVCF streaming ASR.
44
-
45
- Handles bidirectional streaming to NVCF with proper authentication.
46
- """
47
-
48
- def __init__(self, config: NVCFConfig):
49
- """
50
- Initialize the NVCF client.
51
-
52
- Args:
53
- config: NVCF configuration with API key and function ID
54
- """
55
- self.config = config
56
- self._channel: Optional[aio.Channel] = None
57
- self._stub: Optional[streaming_asr_pb2_grpc.StreamingASRStub] = None
58
-
59
- def _get_metadata(self) -> list:
60
- """Get gRPC metadata for NVCF authentication."""
61
- metadata = [
62
- ("authorization", f"Bearer {self.config.api_key}"),
63
- ("function-id", self.config.function_id),
64
- ]
65
- if self.config.function_version_id:
66
- metadata.append(("function-version-id", self.config.function_version_id))
67
- return metadata
68
-
69
- async def connect(self) -> None:
70
- """Establish connection to NVCF."""
71
- if self._channel is not None:
72
- return
73
-
74
- logger.info(f"Connecting to NVCF at {self.config.grpc_url}")
75
-
76
- # NVCF requires SSL/TLS
77
- credentials = grpc.ssl_channel_credentials()
78
-
79
- self._channel = aio.secure_channel(
80
- self.config.grpc_url,
81
- credentials,
82
- options=[
83
- ('grpc.max_send_message_length', 50 * 1024 * 1024),
84
- ('grpc.max_receive_message_length', 50 * 1024 * 1024),
85
- ('grpc.keepalive_time_ms', 10000),
86
- ('grpc.keepalive_timeout_ms', 5000),
87
- ('grpc.keepalive_permit_without_calls', True),
88
- ]
89
- )
90
-
91
- self._stub = streaming_asr_pb2_grpc.StreamingASRStub(self._channel)
92
- logger.info("Connected to NVCF")
93
-
94
- async def disconnect(self) -> None:
95
- """Close connection to NVCF."""
96
- if self._channel is not None:
97
- await self._channel.close()
98
- self._channel = None
99
- self._stub = None
100
- logger.info("Disconnected from NVCF")
101
-
102
- async def health_check(self) -> dict:
103
- """
104
- Check NVCF service health.
105
-
106
- Returns:
107
- Health status dictionary
108
- """
109
- if self._stub is None:
110
- await self.connect()
111
-
112
- try:
113
- response = await self._stub.HealthCheck(
114
- streaming_asr_pb2.HealthCheckRequest(),
115
- metadata=self._get_metadata(),
116
- timeout=10.0,
117
- )
118
-
119
- status_name = streaming_asr_pb2.HealthCheckResponse.ServingStatus.Name(
120
- response.status
121
- )
122
-
123
- return {
124
- "status": status_name,
125
- "model_loaded": response.model_loaded,
126
- "healthy": response.status == streaming_asr_pb2.HealthCheckResponse.SERVING,
127
- }
128
- except grpc.aio.AioRpcError as e:
129
- logger.error(f"Health check failed: {e.code()} - {e.details()}")
130
- return {
131
- "status": "ERROR",
132
- "error": str(e.details()),
133
- "healthy": False,
134
- }
135
-
136
- async def get_config(self) -> dict:
137
- """
138
- Get NVCF service configuration.
139
-
140
- Returns:
141
- Configuration dictionary
142
- """
143
- if self._stub is None:
144
- await self.connect()
145
-
146
- try:
147
- response = await self._stub.GetConfig(
148
- streaming_asr_pb2.GetConfigRequest(),
149
- metadata=self._get_metadata(),
150
- timeout=10.0,
151
- )
152
-
153
- return {
154
- "model_path": response.model_path,
155
- "device": response.device,
156
- "decoder_type": response.decoder_type,
157
- "sample_rate": response.sample_rate,
158
- "chunk_size_ms": response.chunk_size_ms,
159
- "buffer_size_ms": response.buffer_size_ms,
160
- }
161
- except grpc.aio.AioRpcError as e:
162
- logger.error(f"Get config failed: {e.code()} - {e.details()}")
163
- return {"error": str(e.details())}
164
-
165
- async def stream_audio(
166
- self,
167
- audio_iterator: AsyncIterator[bytes],
168
- sample_rate: int = 16000,
169
- encoding: str = "pcm_s16le",
170
- on_transcript: Optional[Callable[[TranscriptResult], Any]] = None,
171
- att_context_size: Optional[list] = None,
172
- ) -> AsyncIterator[TranscriptResult]:
173
- """
174
- Stream audio to NVCF and yield transcription results.
175
-
176
- Args:
177
- audio_iterator: Async iterator yielding audio chunks (bytes)
178
- sample_rate: Audio sample rate (default: 16000)
179
- encoding: Audio encoding (default: pcm_s16le)
180
- on_transcript: Optional callback for each transcript
181
- att_context_size: Optional attention context [left, right] (e.g., [70, 1])
182
-
183
- Yields:
184
- TranscriptResult objects
185
- """
186
- if self._stub is None:
187
- await self.connect()
188
-
189
- async def request_generator():
190
- """Generate gRPC request messages."""
191
- logger.info("request_generator started")
192
- try:
193
- # Send configuration first
194
- config = streaming_asr_pb2.StreamingRecognitionConfig(
195
- encoding=encoding,
196
- sample_rate_hz=sample_rate,
197
- language_code="en-US",
198
- interim_results=True,
199
- )
200
- logger.info("Config object created")
201
-
202
- # Add attention context size if specified
203
- if att_context_size is not None and len(att_context_size) == 2:
204
- try:
205
- config.att_context_size.extend(att_context_size)
206
- logger.info(f"Using attention context size: {att_context_size}")
207
- except Exception as e:
208
- logger.error(f"Failed to set att_context_size: {e}")
209
-
210
- logger.info("Yielding config to gRPC stream...")
211
- yield streaming_asr_pb2.StreamingRecognizeRequest(streaming_config=config)
212
- logger.info("Config sent, now streaming audio chunks...")
213
- except Exception as e:
214
- logger.error(f"Error in request_generator setup: {e}", exc_info=True)
215
- raise
216
-
217
- # Stream audio chunks
218
- chunk_count = 0
219
- logger.info("Starting to iterate over audio chunks...")
220
- async for audio_chunk in audio_iterator:
221
- if audio_chunk:
222
- yield streaming_asr_pb2.StreamingRecognizeRequest(
223
- audio_content=audio_chunk
224
- )
225
- chunk_count += 1
226
- if chunk_count == 1:
227
- logger.info("First audio chunk sent to gRPC")
228
- elif chunk_count % 50 == 0: # Log every 50 chunks
229
- logger.debug(f"Sent {chunk_count} audio chunks so far...")
230
-
231
- logger.info(f"Sent {chunk_count} total audio chunks to NVCF")
232
-
233
- # Send end of stream
234
- yield streaming_asr_pb2.StreamingRecognizeRequest(
235
- control=streaming_asr_pb2.StreamingControl(
236
- type=streaming_asr_pb2.StreamingControl.END_OF_STREAM
237
- )
238
- )
239
- logger.debug("Sent end of stream to NVCF")
240
-
241
- try:
242
- logger.info("Creating gRPC StreamingRecognize call...")
243
- response_stream = self._stub.StreamingRecognize(
244
- request_generator(),
245
- metadata=self._get_metadata(),
246
- )
247
- logger.info("gRPC call created, iterating over responses...")
248
-
249
- response_count = 0
250
- async for response in response_stream:
251
- response_count += 1
252
- if response_count == 1:
253
- logger.info("Received first response from NVCF")
254
- # Check for errors
255
- if response.HasField('error') and response.error.code != 0:
256
- logger.error(
257
- f"NVCF error: [{response.error.code}] {response.error.message}"
258
- )
259
- continue
260
-
261
- # Extract transcript
262
- if response.HasField('result'):
263
- result = TranscriptResult(
264
- text=response.result.transcript,
265
- is_final=response.result.is_final,
266
- confidence=response.result.confidence,
267
- latency_ms=response.result.latency_ms,
268
- stability=response.result.stability,
269
- session_id=response.session_id,
270
- )
271
-
272
- if on_transcript:
273
- on_transcript(result)
274
-
275
- yield result
276
-
277
- except grpc.aio.AioRpcError as e:
278
- logger.error(f"gRPC streaming error: {e.code()} - {e.details()}")
279
- raise
280
-
281
- async def __aenter__(self):
282
- """Async context manager entry."""
283
- await self.connect()
284
- return self
285
-
286
- async def __aexit__(self, exc_type, exc_val, exc_tb):
287
- """Async context manager exit."""
288
- await self.disconnect()
289
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bridge/main.py CHANGED
@@ -13,10 +13,10 @@
13
  # limitations under the License.
14
 
15
  """
16
- WebSocket-to-gRPC bridge for streaming ASR.
17
 
18
  This server accepts WebSocket connections from the browser,
19
- forwards audio to NVCF via gRPC, and returns transcriptions.
20
  It also serves the React frontend as static files.
21
  """
22
 
@@ -38,12 +38,12 @@ from fastapi.middleware.cors import CORSMiddleware
38
  from loguru import logger
39
 
40
  from .config import load_settings, Settings
41
- from .grpc_client import NVCFStreamingClient, TranscriptResult
42
 
43
 
44
  # Global settings and client
45
  settings: Optional[Settings] = None
46
- nvcf_client: Optional[NVCFStreamingClient] = None
47
 
48
 
49
  def setup_logging(log_level: str = "INFO"):
@@ -62,8 +62,8 @@ def setup_logging(log_level: str = "INFO"):
62
  # Create FastAPI app
63
  app = FastAPI(
64
  title="Streaming ASR Client",
65
- description="WebSocket-to-gRPC bridge for NVCF streaming ASR",
66
- version="1.0.0",
67
  )
68
 
69
  # Add CORS middleware
@@ -79,46 +79,46 @@ app.add_middleware(
79
  @app.on_event("startup")
80
  async def startup_event():
81
  """Initialize on startup."""
82
- global settings, nvcf_client
83
 
84
  # Load settings
85
  try:
86
  settings = load_settings()
87
  except ValueError as e:
88
  logger.error(f"Configuration error: {e}")
89
- logger.error("Please set NVCF_API_KEY and NVCF_FUNCTION_ID environment variables")
90
  # Don't exit - allow the app to start for health checks
91
  return
92
 
93
  setup_logging(settings.server.log_level)
94
 
95
  logger.info("=" * 60)
96
- logger.info("Streaming ASR Client - WebSocket-to-gRPC Bridge")
97
  logger.info("=" * 60)
98
- logger.info(f"NVCF URL: {settings.nvcf.grpc_url}")
99
- logger.info(f"Function ID: {settings.nvcf.function_id}")
 
100
  logger.info(f"Server: {settings.server.host}:{settings.server.port}")
101
 
102
- # Initialize NVCF client
103
- nvcf_client = NVCFStreamingClient(settings.nvcf)
104
 
105
- # Test connection
106
  try:
107
- await nvcf_client.connect()
108
- health = await nvcf_client.health_check()
109
- logger.info(f"NVCF health check: {health}")
110
  except Exception as e:
111
- logger.warning(f"Initial NVCF connection failed: {e}")
112
  logger.warning("Will retry on first request")
113
 
114
 
115
  @app.on_event("shutdown")
116
  async def shutdown_event():
117
  """Cleanup on shutdown."""
118
- global nvcf_client
119
- if nvcf_client:
120
- await nvcf_client.disconnect()
121
- logger.info("Disconnected from NVCF")
122
 
123
 
124
  @app.get("/health")
@@ -126,30 +126,30 @@ async def health_check():
126
  """Health check endpoint."""
127
  result = {
128
  "status": "healthy",
129
- "nvcf_configured": settings is not None,
130
  }
131
 
132
- if nvcf_client:
133
  try:
134
- nvcf_health = await nvcf_client.health_check()
135
- result["nvcf"] = nvcf_health
136
  except Exception as e:
137
- result["nvcf"] = {"status": "error", "error": str(e)}
138
 
139
  return result
140
 
141
 
142
  @app.get("/api/config")
143
  async def get_config():
144
- """Get NVCF service configuration."""
145
- if not nvcf_client:
146
- raise HTTPException(status_code=503, detail="NVCF client not initialized")
147
 
148
- try:
149
- config = await nvcf_client.get_config()
150
- return config
151
- except Exception as e:
152
- raise HTTPException(status_code=503, detail=str(e))
153
 
154
 
155
  def convert_audio_to_pcm(file_content: bytes, filename: str) -> tuple[bytes, int]:
@@ -215,8 +215,8 @@ async def transcribe_file(file: UploadFile = File(...)):
215
  Returns:
216
  Transcription result
217
  """
218
- if not nvcf_client:
219
- raise HTTPException(status_code=503, detail="NVCF client not initialized")
220
 
221
  # Read file content
222
  content = await file.read()
@@ -230,8 +230,8 @@ async def transcribe_file(file: UploadFile = File(...)):
230
  except ValueError as e:
231
  raise HTTPException(status_code=400, detail=str(e))
232
 
233
- # Stream to NVCF
234
- chunk_duration_ms = 80
235
  chunk_size = int(sample_rate * chunk_duration_ms / 1000) * 2 # 2 bytes per sample
236
 
237
  async def audio_generator() -> AsyncIterator[bytes]:
@@ -250,16 +250,12 @@ async def transcribe_file(file: UploadFile = File(...)):
250
  final_text = ""
251
 
252
  try:
253
- async for result in nvcf_client.stream_audio(
254
- audio_generator(),
255
- sample_rate=sample_rate,
256
- ):
257
  if result.is_final:
258
  final_text = result.text
259
  transcripts.append({
260
  "text": result.text,
261
  "is_final": result.is_final,
262
- "latency_ms": result.latency_ms,
263
  })
264
  except Exception as e:
265
  logger.error(f"Transcription error: {e}")
@@ -288,24 +284,24 @@ async def websocket_transcribe(websocket: WebSocket):
288
  session_id = str(uuid.uuid4())[:8]
289
  logger.info(f"[{session_id}] WebSocket connected")
290
 
291
- if not nvcf_client:
292
  await websocket.send_json({
293
  "type": "error",
294
- "message": "NVCF client not initialized. Check server configuration.",
295
- "code": "NVCF_NOT_CONFIGURED",
296
  })
297
  await websocket.close()
298
  return
299
 
300
- # Ensure connected to NVCF
301
  try:
302
- await nvcf_client.connect()
303
  except Exception as e:
304
- logger.error(f"[{session_id}] Failed to connect to NVCF: {e}")
305
  await websocket.send_json({
306
  "type": "error",
307
- "message": f"Failed to connect to NVCF: {e}",
308
- "code": "NVCF_CONNECTION_ERROR",
309
  })
310
  await websocket.close()
311
  return
@@ -316,12 +312,10 @@ async def websocket_transcribe(websocket: WebSocket):
316
  "session_id": session_id,
317
  })
318
 
319
- # Audio queue for streaming to NVCF
320
  audio_queue: asyncio.Queue[Optional[bytes]] = asyncio.Queue()
321
  is_streaming = False
322
  stream_task: Optional[asyncio.Task] = None
323
- # Use a dict as mutable container to avoid nonlocal issues
324
- stream_config = {"att_context_size": None}
325
 
326
  async def audio_iterator() -> AsyncIterator[bytes]:
327
  """Async iterator that reads from the audio queue."""
@@ -332,26 +326,22 @@ async def websocket_transcribe(websocket: WebSocket):
332
  yield chunk
333
 
334
  async def process_stream():
335
- """Process the gRPC stream and send results back via WebSocket."""
336
  nonlocal is_streaming
337
  try:
338
- logger.info(f"[{session_id}] Starting gRPC stream with att_context_size={stream_config['att_context_size']}")
339
- async for result in nvcf_client.stream_audio(
340
- audio_iterator(),
341
- att_context_size=stream_config["att_context_size"],
342
- ):
343
  logger.debug(f"[{session_id}] Received transcript: {result.text[:50] if result.text else '(empty)'}... is_final={result.is_final}")
344
  await websocket.send_json({
345
  "type": "transcript",
346
  "text": result.text,
347
  "is_final": result.is_final,
348
  "confidence": result.confidence,
349
- "latency_ms": result.latency_ms,
350
  "session_id": result.session_id,
351
  })
352
- logger.info(f"[{session_id}] gRPC stream completed normally")
353
  except Exception as e:
354
- logger.error(f"[{session_id}] gRPC stream error: {e}", exc_info=True)
355
  try:
356
  await websocket.send_json({
357
  "type": "error",
@@ -387,13 +377,6 @@ async def websocket_transcribe(websocket: WebSocket):
387
  if msg_type == "start_stream":
388
  if not is_streaming:
389
  is_streaming = True
390
- # Extract attention context size if provided
391
- att_ctx = data.get("att_context_size")
392
- if att_ctx and isinstance(att_ctx, list) and len(att_ctx) == 2:
393
- stream_config["att_context_size"] = att_ctx
394
- logger.info(f"[{session_id}] Using att_context_size: {att_ctx}")
395
- else:
396
- stream_config["att_context_size"] = None
397
  # Clear the queue
398
  while not audio_queue.empty():
399
  try:
@@ -486,7 +469,7 @@ def main():
486
  settings = load_settings()
487
  except ValueError as e:
488
  print(f"Error: {e}")
489
- print("Please set NVCF_API_KEY and NVCF_FUNCTION_ID environment variables")
490
  sys.exit(1)
491
 
492
  setup_logging(settings.server.log_level)
@@ -501,4 +484,3 @@ def main():
501
 
502
  if __name__ == "__main__":
503
  main()
504
-
 
13
  # limitations under the License.
14
 
15
  """
16
+ WebSocket-to-Triton bridge for streaming ASR.
17
 
18
  This server accepts WebSocket connections from the browser,
19
+ forwards audio to Triton via gRPC, and returns transcriptions.
20
  It also serves the React frontend as static files.
21
  """
22
 
 
38
  from loguru import logger
39
 
40
  from .config import load_settings, Settings
41
+ from .triton_client import TritonASRClient, TranscriptResult
42
 
43
 
44
  # Global settings and client
45
  settings: Optional[Settings] = None
46
+ triton_client: Optional[TritonASRClient] = None
47
 
48
 
49
  def setup_logging(log_level: str = "INFO"):
 
62
  # Create FastAPI app
63
  app = FastAPI(
64
  title="Streaming ASR Client",
65
+ description="WebSocket-to-Triton bridge for streaming ASR",
66
+ version="2.0.0",
67
  )
68
 
69
  # Add CORS middleware
 
79
  @app.on_event("startup")
80
  async def startup_event():
81
  """Initialize on startup."""
82
+ global settings, triton_client
83
 
84
  # Load settings
85
  try:
86
  settings = load_settings()
87
  except ValueError as e:
88
  logger.error(f"Configuration error: {e}")
89
+ logger.error("Please set NGC_API_KEY and FUNCTION_ID environment variables")
90
  # Don't exit - allow the app to start for health checks
91
  return
92
 
93
  setup_logging(settings.server.log_level)
94
 
95
  logger.info("=" * 60)
96
+ logger.info("Streaming ASR Client - WebSocket-to-Triton Bridge")
97
  logger.info("=" * 60)
98
+ logger.info(f"Triton URL: {settings.triton.server_url}")
99
+ logger.info(f"Function ID: {settings.triton.function_id}")
100
+ logger.info(f"Model: {settings.triton.model_name}")
101
  logger.info(f"Server: {settings.server.host}:{settings.server.port}")
102
 
103
+ # Initialize Triton client
104
+ triton_client = TritonASRClient(settings.triton)
105
 
106
+ # Connect to Triton (for NVCF, full validation happens on first inference)
107
  try:
108
+ await triton_client.connect()
109
+ logger.info("Triton client initialized successfully")
 
110
  except Exception as e:
111
+ logger.warning(f"Initial Triton connection failed: {e}")
112
  logger.warning("Will retry on first request")
113
 
114
 
115
  @app.on_event("shutdown")
116
  async def shutdown_event():
117
  """Cleanup on shutdown."""
118
+ global triton_client
119
+ if triton_client:
120
+ await triton_client.disconnect()
121
+ logger.info("Disconnected from Triton")
122
 
123
 
124
  @app.get("/health")
 
126
  """Health check endpoint."""
127
  result = {
128
  "status": "healthy",
129
+ "triton_configured": settings is not None,
130
  }
131
 
132
+ if triton_client:
133
  try:
134
+ triton_health = await triton_client.health_check()
135
+ result["triton"] = triton_health
136
  except Exception as e:
137
+ result["triton"] = {"status": "error", "error": str(e)}
138
 
139
  return result
140
 
141
 
142
  @app.get("/api/config")
143
  async def get_config():
144
+ """Get service configuration."""
145
+ if not triton_client:
146
+ raise HTTPException(status_code=503, detail="Triton client not initialized")
147
 
148
+ return {
149
+ "model_name": settings.triton.model_name,
150
+ "server_url": settings.triton.server_url,
151
+ "sample_rate": 16000,
152
+ }
153
 
154
 
155
  def convert_audio_to_pcm(file_content: bytes, filename: str) -> tuple[bytes, int]:
 
215
  Returns:
216
  Transcription result
217
  """
218
+ if not triton_client:
219
+ raise HTTPException(status_code=503, detail="Triton client not initialized")
220
 
221
  # Read file content
222
  content = await file.read()
 
230
  except ValueError as e:
231
  raise HTTPException(status_code=400, detail=str(e))
232
 
233
+ # Stream to Triton
234
+ chunk_duration_ms = 100
235
  chunk_size = int(sample_rate * chunk_duration_ms / 1000) * 2 # 2 bytes per sample
236
 
237
  async def audio_generator() -> AsyncIterator[bytes]:
 
250
  final_text = ""
251
 
252
  try:
253
+ async for result in triton_client.stream_audio(audio_generator()):
 
 
 
254
  if result.is_final:
255
  final_text = result.text
256
  transcripts.append({
257
  "text": result.text,
258
  "is_final": result.is_final,
 
259
  })
260
  except Exception as e:
261
  logger.error(f"Transcription error: {e}")
 
284
  session_id = str(uuid.uuid4())[:8]
285
  logger.info(f"[{session_id}] WebSocket connected")
286
 
287
+ if not triton_client:
288
  await websocket.send_json({
289
  "type": "error",
290
+ "message": "Triton client not initialized. Check server configuration.",
291
+ "code": "TRITON_NOT_CONFIGURED",
292
  })
293
  await websocket.close()
294
  return
295
 
296
+ # Ensure connected to Triton
297
  try:
298
+ await triton_client.connect()
299
  except Exception as e:
300
+ logger.error(f"[{session_id}] Failed to connect to Triton: {e}")
301
  await websocket.send_json({
302
  "type": "error",
303
+ "message": f"Failed to connect to Triton: {e}",
304
+ "code": "TRITON_CONNECTION_ERROR",
305
  })
306
  await websocket.close()
307
  return
 
312
  "session_id": session_id,
313
  })
314
 
315
+ # Audio queue for streaming to Triton
316
  audio_queue: asyncio.Queue[Optional[bytes]] = asyncio.Queue()
317
  is_streaming = False
318
  stream_task: Optional[asyncio.Task] = None
 
 
319
 
320
  async def audio_iterator() -> AsyncIterator[bytes]:
321
  """Async iterator that reads from the audio queue."""
 
326
  yield chunk
327
 
328
  async def process_stream():
329
+ """Process the Triton stream and send results back via WebSocket."""
330
  nonlocal is_streaming
331
  try:
332
+ logger.info(f"[{session_id}] Starting Triton stream")
333
+ async for result in triton_client.stream_audio(audio_iterator()):
 
 
 
334
  logger.debug(f"[{session_id}] Received transcript: {result.text[:50] if result.text else '(empty)'}... is_final={result.is_final}")
335
  await websocket.send_json({
336
  "type": "transcript",
337
  "text": result.text,
338
  "is_final": result.is_final,
339
  "confidence": result.confidence,
 
340
  "session_id": result.session_id,
341
  })
342
+ logger.info(f"[{session_id}] Triton stream completed normally")
343
  except Exception as e:
344
+ logger.error(f"[{session_id}] Triton stream error: {e}", exc_info=True)
345
  try:
346
  await websocket.send_json({
347
  "type": "error",
 
377
  if msg_type == "start_stream":
378
  if not is_streaming:
379
  is_streaming = True
 
 
 
 
 
 
 
380
  # Clear the queue
381
  while not audio_queue.empty():
382
  try:
 
469
  settings = load_settings()
470
  except ValueError as e:
471
  print(f"Error: {e}")
472
+ print("Please set NGC_API_KEY and FUNCTION_ID environment variables")
473
  sys.exit(1)
474
 
475
  setup_logging(settings.server.log_level)
 
484
 
485
  if __name__ == "__main__":
486
  main()
 
bridge/proto/__init__.py DELETED
@@ -1,19 +0,0 @@
1
- # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- """Proto definitions for streaming ASR."""
16
-
17
- from .streaming_asr_pb2 import *
18
- from .streaming_asr_pb2_grpc import *
19
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bridge/proto/streaming_asr.proto DELETED
@@ -1,170 +0,0 @@
1
- // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- syntax = "proto3";
16
-
17
- package streaming_asr;
18
-
19
- // Streaming ASR Service
20
- // Supports bidirectional streaming for real-time speech recognition
21
- service StreamingASR {
22
- // Bidirectional streaming RPC for real-time transcription
23
- // Client streams audio chunks, server streams transcription results
24
- rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
25
-
26
- // Health check endpoint
27
- rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);
28
-
29
- // Get server configuration
30
- rpc GetConfig(GetConfigRequest) returns (GetConfigResponse);
31
- }
32
-
33
- // Request message for streaming recognition
34
- message StreamingRecognizeRequest {
35
- oneof streaming_request {
36
- // Configuration for the stream (send as first message)
37
- StreamingRecognitionConfig streaming_config = 1;
38
-
39
- // Audio content (send after config)
40
- bytes audio_content = 2;
41
-
42
- // Control message to end the stream
43
- StreamingControl control = 3;
44
- }
45
- }
46
-
47
- // Configuration for streaming recognition
48
- message StreamingRecognitionConfig {
49
- // Audio encoding (default: PCM_S16LE)
50
- string encoding = 1;
51
-
52
- // Sample rate in Hz (default: 16000)
53
- int32 sample_rate_hz = 2;
54
-
55
- // Language code (default: en-US)
56
- string language_code = 3;
57
-
58
- // Enable interim results (default: true)
59
- bool interim_results = 4;
60
-
61
- // === Dynamic streaming parameters ===
62
- // These can be changed per-session for testing different configurations.
63
- // Use -1 or 0 to use server defaults.
64
- //
65
- // Parameters are split into two categories:
66
- // - LIGHTWEIGHT (instant): att_context_size - changes take effect immediately
67
- // - HEAVY (buffer rebuild): chunk_size, shift_size, left_chunks - requires reconfiguration
68
-
69
- // [HEAVY] Chunk size in frames (-1 for model default)
70
- // Controls the size of audio chunks processed at once
71
- // Changing this triggers buffer rebuild
72
- int32 chunk_size = 10;
73
-
74
- // [HEAVY] Shift size in frames (-1 for model default)
75
- // Controls how much the window shifts between chunks
76
- // Changing this triggers buffer rebuild
77
- int32 shift_size = 11;
78
-
79
- // [HEAVY] Number of left context chunks to keep (default: 2)
80
- // More chunks = more context but higher latency
81
- // Changing this triggers buffer rebuild
82
- int32 left_chunks = 12;
83
-
84
- // [MEDIUM] Attention context size [left, right] (e.g., [70, 1])
85
- // Controls the attention window for the encoder
86
- // Requires cache reset but NOT buffer rebuild - faster than heavy params
87
- repeated int32 att_context_size = 13;
88
- }
89
-
90
- // Control messages for the stream
91
- message StreamingControl {
92
- enum ControlType {
93
- CONTROL_UNSPECIFIED = 0;
94
- END_OF_STREAM = 1; // Client finished sending audio
95
- RESET_SESSION = 2; // Reset transcription state
96
- }
97
- ControlType type = 1;
98
- }
99
-
100
- // Response message for streaming recognition
101
- message StreamingRecognizeResponse {
102
- // The transcription result
103
- StreamingRecognitionResult result = 1;
104
-
105
- // Error information (if any)
106
- StreamingError error = 2;
107
-
108
- // Session information
109
- string session_id = 3;
110
- }
111
-
112
- // A single recognition result
113
- message StreamingRecognitionResult {
114
- // The transcribed text
115
- string transcript = 1;
116
-
117
- // Whether this is a final result or interim
118
- bool is_final = 2;
119
-
120
- // Confidence score (0.0 to 1.0), optional
121
- float confidence = 3;
122
-
123
- // Processing latency in milliseconds
124
- float latency_ms = 4;
125
-
126
- // Stability score for interim results (0.0 to 1.0)
127
- float stability = 5;
128
- }
129
-
130
- // Error information
131
- message StreamingError {
132
- // Error code
133
- int32 code = 1;
134
-
135
- // Human-readable error message
136
- string message = 2;
137
- }
138
-
139
- // Health check request (empty)
140
- message HealthCheckRequest {}
141
-
142
- // Health check response
143
- message HealthCheckResponse {
144
- enum ServingStatus {
145
- UNKNOWN = 0;
146
- SERVING = 1;
147
- NOT_SERVING = 2;
148
- }
149
- ServingStatus status = 1;
150
- string model_loaded = 2;
151
- }
152
-
153
- // Get config request (empty)
154
- message GetConfigRequest {}
155
-
156
- // Get config response
157
- message GetConfigResponse {
158
- string model_path = 1;
159
- string device = 2;
160
- string decoder_type = 3;
161
- int32 sample_rate = 4;
162
- float chunk_size_ms = 5;
163
- float buffer_size_ms = 6;
164
-
165
- // Current streaming parameters
166
- int32 chunk_size = 10;
167
- int32 shift_size = 11;
168
- int32 left_chunks = 12;
169
- repeated int32 att_context_size = 13;
170
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bridge/proto/streaming_asr_pb2.py DELETED
@@ -1,50 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Generated by the protocol buffer compiler. DO NOT EDIT!
3
- # source: streaming_asr.proto
4
- """Generated protocol buffer code."""
5
- from google.protobuf import descriptor as _descriptor
6
- from google.protobuf import descriptor_pool as _descriptor_pool
7
- from google.protobuf import symbol_database as _symbol_database
8
- from google.protobuf.internal import builder as _builder
9
- # @@protoc_insertion_point(imports)
10
-
11
- _sym_db = _symbol_database.Default()
12
-
13
-
14
-
15
-
16
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13streaming_asr.proto\x12\rstreaming_asr\"\xc4\x01\n\x19StreamingRecognizeRequest\x12\x45\n\x10streaming_config\x18\x01 \x01(\x0b\x32).streaming_asr.StreamingRecognitionConfigH\x00\x12\x17\n\raudio_content\x18\x02 \x01(\x0cH\x00\x12\x32\n\x07\x63ontrol\x18\x03 \x01(\x0b\x32\x1f.streaming_asr.StreamingControlH\x00\x42\x13\n\x11streaming_request\"v\n\x1aStreamingRecognitionConfig\x12\x10\n\x08\x65ncoding\x18\x01 \x01(\t\x12\x16\n\x0esample_rate_hz\x18\x02 \x01(\x05\x12\x15\n\rlanguage_code\x18\x03 \x01(\t\x12\x17\n\x0finterim_results\x18\x04 \x01(\x08\"\x9b\x01\n\x10StreamingControl\x12\x39\n\x04type\x18\x01 \x01(\x0e\x32+.streaming_asr.StreamingControl.ControlType\"L\n\x0b\x43ontrolType\x12\x17\n\x13\x43ONTROL_UNSPECIFIED\x10\x00\x12\x11\n\rEND_OF_STREAM\x10\x01\x12\x11\n\rRESET_SESSION\x10\x02\"\x99\x01\n\x1aStreamingRecognizeResponse\x12\x39\n\x06result\x18\x01 \x01(\x0b\x32).streaming_asr.StreamingRecognitionResult\x12,\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x1d.streaming_asr.StreamingError\x12\x12\n\nsession_id\x18\x03 \x01(\t\"}\n\x1aStreamingRecognitionResult\x12\x12\n\ntranscript\x18\x01 \x01(\t\x12\x10\n\x08is_final\x18\x02 \x01(\x08\x12\x12\n\nconfidence\x18\x03 \x01(\x02\x12\x12\n\nlatency_ms\x18\x04 \x01(\x02\x12\x11\n\tstability\x18\x05 \x01(\x02\"/\n\x0eStreamingError\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0f\n\x07message\x18\x02 \x01(\t\"\x14\n\x12HealthCheckRequest\"\xa9\x01\n\x13HealthCheckResponse\x12@\n\x06status\x18\x01 \x01(\x0e\x32\x30.streaming_asr.HealthCheckResponse.ServingStatus\x12\x14\n\x0cmodel_loaded\x18\x02 \x01(\t\":\n\rServingStatus\x12\x0b\n\x07UNKNOWN\x10\x00\x12\x0b\n\x07SERVING\x10\x01\x12\x0f\n\x0bNOT_SERVING\x10\x02\"\x12\n\x10GetConfigRequest\"\x91\x01\n\x11GetConfigResponse\x12\x12\n\nmodel_path\x18\x01 \x01(\t\x12\x0e\n\x06\x64\x65vice\x18\x02 \x01(\t\x12\x14\n\x0c\x64\x65\x63oder_type\x18\x03 \x01(\t\x12\x13\n\x0bsample_rate\x18\x04 
\x01(\x05\x12\x15\n\rchunk_size_ms\x18\x05 \x01(\x02\x12\x16\n\x0e\x62uffer_size_ms\x18\x06 \x01(\x02\x32\xa3\x02\n\x0cStreamingASR\x12m\n\x12StreamingRecognize\x12(.streaming_asr.StreamingRecognizeRequest\x1a).streaming_asr.StreamingRecognizeResponse(\x01\x30\x01\x12T\n\x0bHealthCheck\x12!.streaming_asr.HealthCheckRequest\x1a\".streaming_asr.HealthCheckResponse\x12N\n\tGetConfig\x12\x1f.streaming_asr.GetConfigRequest\x1a .streaming_asr.GetConfigResponseb\x06proto3')
17
-
18
- _globals = globals()
19
- _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
20
- _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'streaming_asr_pb2', _globals)
21
- if not _descriptor._USE_C_DESCRIPTORS:
22
- DESCRIPTOR._loaded_options = None
23
- _globals['_STREAMINGRECOGNIZEREQUEST']._serialized_start=39
24
- _globals['_STREAMINGRECOGNIZEREQUEST']._serialized_end=235
25
- _globals['_STREAMINGRECOGNITIONCONFIG']._serialized_start=237
26
- _globals['_STREAMINGRECOGNITIONCONFIG']._serialized_end=355
27
- _globals['_STREAMINGCONTROL']._serialized_start=358
28
- _globals['_STREAMINGCONTROL']._serialized_end=513
29
- _globals['_STREAMINGCONTROL_CONTROLTYPE']._serialized_start=437
30
- _globals['_STREAMINGCONTROL_CONTROLTYPE']._serialized_end=513
31
- _globals['_STREAMINGRECOGNIZERESPONSE']._serialized_start=516
32
- _globals['_STREAMINGRECOGNIZERESPONSE']._serialized_end=669
33
- _globals['_STREAMINGRECOGNITIONRESULT']._serialized_start=671
34
- _globals['_STREAMINGRECOGNITIONRESULT']._serialized_end=796
35
- _globals['_STREAMINGERROR']._serialized_start=798
36
- _globals['_STREAMINGERROR']._serialized_end=845
37
- _globals['_HEALTHCHECKREQUEST']._serialized_start=847
38
- _globals['_HEALTHCHECKREQUEST']._serialized_end=867
39
- _globals['_HEALTHCHECKRESPONSE']._serialized_start=870
40
- _globals['_HEALTHCHECKRESPONSE']._serialized_end=1039
41
- _globals['_HEALTHCHECKRESPONSE_SERVINGSTATUS']._serialized_start=981
42
- _globals['_HEALTHCHECKRESPONSE_SERVINGSTATUS']._serialized_end=1039
43
- _globals['_GETCONFIGREQUEST']._serialized_start=1041
44
- _globals['_GETCONFIGREQUEST']._serialized_end=1059
45
- _globals['_GETCONFIGRESPONSE']._serialized_start=1062
46
- _globals['_GETCONFIGRESPONSE']._serialized_end=1207
47
- _globals['_STREAMINGASR']._serialized_start=1210
48
- _globals['_STREAMINGASR']._serialized_end=1501
49
- # @@protoc_insertion_point(module_scope)
50
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bridge/proto/streaming_asr_pb2_grpc.py DELETED
@@ -1,170 +0,0 @@
1
- # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2
- """Client and server classes corresponding to protobuf-defined services."""
3
- import grpc
4
-
5
- from . import streaming_asr_pb2 as streaming__asr__pb2
6
-
7
-
8
- class StreamingASRStub(object):
9
- """Streaming ASR Service
10
- Supports bidirectional streaming for real-time speech recognition
11
- """
12
-
13
- def __init__(self, channel):
14
- """Constructor.
15
-
16
- Args:
17
- channel: A grpc.Channel.
18
- """
19
- self.StreamingRecognize = channel.stream_stream(
20
- '/streaming_asr.StreamingASR/StreamingRecognize',
21
- request_serializer=streaming__asr__pb2.StreamingRecognizeRequest.SerializeToString,
22
- response_deserializer=streaming__asr__pb2.StreamingRecognizeResponse.FromString,
23
- )
24
- self.HealthCheck = channel.unary_unary(
25
- '/streaming_asr.StreamingASR/HealthCheck',
26
- request_serializer=streaming__asr__pb2.HealthCheckRequest.SerializeToString,
27
- response_deserializer=streaming__asr__pb2.HealthCheckResponse.FromString,
28
- )
29
- self.GetConfig = channel.unary_unary(
30
- '/streaming_asr.StreamingASR/GetConfig',
31
- request_serializer=streaming__asr__pb2.GetConfigRequest.SerializeToString,
32
- response_deserializer=streaming__asr__pb2.GetConfigResponse.FromString,
33
- )
34
-
35
-
36
- class StreamingASRServicer(object):
37
- """Streaming ASR Service
38
- Supports bidirectional streaming for real-time speech recognition
39
- """
40
-
41
- def StreamingRecognize(self, request_iterator, context):
42
- """Bidirectional streaming RPC for real-time transcription
43
- Client streams audio chunks, server streams transcription results
44
- """
45
- context.set_code(grpc.StatusCode.UNIMPLEMENTED)
46
- context.set_details('Method not implemented!')
47
- raise NotImplementedError('Method not implemented!')
48
-
49
- def HealthCheck(self, request, context):
50
- """Health check endpoint
51
- """
52
- context.set_code(grpc.StatusCode.UNIMPLEMENTED)
53
- context.set_details('Method not implemented!')
54
- raise NotImplementedError('Method not implemented!')
55
-
56
- def GetConfig(self, request, context):
57
- """Get server configuration
58
- """
59
- context.set_code(grpc.StatusCode.UNIMPLEMENTED)
60
- context.set_details('Method not implemented!')
61
- raise NotImplementedError('Method not implemented!')
62
-
63
-
64
- def add_StreamingASRServicer_to_server(servicer, server):
65
- rpc_method_handlers = {
66
- 'StreamingRecognize': grpc.stream_stream_rpc_method_handler(
67
- servicer.StreamingRecognize,
68
- request_deserializer=streaming__asr__pb2.StreamingRecognizeRequest.FromString,
69
- response_serializer=streaming__asr__pb2.StreamingRecognizeResponse.SerializeToString,
70
- ),
71
- 'HealthCheck': grpc.unary_unary_rpc_method_handler(
72
- servicer.HealthCheck,
73
- request_deserializer=streaming__asr__pb2.HealthCheckRequest.FromString,
74
- response_serializer=streaming__asr__pb2.HealthCheckResponse.SerializeToString,
75
- ),
76
- 'GetConfig': grpc.unary_unary_rpc_method_handler(
77
- servicer.GetConfig,
78
- request_deserializer=streaming__asr__pb2.GetConfigRequest.FromString,
79
- response_serializer=streaming__asr__pb2.GetConfigResponse.SerializeToString,
80
- ),
81
- }
82
- generic_handler = grpc.method_handlers_generic_handler(
83
- 'streaming_asr.StreamingASR', rpc_method_handlers)
84
- server.add_generic_rpc_handlers((generic_handler,))
85
-
86
-
87
- # This class is part of an EXPERIMENTAL API.
88
- class StreamingASR(object):
89
- """Streaming ASR Service
90
- Supports bidirectional streaming for real-time speech recognition
91
- """
92
-
93
- @staticmethod
94
- def StreamingRecognize(request_iterator,
95
- target,
96
- options=(),
97
- channel_credentials=None,
98
- call_credentials=None,
99
- insecure=False,
100
- compression=None,
101
- wait_for_ready=None,
102
- timeout=None,
103
- metadata=None):
104
- return grpc.experimental.stream_stream(
105
- request_iterator,
106
- target,
107
- '/streaming_asr.StreamingASR/StreamingRecognize',
108
- streaming__asr__pb2.StreamingRecognizeRequest.SerializeToString,
109
- streaming__asr__pb2.StreamingRecognizeResponse.FromString,
110
- options,
111
- channel_credentials,
112
- insecure,
113
- call_credentials,
114
- compression,
115
- wait_for_ready,
116
- timeout,
117
- metadata)
118
-
119
- @staticmethod
120
- def HealthCheck(request,
121
- target,
122
- options=(),
123
- channel_credentials=None,
124
- call_credentials=None,
125
- insecure=False,
126
- compression=None,
127
- wait_for_ready=None,
128
- timeout=None,
129
- metadata=None):
130
- return grpc.experimental.unary_unary(
131
- request,
132
- target,
133
- '/streaming_asr.StreamingASR/HealthCheck',
134
- streaming__asr__pb2.HealthCheckRequest.SerializeToString,
135
- streaming__asr__pb2.HealthCheckResponse.FromString,
136
- options,
137
- channel_credentials,
138
- insecure,
139
- call_credentials,
140
- compression,
141
- wait_for_ready,
142
- timeout,
143
- metadata)
144
-
145
- @staticmethod
146
- def GetConfig(request,
147
- target,
148
- options=(),
149
- channel_credentials=None,
150
- call_credentials=None,
151
- insecure=False,
152
- compression=None,
153
- wait_for_ready=None,
154
- timeout=None,
155
- metadata=None):
156
- return grpc.experimental.unary_unary(
157
- request,
158
- target,
159
- '/streaming_asr.StreamingASR/GetConfig',
160
- streaming__asr__pb2.GetConfigRequest.SerializeToString,
161
- streaming__asr__pb2.GetConfigResponse.FromString,
162
- options,
163
- channel_credentials,
164
- insecure,
165
- call_credentials,
166
- compression,
167
- wait_for_ready,
168
- timeout,
169
- metadata)
170
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bridge/requirements.txt CHANGED
@@ -3,10 +3,11 @@ fastapi>=0.104.0
3
  uvicorn[standard]>=0.24.0
4
  websockets>=12.0
5
 
6
- # gRPC for NVCF communication
 
 
 
7
  grpcio>=1.60.0
8
- grpcio-tools>=1.60.0
9
- protobuf>=4.25.0
10
 
11
  # Logging
12
  loguru>=0.7.0
@@ -23,4 +24,3 @@ pydantic>=2.5.0
23
 
24
  # File upload support
25
  python-multipart>=0.0.6
26
-
 
3
  uvicorn[standard]>=0.24.0
4
  websockets>=12.0
5
 
6
+ # Triton Inference Server client
7
+ tritonclient[grpc]>=2.40.0
8
+
9
+ # gRPC (needed for Triton client)
10
  grpcio>=1.60.0
 
 
11
 
12
  # Logging
13
  loguru>=0.7.0
 
24
 
25
  # File upload support
26
  python-multipart>=0.0.6
 
bridge/triton_client.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Async Triton client for streaming ASR with NVCF."""
16
+
17
+ import asyncio
18
+ import uuid
19
+ from dataclasses import dataclass
20
+ from functools import partial
21
+ from typing import AsyncIterator, Optional, Callable, Any
22
+ from concurrent.futures import ThreadPoolExecutor
23
+
24
+ import numpy as np
25
+ from loguru import logger
26
+
27
+ import tritonclient.grpc as grpcclient
28
+ from tritonclient.utils import InferenceServerException
29
+
30
+ from .config import TritonConfig
31
+
32
+
33
@dataclass
class TranscriptResult:
    """A single transcription result emitted by the streaming ASR service.

    Attributes:
        text: Transcribed text; interim (revisable) unless ``is_final``.
        is_final: True when this segment will not be revised further.
        confidence: Confidence score from the backend; 0.0 when not reported.
        session_id: Short identifier of the stream that produced this result.
    """
    # Transcribed text for this segment.
    text: str
    # Whether this is a finalized segment (vs. an interim hypothesis).
    is_final: bool
    # Confidence score; defaults to 0.0 if the backend does not provide one.
    confidence: float = 0.0
    # Stream/session identifier, filled in by the client.
    session_id: str = ""
40
+
41
+
42
+ def _stream_callback(result_queue: asyncio.Queue, loop: asyncio.AbstractEventLoop, result, error):
43
+ """Callback for streaming responses - puts results into async queue."""
44
+ if error:
45
+ asyncio.run_coroutine_threadsafe(
46
+ result_queue.put({"error": str(error)}),
47
+ loop
48
+ )
49
+ else:
50
+ try:
51
+ transcript = result.as_numpy("transcript")[0]
52
+ if isinstance(transcript, bytes):
53
+ transcript = transcript.decode('utf-8')
54
+
55
+ is_final = bool(result.as_numpy("is_final")[0])
56
+ confidence = float(result.as_numpy("confidence")[0])
57
+
58
+ asyncio.run_coroutine_threadsafe(
59
+ result_queue.put({
60
+ "transcript": transcript,
61
+ "is_final": is_final,
62
+ "confidence": confidence,
63
+ }),
64
+ loop
65
+ )
66
+ except Exception as e:
67
+ asyncio.run_coroutine_threadsafe(
68
+ result_queue.put({"error": str(e)}),
69
+ loop
70
+ )
71
+
72
+
73
class TritonASRClient:
    """
    Async wrapper around the synchronous Triton gRPC client for streaming ASR.

    Handles bidirectional streaming to a Triton Inference Server fronted by
    NVCF: authentication travels as gRPC metadata attached at ``start_stream``
    time, not on the channel. All blocking tritonclient calls are dispatched
    to a small thread pool so the asyncio event loop is never blocked.
    """

    def __init__(self, config: TritonConfig):
        """
        Initialize the Triton client.

        Args:
            config: Triton configuration (server URL, SSL flag, model name,
                NGC API key, function/version IDs).
        """
        self.config = config
        self._client: Optional[grpcclient.InferenceServerClient] = None
        # Executor for the synchronous tritonclient calls.
        self._executor = ThreadPoolExecutor(max_workers=4)
        self.sample_rate = 16000
        self.chunk_size = 1600  # 100ms at 16kHz

    def _get_headers(self) -> dict:
        """Build gRPC metadata headers for NVCF authentication.

        Note: gRPC metadata keys must be lowercase.
        """
        headers = {
            "authorization": f"Bearer {self.config.ngc_api_key}",
            "function-id": self.config.function_id,
        }
        if self.config.version_id:
            headers["function-version-id"] = self.config.version_id
        return headers

    async def connect(self) -> None:
        """Create the underlying Triton client (idempotent).

        Raises:
            Exception: Propagates any client-construction failure after
                logging it and resetting internal state.
        """
        if self._client is not None:
            return

        logger.info(f"Connecting to Triton at {self.config.server_url}")

        try:
            self._client = grpcclient.InferenceServerClient(
                url=self.config.server_url,
                ssl=self.config.use_ssl,
                # For NVCF, auth is passed via metadata in each request
            )
            # NVCF's standard health endpoints do not accept auth headers,
            # so no health check here; auth is validated on first inference.
            logger.info(f"Connected to Triton at {self.config.server_url}")
            logger.info("(Health check skipped for NVCF - auth validated on first request)")
        except Exception as e:
            logger.error(f"Failed to connect to Triton: {e}")
            self._client = None
            raise

    async def disconnect(self) -> None:
        """Close the Triton client connection (best-effort, idempotent)."""
        if self._client is not None:
            try:
                self._client.close()
            except Exception:
                # FIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; close stays best-effort.
                pass
            self._client = None
            logger.info("Disconnected from Triton")

    async def health_check(self) -> dict:
        """
        Check Triton service health.

        Note: For NVCF, standard health checks may fail due to authentication
        requirements, so this only reports client-side readiness.

        Returns:
            Health status dictionary.
        """
        if self._client is None:
            await self.connect()

        # For NVCF, we can't do standard health checks without auth headers;
        # just report that the client is configured.
        return {
            "status": "CONFIGURED",
            "server_url": self.config.server_url,
            "model_name": self.config.model_name,
            "client_ready": self._client is not None,
            "healthy": self._client is not None,
            "note": "Full health check available on first inference request",
        }

    async def stream_audio(
        self,
        audio_iterator: AsyncIterator[bytes],
        on_transcript: Optional[Callable[[TranscriptResult], Any]] = None,
    ) -> AsyncIterator[TranscriptResult]:
        """
        Stream audio to Triton and yield transcription results.

        Args:
            audio_iterator: Async iterator yielding audio chunks
                (bytes; assumed PCM 16-bit — confirmed by the int16 decode below).
            on_transcript: Optional callback invoked for each transcript.

        Yields:
            TranscriptResult objects.

        Raises:
            RuntimeError: If the Triton stream could not be started.
        """
        if self._client is None:
            await self.connect()

        session_id = str(uuid.uuid4())[:8]
        logger.info(f"[{session_id}] Starting Triton stream")

        # Results arrive on tritonclient's callback thread and are handed to
        # this coroutine through an asyncio queue.
        # FIX: use get_running_loop() — get_event_loop() is deprecated inside
        # coroutines and may return the wrong loop.
        loop = asyncio.get_running_loop()
        result_queue: asyncio.Queue = asyncio.Queue()
        callback = partial(_stream_callback, result_queue, loop)

        # start_stream() is synchronous; run it off-loop.
        def start_stream() -> bool:
            try:
                self._client.start_stream(
                    callback=callback,
                    headers=self._get_headers(),
                )
                return True
            except Exception as e:
                logger.error(f"[{session_id}] Failed to start stream: {e}")
                return False

        stream_started = await loop.run_in_executor(self._executor, start_stream)
        if not stream_started:
            raise RuntimeError("Failed to start Triton stream")

        logger.info(f"[{session_id}] Triton stream started")

        async def send_audio():
            """Forward chunks from the iterator into the stream, then finalize."""
            chunk_count = 0
            try:
                async for audio_bytes in audio_iterator:
                    if audio_bytes:
                        # Interpret raw bytes as int16 PCM samples.
                        audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
                        await loop.run_in_executor(
                            self._executor,
                            partial(self._send_chunk, session_id, audio_np, is_final=False)
                        )
                        chunk_count += 1

                        if chunk_count == 1:
                            logger.info(f"[{session_id}] First audio chunk sent")
                        elif chunk_count % 50 == 0:
                            logger.debug(f"[{session_id}] Sent {chunk_count} audio chunks")

                # Empty chunk with is_final=True tells the model to flush.
                await loop.run_in_executor(
                    self._executor,
                    partial(self._send_chunk, session_id, np.array([], dtype=np.int16), is_final=True)
                )
                logger.info(f"[{session_id}] Sent {chunk_count} total audio chunks, final=True")

                # Give the server a moment to emit trailing responses, then
                # signal completion to the consumer loop below.
                await asyncio.sleep(0.5)
                await result_queue.put(None)

            except Exception as e:
                logger.error(f"[{session_id}] Error sending audio: {e}")
                await result_queue.put({"error": str(e)})
                await result_queue.put(None)

        send_task = asyncio.create_task(send_audio())

        try:
            # Drain the result queue until the None sentinel.
            while True:
                result = await result_queue.get()

                if result is None:
                    break

                if "error" in result:
                    # Log and keep consuming; later responses may still arrive.
                    logger.error(f"[{session_id}] Stream error: {result['error']}")
                    continue

                transcript_result = TranscriptResult(
                    text=result["transcript"],
                    is_final=result["is_final"],
                    confidence=result["confidence"],
                    session_id=session_id,
                )

                if on_transcript:
                    on_transcript(transcript_result)

                yield transcript_result

        finally:
            # Always tear the stream down, even if the consumer bails early.
            def stop_stream():
                try:
                    self._client.stop_stream()
                except Exception:
                    # FIX: was a bare `except:`; teardown stays best-effort.
                    pass

            await loop.run_in_executor(self._executor, stop_stream)

            # Wait briefly for the sender, then cancel it if still running.
            try:
                await asyncio.wait_for(send_task, timeout=1.0)
            except asyncio.TimeoutError:
                send_task.cancel()

            logger.info(f"[{session_id}] Triton stream ended")

    def _send_chunk(self, session_id: str, audio_chunk: np.ndarray, is_final: bool):
        """Send one audio chunk into the open stream (blocking; run in executor).

        Args:
            session_id: Short stream identifier, sent as a model input.
            audio_chunk: int16 PCM samples; may be empty for the final flush.
            is_final: True for the end-of-stream marker chunk.

        Raises:
            InferenceServerException: If the stream submission fails.
        """
        inputs = []

        # Audio chunk (int16)
        audio_input = grpcclient.InferInput("audio_chunk", [len(audio_chunk)], "INT16")
        audio_input.set_data_from_numpy(audio_chunk)
        inputs.append(audio_input)

        # Sample rate
        sr_input = grpcclient.InferInput("sample_rate", [1], "INT32")
        sr_input.set_data_from_numpy(np.array([self.sample_rate], dtype=np.int32))
        inputs.append(sr_input)

        # Is final flag
        final_input = grpcclient.InferInput("is_final", [1], "BOOL")
        final_input.set_data_from_numpy(np.array([is_final], dtype=np.bool_))
        inputs.append(final_input)

        # Session ID (BYTES tensors are built from object arrays)
        session_input = grpcclient.InferInput("session_id", [1], "BYTES")
        session_input.set_data_from_numpy(np.array([session_id], dtype=np.object_))
        inputs.append(session_input)

        outputs = [
            grpcclient.InferRequestedOutput("transcript"),
            grpcclient.InferRequestedOutput("is_final"),
            grpcclient.InferRequestedOutput("confidence"),
        ]

        try:
            # Headers were attached at start_stream(); not needed per-request.
            self._client.async_stream_infer(
                model_name=self.config.model_name,
                inputs=inputs,
                outputs=outputs,
            )
        except InferenceServerException as e:
            logger.error(f"[{session_id}] Inference error: {e}")
            raise

    async def __aenter__(self):
        """Async context manager entry: connect and return self."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: disconnect."""
        await self.disconnect()
proto/generate.sh ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Generate Python gRPC code from proto files
#
# Usage:
#   ./proto/generate.sh
#
# Requirements:
#   pip install grpcio-tools

set -e

# Resolve all paths relative to this script so it works from any CWD.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
project_dir="$(dirname "$script_dir")"
proto_src="$script_dir"
gen_dir="$project_dir/src/nemotron_speech/grpc_gen"

echo "Generating Python gRPC code..."
echo "  Proto dir: $proto_src"
echo "  Output dir: $gen_dir"

mkdir -p "$gen_dir"

# Run protoc once over all three proto files.
python3 -m grpc_tools.protoc \
    --proto_path="$proto_src" \
    --python_out="$gen_dir" \
    --grpc_python_out="$gen_dir" \
    "$proto_src/riva_audio.proto" \
    "$proto_src/riva_asr.proto" \
    "$proto_src/health.proto"

# Write the package __init__, re-exporting the generated symbols.
cat > "$gen_dir/__init__.py" << 'EOF'
"""Generated gRPC code for Riva-compatible ASR service."""

from .riva_audio_pb2 import AudioEncoding
from .riva_asr_pb2 import (
    StreamingRecognizeRequest,
    StreamingRecognizeResponse,
    StreamingRecognitionConfig,
    RecognitionConfig,
    StreamingRecognitionResult,
    SpeechRecognitionAlternative,
    RecognizeRequest,
    RecognizeResponse,
    CustomConfiguration,
)
from .riva_asr_pb2_grpc import (
    RivaSpeechRecognitionServicer,
    RivaSpeechRecognitionStub,
    add_RivaSpeechRecognitionServicer_to_server,
)
from .health_pb2 import HealthCheckRequest, HealthCheckResponse
from .health_pb2_grpc import HealthServicer, HealthStub, add_HealthServicer_to_server

__all__ = [
    # Audio
    "AudioEncoding",
    # ASR messages
    "StreamingRecognizeRequest",
    "StreamingRecognizeResponse",
    "StreamingRecognitionConfig",
    "RecognitionConfig",
    "StreamingRecognitionResult",
    "SpeechRecognitionAlternative",
    "RecognizeRequest",
    "RecognizeResponse",
    "CustomConfiguration",
    # ASR service
    "RivaSpeechRecognitionServicer",
    "RivaSpeechRecognitionStub",
    "add_RivaSpeechRecognitionServicer_to_server",
    # Health
    "HealthCheckRequest",
    "HealthCheckResponse",
    "HealthServicer",
    "HealthStub",
    "add_HealthServicer_to_server",
]
EOF

# protoc emits absolute imports ("import xxx_pb2"), which break inside a
# package; rewrite them to relative imports ("from . import xxx_pb2").
for gen_file in "$gen_dir"/*_pb2*.py; do
    [[ -f "$gen_file" ]] || continue
    sed -i 's/^import riva_audio_pb2/from . import riva_audio_pb2/g' "$gen_file"
    sed -i 's/^import riva_asr_pb2/from . import riva_asr_pb2/g' "$gen_file"
    sed -i 's/^import health_pb2/from . import health_pb2/g' "$gen_file"
done

echo "Done! Generated files in $gen_dir:"
ls -la "$gen_dir"
proto/health.proto ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // gRPC Health Checking Protocol (standard)
4
+ // https://github.com/grpc/grpc/blob/master/doc/health-checking.md
5
+
6
+ syntax = "proto3";
7
+
8
+ package grpc.health.v1;
9
+
10
+ option java_package = "io.grpc.health.v1";
11
+ option java_outer_classname = "HealthProto";
12
+
13
+ // Health checking service.
14
+ service Health {
15
+ // Check the health of a service.
16
+ rpc Check(HealthCheckRequest) returns (HealthCheckResponse);
17
+
18
+ // Watch the health of a service (streaming).
19
+ rpc Watch(HealthCheckRequest) returns (stream HealthCheckResponse);
20
+ }
21
+
22
+ message HealthCheckRequest {
23
+ // The service name to check. Empty string checks the server overall.
24
+ string service = 1;
25
+ }
26
+
27
+ message HealthCheckResponse {
28
+ enum ServingStatus {
29
+ UNKNOWN = 0;
30
+ SERVING = 1;
31
+ NOT_SERVING = 2;
32
+ SERVICE_UNKNOWN = 3; // Used only by Watch
33
+ }
34
+ ServingStatus status = 1;
35
+ }
proto/riva_asr.proto ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // Riva-compatible ASR proto definitions (subset)
4
+ // Compatible with nvidia-riva-client for seamless integration
5
+
6
+ syntax = "proto3";
7
+
8
+ package nvidia.riva.asr;
9
+
10
+ option java_package = "com.nvidia.riva.asr";
11
+ option java_outer_classname = "RivaAsrProto";
12
+
13
+ import "riva_audio.proto";
14
+
15
+ // The RivaSpeechRecognition service provides streaming speech recognition.
16
+ service RivaSpeechRecognition {
17
+ // Performs bidirectional streaming speech recognition.
18
+ // Send audio data and receive transcription results in real-time.
19
+ rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {}
20
+
21
+ // Performs synchronous (non-streaming) speech recognition.
22
+ // Send complete audio and receive full transcription.
23
+ rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {}
24
+ }
25
+
26
+ // Request message for streaming recognition.
27
+ message StreamingRecognizeRequest {
28
+ oneof streaming_request {
29
+ // The streaming configuration. Must be the first message sent.
30
+ StreamingRecognitionConfig streaming_config = 1;
31
+
32
+ // Audio content to be recognized. Sequential chunks of audio data.
33
+ bytes audio_content = 2;
34
+ }
35
+ }
36
+
37
+ // Configuration for streaming recognition.
38
+ message StreamingRecognitionConfig {
39
+ // Required. Configuration for the recognition.
40
+ RecognitionConfig config = 1;
41
+
42
+ // If true, interim results may be returned as they become available.
43
+ bool interim_results = 2;
44
+ }
45
+
46
+ // Configuration for recognition request.
47
+ message RecognitionConfig {
48
+ // Encoding of audio data sent in all RecognitionAudio messages.
49
+ AudioEncoding encoding = 1;
50
+
51
+ // Sample rate in Hertz of the audio data. Must be 16000 for Nemotron.
52
+ int32 sample_rate_hertz = 2;
53
+
54
+ // Language code (e.g., "en-US"). Currently only English supported.
55
+ string language_code = 3;
56
+
57
+ // Maximum number of recognition hypotheses to return.
58
+ // Currently only 1 is supported.
59
+ int32 max_alternatives = 4;
60
+
61
+ // If true, adds punctuation to recognition result hypotheses.
62
+ // Note: Nemotron model handles punctuation internally.
63
+ bool enable_automatic_punctuation = 11;
64
+
65
+ // If true, the recognizer will detect word time offsets.
66
+ // Note: Not currently supported, will be ignored.
67
+ bool enable_word_time_offsets = 8;
68
+
69
+ // Metadata about the audio being sent.
70
+ RecognitionMetadata metadata = 9;
71
+
72
+ // Custom configuration for model-specific parameters.
73
+ CustomConfiguration custom_configuration = 24;
74
+ }
75
+
76
+ // Metadata about the audio being recognized.
77
+ message RecognitionMetadata {
78
+ // The original source of the audio (e.g., "microphone", "file").
79
+ string audio_source = 1;
80
+ }
81
+
82
+ // Custom configuration for Nemotron-specific parameters.
83
+ message CustomConfiguration {
84
+ // Right context for streaming (controls latency/accuracy tradeoff)
85
+ // 0 = ~80ms, 1 = ~160ms (default), 6 = ~560ms, 13 = ~1.12s
86
+ int32 right_context = 1;
87
+ }
88
+
89
+ // Response message for streaming recognition.
90
+ message StreamingRecognizeResponse {
91
+ // Streaming recognition results.
92
+ repeated StreamingRecognitionResult results = 1;
93
+ }
94
+
95
+ // A streaming recognition result corresponding to a portion of the audio.
96
+ message StreamingRecognitionResult {
97
+ // May contain one or more recognition hypotheses.
98
+ repeated SpeechRecognitionAlternative alternatives = 1;
99
+
100
+ // If true, this is the final result. No further results will be
101
+ // returned for this portion of audio.
102
+ bool is_final = 2;
103
+
104
+ // Stability of the result (0.0 to 1.0). Higher is more stable.
105
+ float stability = 3;
106
+
107
+ // Amount of audio processed so far, in seconds, relative to the beginning of the audio.
108
+ float audio_processed = 4;
109
+ }
110
+
111
+ // Alternative hypotheses (alias recognition results).
112
+ message SpeechRecognitionAlternative {
113
+ // Transcript text representing the words the user spoke.
114
+ string transcript = 1;
115
+
116
+ // Confidence estimate (0.0 to 1.0). Higher is better.
117
+ float confidence = 2;
118
+
119
+ // Word-level information (if enable_word_time_offsets was set).
120
+ repeated WordInfo words = 3;
121
+ }
122
+
123
+ // Word-level information for recognized words.
124
+ message WordInfo {
125
+ // Time offset of the start of this word, relative to the beginning of the audio.
126
+ float start_time = 1;
127
+
128
+ // Time offset of the end of this word, relative to the beginning of the audio.
129
+ float end_time = 2;
130
+
131
+ // The word corresponding to this set of information.
132
+ string word = 3;
133
+
134
+ // Confidence estimate for this word (0.0 to 1.0).
135
+ float confidence = 4;
136
+ }
137
+
138
+ // Request for non-streaming (batch) recognition.
139
+ message RecognizeRequest {
140
+ // Required. Configuration for the recognition.
141
+ RecognitionConfig config = 1;
142
+
143
+ // Required. The audio data to be recognized.
144
+ bytes audio = 2;
145
+ }
146
+
147
+ // Response for non-streaming recognition.
148
+ message RecognizeResponse {
149
+ // Recognition results.
150
+ repeated SpeechRecognitionResult results = 1;
151
+ }
152
+
153
+ // A non-streaming recognition result.
154
+ message SpeechRecognitionResult {
155
+ // May contain one or more recognition hypotheses.
156
+ repeated SpeechRecognitionAlternative alternatives = 1;
157
+
158
+ // For multi-channel audio, this is the channel number.
159
+ int32 channel_tag = 2;
160
+
161
+ // Time offset of the audio that generated this result.
162
+ float audio_processed = 3;
163
+ }
proto/riva_audio.proto ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // Riva-compatible audio encoding definitions
4
+
5
+ syntax = "proto3";
6
+
7
+ package nvidia.riva.asr;
8
+
9
+ option java_package = "com.nvidia.riva.asr";
10
+ option java_outer_classname = "RivaAudioProto";
11
+
12
+ // Audio encoding types supported by the ASR service.
13
+ enum AudioEncoding {
14
+ // Not specified. Will be treated as LINEAR_PCM.
15
+ ENCODING_UNSPECIFIED = 0;
16
+
17
+ // Uncompressed 16-bit signed little-endian samples (Linear PCM).
18
+ // This is the only encoding supported by Nemotron ASR.
19
+ LINEAR_PCM = 1;
20
+
21
+ // FLAC (Free Lossless Audio Codec) encoded audio.
22
+ // Note: Not currently supported, will return error.
23
+ FLAC = 2;
24
+
25
+ // μ-law encoded audio.
26
+ // Note: Not currently supported, will return error.
27
+ MULAW = 3;
28
+
29
+ // A-law encoded audio.
30
+ // Note: Not currently supported, will return error.
31
+ ALAW = 20;
32
+ }
test_triton_asr.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test client for Nemotron ASR with Triton Inference Server.
4
+
5
+ This client demonstrates streaming ASR using Triton's gRPC interface
6
+ with decoupled mode for bidirectional streaming.
7
+
8
+ Usage:
9
+ # From microphone
10
+ python test_triton_asr.py --server localhost:8001
11
+
12
+ # From file
13
+ python test_triton_asr.py --server localhost:8001 --file audio.wav
14
+ """
15
+
16
+ import argparse
17
+ import asyncio
18
+ import time
19
+ import uuid
20
+ import wave
21
+ from pathlib import Path
22
+ from functools import partial
23
+
24
+ import numpy as np
25
+
26
+ # Triton client - use synchronous client for streaming
27
+ import tritonclient.grpc as grpcclient
28
+ from tritonclient.utils import InferenceServerException
29
+
30
+
31
+ def stream_callback(user_data, result, error):
32
+ """Callback for streaming responses."""
33
+ if error:
34
+ user_data["errors"].append(str(error))
35
+ else:
36
+ try:
37
+ transcript = result.as_numpy("transcript")[0]
38
+ if isinstance(transcript, bytes):
39
+ transcript = transcript.decode('utf-8')
40
+
41
+ is_final = result.as_numpy("is_final")[0]
42
+ confidence = result.as_numpy("confidence")[0]
43
+
44
+ user_data["results"].append({
45
+ "transcript": transcript,
46
+ "is_final": is_final,
47
+ "confidence": confidence
48
+ })
49
+
50
+ if is_final:
51
+ print(f"\n[FINAL] {transcript} (confidence: {confidence:.2f})")
52
+ elif transcript:
53
+ print(f"\r[interim] {transcript}", end="", flush=True)
54
+ except Exception as e:
55
+ user_data["errors"].append(str(e))
56
+
57
+
58
+ class TritonASRClient:
59
+ """Streaming ASR client for Triton."""
60
+
61
+ def __init__(self, server: str):
62
+ self.server = server
63
+ self.client = None
64
+ self.sample_rate = 16000
65
+ self.chunk_size = 1600 # 100ms at 16kHz
66
+ self.session_id = str(uuid.uuid4())[:8]
67
+
68
+ def connect(self):
69
+ """Connect to Triton server."""
70
+ self.client = grpcclient.InferenceServerClient(url=self.server)
71
+
72
+ # Check server health
73
+ if not self.client.is_server_live():
74
+ raise RuntimeError("Triton server is not live")
75
+
76
+ if not self.client.is_server_ready():
77
+ raise RuntimeError("Triton server is not ready")
78
+
79
+ print(f"Connected to Triton at {self.server}")
80
+ print(f"Session ID: {self.session_id}")
81
+
82
+ def transcribe_file(self, file_path: str):
83
+ """Transcribe audio from a file."""
84
+ path = Path(file_path)
85
+ if not path.exists():
86
+ print(f"Error: File not found: {file_path}")
87
+ return
88
+
89
+ with wave.open(str(path), 'rb') as wf:
90
+ if wf.getframerate() != 16000:
91
+ print(f"Warning: Expected 16kHz, got {wf.getframerate()}Hz")
92
+ if wf.getnchannels() != 1:
93
+ print(f"Warning: Expected mono, got {wf.getnchannels()} channels")
94
+
95
+ frames = wf.readframes(wf.getnframes())
96
+
97
+ audio_np = np.frombuffer(frames, dtype=np.int16)
98
+ print(f"File: {file_path}")
99
+ print(f"Duration: {len(audio_np) / self.sample_rate:.2f}s")
100
+ print()
101
+
102
+ # Set up streaming callback
103
+ user_data = {"results": [], "errors": []}
104
+
105
+ # Start stream
106
+ self.client.start_stream(callback=partial(stream_callback, user_data))
107
+
108
+ try:
109
+ # Process in chunks
110
+ chunk_samples = self.chunk_size
111
+
112
+ for i in range(0, len(audio_np), chunk_samples):
113
+ chunk = audio_np[i:i + chunk_samples]
114
+ is_final = (i + chunk_samples >= len(audio_np))
115
+
116
+ self._send_chunk(chunk, is_final)
117
+
118
+ # Small delay to allow responses to come back
119
+ time.sleep(0.05)
120
+
121
+ # Wait for final responses
122
+ time.sleep(1.0)
123
+
124
+ finally:
125
+ self.client.stop_stream()
126
+
127
+ # Print any errors
128
+ for error in user_data["errors"]:
129
+ print(f"Error: {error}")
130
+
131
+ def transcribe_microphone(self, duration: float = 30.0):
132
+ """Transcribe from microphone."""
133
+ try:
134
+ import pyaudio
135
+ except ImportError:
136
+ print("Error: pyaudio not installed. Run: pip install pyaudio")
137
+ return
138
+
139
+ p = pyaudio.PyAudio()
140
+
141
+ # Use default device
142
+ device_info = p.get_default_input_device_info()
143
+ print(f"Using device: {device_info['name']}")
144
+ print(f"Recording for {duration}s. Press Ctrl+C to stop.\n")
145
+
146
+ stream = p.open(
147
+ format=pyaudio.paInt16,
148
+ channels=1,
149
+ rate=self.sample_rate,
150
+ input=True,
151
+ frames_per_buffer=self.chunk_size,
152
+ )
153
+
154
+ # Set up streaming callback
155
+ user_data = {"results": [], "errors": []}
156
+
157
+ # Start stream
158
+ self.client.start_stream(callback=partial(stream_callback, user_data))
159
+
160
+ start_time = time.time()
161
+ try:
162
+ while time.time() - start_time < duration:
163
+ data = stream.read(self.chunk_size, exception_on_overflow=False)
164
+ audio_np = np.frombuffer(data, dtype=np.int16)
165
+ self._send_chunk(audio_np, is_final=False)
166
+ except KeyboardInterrupt:
167
+ pass
168
+ finally:
169
+ # Send final chunk
170
+ self._send_chunk(np.array([], dtype=np.int16), is_final=True)
171
+
172
+ # Wait for final responses
173
+ time.sleep(0.5)
174
+
175
+ self.client.stop_stream()
176
+
177
+ stream.stop_stream()
178
+ stream.close()
179
+ p.terminate()
180
+
181
+ # Print any errors
182
+ for error in user_data["errors"]:
183
+ print(f"Error: {error}")
184
+
185
+ def _send_chunk(self, audio_chunk: np.ndarray, is_final: bool):
186
+ """Send audio chunk to Triton."""
187
+
188
+ # Create inputs
189
+ inputs = []
190
+
191
+ # Audio chunk (int16)
192
+ audio_input = grpcclient.InferInput("audio_chunk", [len(audio_chunk)], "INT16")
193
+ audio_input.set_data_from_numpy(audio_chunk)
194
+ inputs.append(audio_input)
195
+
196
+ # Sample rate
197
+ sr_input = grpcclient.InferInput("sample_rate", [1], "INT32")
198
+ sr_input.set_data_from_numpy(np.array([self.sample_rate], dtype=np.int32))
199
+ inputs.append(sr_input)
200
+
201
+ # Is final flag
202
+ final_input = grpcclient.InferInput("is_final", [1], "BOOL")
203
+ final_input.set_data_from_numpy(np.array([is_final], dtype=np.bool_))
204
+ inputs.append(final_input)
205
+
206
+ # Session ID
207
+ session_input = grpcclient.InferInput("session_id", [1], "BYTES")
208
+ session_input.set_data_from_numpy(np.array([self.session_id], dtype=np.object_))
209
+ inputs.append(session_input)
210
+
211
+ # Outputs
212
+ outputs = [
213
+ grpcclient.InferRequestedOutput("transcript"),
214
+ grpcclient.InferRequestedOutput("is_final"),
215
+ grpcclient.InferRequestedOutput("confidence"),
216
+ ]
217
+
218
+ try:
219
+ # Send async request through the stream
220
+ self.client.async_stream_infer(
221
+ model_name="nemotron_asr",
222
+ inputs=inputs,
223
+ outputs=outputs,
224
+ )
225
+ except InferenceServerException as e:
226
+ print(f"Inference error: {e}")
227
+
228
+
229
+ def main():
230
+ parser = argparse.ArgumentParser(description="Triton ASR Test Client")
231
+ parser.add_argument(
232
+ "--server",
233
+ default="localhost:8001",
234
+ help="Triton gRPC server address (host:port)"
235
+ )
236
+ parser.add_argument(
237
+ "--file",
238
+ type=str,
239
+ help="Audio file to transcribe (WAV, 16kHz mono)"
240
+ )
241
+ parser.add_argument(
242
+ "--duration",
243
+ type=float,
244
+ default=30.0,
245
+ help="Recording duration for microphone input (seconds)"
246
+ )
247
+ args = parser.parse_args()
248
+
249
+ client = TritonASRClient(args.server)
250
+
251
+ try:
252
+ client.connect()
253
+
254
+ if args.file:
255
+ client.transcribe_file(args.file)
256
+ else:
257
+ client.transcribe_microphone(args.duration)
258
+
259
+ except Exception as e:
260
+ print(f"Error: {e}")
261
+ import traceback
262
+ traceback.print_exc()
263
+
264
+
265
+ if __name__ == "__main__":
266
+ main()
web/src/App.tsx CHANGED
@@ -10,9 +10,7 @@ import type {
10
  ServerMessage,
11
  TranscriptEntry,
12
  FileTranscriptResponse,
13
- AttentionContextSize,
14
  } from './types/messages';
15
- import { ATTENTION_CONTEXT_OPTIONS } from './types/messages';
16
 
17
  function App() {
18
  // Transcript state
@@ -24,11 +22,6 @@ function App() {
24
  const [isUploading, setIsUploading] = useState(false);
25
  const [uploadError, setUploadError] = useState<string | null>(null);
26
 
27
- // Attention context state (default: [70, 0])
28
- const [attentionContext, setAttentionContext] = useState<AttentionContextSize>(
29
- ATTENTION_CONTEXT_OPTIONS[0].value
30
- );
31
-
32
  // Handle incoming WebSocket messages
33
  const handleMessage = useCallback((message: ServerMessage) => {
34
  switch (message.type) {
@@ -103,9 +96,9 @@ function App() {
103
 
104
  // Handle recording start
105
  const handleStartRecording = useCallback(async () => {
106
- sendMessage({ type: 'start_stream', att_context_size: attentionContext });
107
  await startRecording();
108
- }, [sendMessage, startRecording, attentionContext]);
109
 
110
  // Handle recording stop
111
  const handleStopRecording = useCallback(() => {
@@ -212,12 +205,12 @@ function App() {
212
  style={{ backgroundColor: 'transparent' }}
213
  />
214
  <h1 className="font-sans text-2xl md:text-3xl font-bold text-nvidia-green">
215
- Nemotron Speech ASR
216
  </h1>
217
  </div>
218
  <p className="text-surface-400 text-sm">
219
  Real-time speech recognition powered by{' '}
220
- <span className="text-nvidia-green font-medium">NVIDIA NeMo</span>
221
  </p>
222
  </header>
223
 
@@ -266,12 +259,10 @@ function App() {
266
  connectionState={connectionState}
267
  audioDevices={audioDevices}
268
  selectedDevice={selectedDevice}
269
- attentionContext={attentionContext}
270
  onStartRecording={handleStartRecording}
271
  onStopRecording={handleStopRecording}
272
  onReset={handleReset}
273
  onDeviceChange={selectDevice}
274
- onAttentionContextChange={setAttentionContext}
275
  onFileUpload={handleFileUpload}
276
  onExport={handleExport}
277
  hasTranscript={hasTranscript}
@@ -313,8 +304,8 @@ function App() {
313
  </div>
314
  <p className="mt-2">
315
  Built with{' '}
316
- <span className="text-nvidia-green">NVIDIA NeMo</span>
317
- {' '}Cache-Aware Streaming •{' '}
318
  <a
319
  href="https://github.com/NVIDIA/NeMo"
320
  target="_blank"
 
10
  ServerMessage,
11
  TranscriptEntry,
12
  FileTranscriptResponse,
 
13
  } from './types/messages';
 
14
 
15
  function App() {
16
  // Transcript state
 
22
  const [isUploading, setIsUploading] = useState(false);
23
  const [uploadError, setUploadError] = useState<string | null>(null);
24
 
 
 
 
 
 
25
  // Handle incoming WebSocket messages
26
  const handleMessage = useCallback((message: ServerMessage) => {
27
  switch (message.type) {
 
96
 
97
  // Handle recording start
98
  const handleStartRecording = useCallback(async () => {
99
+ sendMessage({ type: 'start_stream' });
100
  await startRecording();
101
+ }, [sendMessage, startRecording]);
102
 
103
  // Handle recording stop
104
  const handleStopRecording = useCallback(() => {
 
205
  style={{ backgroundColor: 'transparent' }}
206
  />
207
  <h1 className="font-sans text-2xl md:text-3xl font-bold text-nvidia-green">
208
+ Nemotron Speech Streaming
209
  </h1>
210
  </div>
211
  <p className="text-surface-400 text-sm">
212
  Real-time speech recognition powered by{' '}
213
+ <span className="text-nvidia-green font-medium">NVIDIA Triton</span>
214
  </p>
215
  </header>
216
 
 
259
  connectionState={connectionState}
260
  audioDevices={audioDevices}
261
  selectedDevice={selectedDevice}
 
262
  onStartRecording={handleStartRecording}
263
  onStopRecording={handleStopRecording}
264
  onReset={handleReset}
265
  onDeviceChange={selectDevice}
 
266
  onFileUpload={handleFileUpload}
267
  onExport={handleExport}
268
  hasTranscript={hasTranscript}
 
304
  </div>
305
  <p className="mt-2">
306
  Built with{' '}
307
+ <span className="text-nvidia-green">NVIDIA Triton</span>
308
+ {' '}Inference Server •{' '}
309
  <a
310
  href="https://github.com/NVIDIA/NeMo"
311
  target="_blank"
web/src/components/ControlBar.tsx CHANGED
@@ -1,18 +1,15 @@
1
- import { Mic, MicOff, RotateCcw, Upload, Download, Settings, Sliders } from 'lucide-react';
2
- import type { RecordingState, ConnectionState, AudioDevice, AttentionContextSize } from '../types/messages';
3
- import { ATTENTION_CONTEXT_OPTIONS } from '../types/messages';
4
 
5
  interface ControlBarProps {
6
  recordingState: RecordingState;
7
  connectionState: ConnectionState;
8
  audioDevices: AudioDevice[];
9
  selectedDevice: string | null;
10
- attentionContext: AttentionContextSize;
11
  onStartRecording: () => void;
12
  onStopRecording: () => void;
13
  onReset: () => void;
14
  onDeviceChange: (deviceId: string) => void;
15
- onAttentionContextChange: (value: AttentionContextSize) => void;
16
  onFileUpload: (file: File) => void;
17
  onExport: () => void;
18
  hasTranscript: boolean;
@@ -23,12 +20,10 @@ export function ControlBar({
23
  connectionState,
24
  audioDevices,
25
  selectedDevice,
26
- attentionContext,
27
  onStartRecording,
28
  onStopRecording,
29
  onReset,
30
  onDeviceChange,
31
- onAttentionContextChange,
32
  onFileUpload,
33
  onExport,
34
  hasTranscript,
@@ -45,19 +40,6 @@ export function ControlBar({
45
  }
46
  };
47
 
48
- const handleAttentionContextChange = (e: React.ChangeEvent<HTMLSelectElement>) => {
49
- const idx = parseInt(e.target.value, 10);
50
- const option = ATTENTION_CONTEXT_OPTIONS[idx];
51
- if (option) {
52
- onAttentionContextChange(option.value);
53
- }
54
- };
55
-
56
- // Find current attention context index
57
- const currentAttentionIdx = ATTENTION_CONTEXT_OPTIONS.findIndex(
58
- (opt) => opt.value[0] === attentionContext[0] && opt.value[1] === attentionContext[1]
59
- );
60
-
61
  return (
62
  <div className="flex flex-col gap-4">
63
  {/* Device selector */}
@@ -88,30 +70,6 @@ export function ControlBar({
88
  </select>
89
  </div>
90
 
91
- {/* Attention context selector */}
92
- <div className="flex items-center gap-3">
93
- <Sliders className="w-4 h-4 text-surface-400" />
94
- <select
95
- value={currentAttentionIdx >= 0 ? currentAttentionIdx : 0}
96
- onChange={handleAttentionContextChange}
97
- disabled={isRecording}
98
- className={`
99
- flex-1 px-3 py-2 rounded-lg
100
- bg-surface-800 border border-surface-600
101
- text-sm text-surface-200
102
- focus:outline-none focus:border-nvidia-green/50
103
- disabled:opacity-50 disabled:cursor-not-allowed
104
- transition-colors
105
- `}
106
- >
107
- {ATTENTION_CONTEXT_OPTIONS.map((option, idx) => (
108
- <option key={idx} value={idx}>
109
- {option.label}
110
- </option>
111
- ))}
112
- </select>
113
- </div>
114
-
115
  {/* Main controls */}
116
  <div className="flex items-center justify-center gap-4">
117
  {/* Record button */}
 
1
+ import { Mic, MicOff, RotateCcw, Upload, Download, Settings } from 'lucide-react';
2
+ import type { RecordingState, ConnectionState, AudioDevice } from '../types/messages';
 
3
 
4
  interface ControlBarProps {
5
  recordingState: RecordingState;
6
  connectionState: ConnectionState;
7
  audioDevices: AudioDevice[];
8
  selectedDevice: string | null;
 
9
  onStartRecording: () => void;
10
  onStopRecording: () => void;
11
  onReset: () => void;
12
  onDeviceChange: (deviceId: string) => void;
 
13
  onFileUpload: (file: File) => void;
14
  onExport: () => void;
15
  hasTranscript: boolean;
 
20
  connectionState,
21
  audioDevices,
22
  selectedDevice,
 
23
  onStartRecording,
24
  onStopRecording,
25
  onReset,
26
  onDeviceChange,
 
27
  onFileUpload,
28
  onExport,
29
  hasTranscript,
 
40
  }
41
  };
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  return (
44
  <div className="flex flex-col gap-4">
45
  {/* Device selector */}
 
70
  </select>
71
  </div>
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  {/* Main controls */}
74
  <div className="flex items-center justify-center gap-4">
75
  {/* Record button */}
web/src/types/messages.ts CHANGED
@@ -36,21 +36,9 @@ export type ServerMessage =
36
  | SessionStartedMessage
37
  | SessionEndedMessage;
38
 
39
- // Attention context size options
40
- // [left, right] - MEDIUM parameter (cache reset, no buffer rebuild)
41
- export type AttentionContextSize = [number, number];
42
-
43
- export const ATTENTION_CONTEXT_OPTIONS: { label: string; value: AttentionContextSize }[] = [
44
- { label: 'Default (70, 13)', value: [70, 13] },
45
- { label: 'Balanced (70, 6)', value: [70, 6] },
46
- { label: 'Low latency (70, 1)', value: [70, 1] },
47
- { label: 'Lowest latency (70, 0)', value: [70, 0] },
48
- ];
49
-
50
  // Client messages
51
  export interface StartStreamMessage {
52
  type: 'start_stream';
53
- att_context_size?: AttentionContextSize;
54
  }
55
 
56
  export interface EndStreamMessage {
@@ -96,4 +84,3 @@ export interface FileTranscriptResponse {
96
  latency_ms: number;
97
  }[];
98
  }
99
-
 
36
  | SessionStartedMessage
37
  | SessionEndedMessage;
38
 
 
 
 
 
 
 
 
 
 
 
 
39
  // Client messages
40
  export interface StartStreamMessage {
41
  type: 'start_stream';
 
42
  }
43
 
44
  export interface EndStreamMessage {
 
84
  latency_ms: number;
85
  }[];
86
  }