fix: Improve Whisper server startup with async polling and reduce logs

- Make server startup async to avoid Bun's 10s timeout
- Add frontend polling to detect when server is ready
- Use PowerShell Get-NetTCPConnection for reliable port detection
- Add starting state to prevent multiple simultaneous starts
- Reduce verbose logging, keep only essential info
- Add dev-dist and nul to gitignore
2026-02-14 01:02:54 -06:00
parent 9f1e10b8d5
commit 5be0fb91ab
5 changed files with 180 additions and 73 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -50,7 +50,9 @@
      "mcp__agent-ui__localhost_4100-notificar",
      "mcp__agent-ui__localhost_4100-enviar_al_panel",
      "mcp__agent-ui__localhost_4100-render_html",
-      "mcp__agent-ui__localhost_4100-load_vue_component"
+      "mcp__agent-ui__localhost_4100-load_vue_component",
      "mcp__agent-ui__localhost_4100-page_refresh",
      "WebFetch(domain:docs.anthropic.com)"
    ]
  },
  "enableAllProjectMcpServers": true,
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@ frontend/node_modules/
 .env
 *.log
 dist/
 frontend/dev-dist/
 nul
--- a/frontend/src/components/FloatingVoice.vue
+++ b/frontend/src/components/FloatingVoice.vue
@@ -135,35 +135,59 @@ function initRecognition() {
 // ============ WHISPER FUNCTIONS ============
-async function checkWhisperStatus() {
+async function checkWhisperStatus(updateLoading = true) {
  try {
    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/status`)
    const data = await res.json()
    useWhisper.value = data.enabled
    whisperReady.value = data.running
    if (updateLoading) {
      whisperLoading.value = data.starting || false
    }
    return data
  } catch {
    useWhisper.value = false
    whisperReady.value = false
    if (updateLoading) {
      whisperLoading.value = false
    }
    return null
  }
 }
 async function toggleWhisperMode() {
  // Prevent multiple clicks
  if (whisperLoading.value) {
    console.log('[Voice] Toggle already in progress, ignoring')
    return
  }
  whisperLoading.value = true
  error.value = ''
  // Show immediate feedback
  if (!useWhisper.value) {
    canvasStore.showNotification('Starting Whisper GPU server...', 'info', 10000)
  }
  try {
    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/toggle`, {
      method: 'POST'
    })
    const data = await res.json()
    // Server is starting - poll until ready
    if (data.starting) {
      console.log('[Voice] Server starting, polling for status...')
      await pollWhisperStatus()
      return
    }
    useWhisper.value = data.enabled
    whisperReady.value = data.running
    if (data.enabled) {
-      canvasStore.showNotification('Whisper GPU enabled', 'success')
+      canvasStore.showNotification('Whisper GPU ready!', 'success')
      connectWhisperSocket()
    } else {
      canvasStore.showNotification('Using Web Speech API', 'info')
@@ -171,12 +195,61 @@ async function toggleWhisperMode() {
    }
  } catch (e: any) {
    error.value = 'Failed to toggle Whisper'
    canvasStore.showNotification('Error starting Whisper server', 'error')
    console.error('[Voice] Whisper toggle error:', e)
  } finally {
    whisperLoading.value = false
  }
 }
 /**
  * Poll the Whisper status endpoint until the server is ready, has failed,
  * or the attempt budget is exhausted.
  *
  * Each iteration waits 2 seconds, then calls `checkWhisperStatus(false)`.
  * The outcome of a poll is one of:
  *  - no status returned: log and retry;
  *  - `status.starting` is truthy: log elapsed time and retry;
  *  - `status.running && status.enabled`: show a success notification,
  *    open the Whisper socket, clear `whisperLoading`, and stop;
  *  - anything else: show a failure notification, clear `whisperLoading`,
  *    and stop.
  * If every attempt is used up, a timeout notification is shown and
  * `whisperLoading` is cleared.
  *
  * @returns A promise that resolves (with no value) once polling has stopped.
  */
 async function pollWhisperStatus() {
  const maxAttempts = 60  // 60 attempts x 2 s delay = about 2 minutes of waiting (request time is extra)
  let attempts = 0
  while (attempts < maxAttempts) {
    // Sleep first so the server gets time to make progress before each check
    await new Promise(resolve => setTimeout(resolve, 2000))
    attempts++
    try {
      const status = await checkWhisperStatus(false)  // false: leave whisperLoading untouched while polling
      if (!status) {
        // checkWhisperStatus returns null when the status request failed; try again
        console.log('[Voice] Failed to get status')
        continue
      }
      // Server reports it is still starting: keep waiting
      if (status.starting) {
        console.log(`[Voice] Still starting... (${attempts * 2}s)`)
        continue
      }
      // No longer starting, and both running and enabled: startup succeeded
      if (status.running && status.enabled) {
        console.log('[Voice] Server ready!')
        canvasStore.showNotification('Whisper GPU ready!', 'success')
        connectWhisperSocket()
        whisperLoading.value = false
        return
      }
      // No longer starting but not running/enabled: treat as a failed start
      console.log('[Voice] Server failed to start')
      canvasStore.showNotification('Whisper server failed to start', 'error')
      whisperLoading.value = false
      return
    } catch (e) {
      // Log and fall through to the next loop iteration
      console.error('[Voice] Polling error:', e)
    }
  }
  // All attempts used without reaching a ready or failed state
  canvasStore.showNotification('Whisper server timeout', 'error')
  whisperLoading.value = false
 }
 function connectWhisperSocket() {
  if (whisperSocket?.readyState === WebSocket.OPEN) return
@@ -671,8 +744,13 @@ onMounted(async () => {
  document.addEventListener('keyup', handleKeyUp, { capture: true })
  // Check Whisper status on mount
-  await checkWhisperStatus()
+  const status = await checkWhisperStatus()
-  if (useWhisper.value) {
+
  // If server is starting (page was reloaded during startup), continue polling
  if (status?.starting) {
    console.log('[Voice] Server is starting, resuming polling...')
    pollWhisperStatus()
  } else if (useWhisper.value) {
    connectWhisperSocket()
  }
 })
@@ -743,8 +821,9 @@ defineExpose({
              <button
                class="whisper-toggle"
                :class="{ active: useWhisper, loading: whisperLoading }"
                :disabled="whisperLoading"
                @click.stop="toggleWhisperMode"
-                :title="useWhisper ? 'Using Whisper GPU - Click to use Web Speech' : 'Using Web Speech - Click to use Whisper GPU'"
+                :title="whisperLoading ? 'Starting Whisper server...' : (useWhisper ? 'Using Whisper GPU - Click to use Web Speech' : 'Using Web Speech - Click to use Whisper GPU')"
              >
                <svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                  <rect x="4" y="4" width="16" height="16" rx="2"/>
@@ -917,10 +996,15 @@ defineExpose({
  transition: all 0.15s;
 }
-.whisper-toggle:hover {
+.whisper-toggle:hover:not(:disabled) {
  background: rgba(255, 255, 255, 0.5);
 }
 .whisper-toggle:disabled {
  cursor: not-allowed;
  opacity: 0.6;
 }
 .whisper-toggle.active {
  background: linear-gradient(180deg, #10b981 0%, #059669 100%);
  border-color: #047857;
--- a/server/services/whisper.ts
+++ b/server/services/whisper.ts
@@ -12,6 +12,7 @@ const WHISPER_SCRIPT = join(import.meta.dir, '..', 'whisper_server.py')
 interface WhisperState {
  enabled: boolean
  running: boolean
  starting: boolean  // Prevents multiple simultaneous start attempts
  process: Subprocess | null
  model: string
  device: string
@@ -20,8 +21,9 @@ interface WhisperState {
 const state: WhisperState = {
  enabled: false,
  running: false,
  starting: false,
  process: null,
-  model: 'medium',
+  model: 'large-v3',
  device: 'cuda'
 }
@@ -46,89 +48,104 @@ async function killProcessOnPort(port: number): Promise<void> {
 * Start the Whisper Python server
 */
 export async function startWhisperServer(): Promise<boolean> {
  // Prevent multiple simultaneous start attempts
  if (state.starting) {
    return false
  }
  if (state.running && state.process) {
    console.log('[Whisper] Server already running')
    return true
  }
-  console.log('[Whisper] ====== STARTING (v3) ======')
+  state.starting = true
-  console.log('[Whisper] Script:', WHISPER_SCRIPT)
+  console.log(`[Whisper] Starting (${state.model})...`)
  // Kill any existing process on the port
  console.log('[Whisper] Cleaning up port', WHISPER_PORT)
  await killProcessOnPort(WHISPER_PORT)
  try {
    // Use Bun.spawn with inherit to show logs directly in console
-    const proc = Bun.spawn(['python', WHISPER_SCRIPT], {
+    // -u flag disables Python output buffering for real-time logs
    const proc = Bun.spawn(['python', '-u', WHISPER_SCRIPT], {
      cwd: join(import.meta.dir, '..'),
      stdout: 'inherit',
      stderr: 'inherit',
-      env: { ...process.env }
+      env: { ...process.env, PYTHONUNBUFFERED: '1' }
    })
    state.process = proc
-    // Wait a bit for the server to start, then check if port is listening
+    // Wait a bit for the server to start
-    await new Promise(resolve => setTimeout(resolve, 3000))
+    await new Promise(resolve => setTimeout(resolve, 2000))
    // Check if process is still running
    if (proc.exitCode !== null) {
      console.error('[Whisper] Process exited with code:', proc.exitCode)
      state.process = null
      state.starting = false
      return false
    }
-    // Check if port is listening (simple TCP check)
+    // Check if WebSocket is ready
    const isListening = await checkPort(WHISPER_PORT)
    if (isListening) {
-      console.log('[Whisper] Server started successfully on port', WHISPER_PORT)
+      console.log('[Whisper] Ready')
      state.running = true
      state.enabled = true
      state.starting = false
      return true
    }
-    // Wait more if model is still loading (up to 90 seconds total)
+    // Wait more if model is still loading (up to 120 seconds total for large models)
-    console.log('[Whisper] Waiting for model to load...')
+    for (let i = 0; i < 40; i++) {
    for (let i = 0; i < 30; i++) {
      await new Promise(resolve => setTimeout(resolve, 3000))
      if (proc.exitCode !== null) {
-        console.error('[Whisper] Process died while loading')
+        console.error('[Whisper] Process died')
        state.process = null
        state.starting = false
        return false
      }
-      if (await checkPort(WHISPER_PORT)) {
+      const ready = await checkPort(WHISPER_PORT)
-        console.log('[Whisper] Server ready!')
+      if (ready) {
        console.log('[Whisper] Ready')
        state.running = true
        state.enabled = true
        state.starting = false
        return true
      }
    }
-    console.log('[Whisper] Timeout waiting for server')
+    console.error('[Whisper] Timeout (120s)')
    state.starting = false
    return false
  } catch (err: any) {
-    console.error('[Whisper] Failed to start:', err.message)
+    console.error('[Whisper] Error:', err.message)
    state.process = null
    state.starting = false
    return false
  }
 }
 /**
- * Check if a port is listening using PowerShell
+ * Check if Whisper WebSocket is ready using PowerShell
 */
 async function checkPort(port: number): Promise<boolean> {
  try {
-    const proc = Bun.spawn(['powershell', '-Command',
+    const proc = Bun.spawn(['powershell', '-NoProfile', '-Command',
-      `if (Get-NetTCPConnection -LocalPort ${port} -State Listen -ErrorAction SilentlyContinue) { exit 0 } else { exit 1 }`
+      `$c = Get-NetTCPConnection -LocalPort ${port} -State Listen -ErrorAction SilentlyContinue; if ($c) { Write-Output 'LISTENING' }`
-    ], { stdout: 'ignore', stderr: 'ignore' })
+    ], {
      stdout: 'pipe',
      stderr: 'ignore'
    })
-    const exitCode = await proc.exited
+    const output = await new Response(proc.stdout).text()
-    return exitCode === 0
+    await proc.exited
    return output.trim() === 'LISTENING'
  } catch {
    return false
  }
@@ -139,35 +156,43 @@ async function checkPort(port: number): Promise<boolean> {
 */
 export function stopWhisperServer(): boolean {
  if (!state.process) {
    console.log('[Whisper] No server running')
    return true
  }
  console.log('[Whisper] Stopping server...')
  try {
    state.process.kill()
    state.process = null
    state.running = false
    state.enabled = false
-    console.log('[Whisper] Server stopped')
+    console.log('[Whisper] Stopped')
    return true
  } catch (err) {
-    console.error('[Whisper] Error stopping server:', err)
+    console.error('[Whisper] Stop error:', err)
    return false
  }
 }
 /**
- * Toggle Whisper server on/off
+ * Toggle Whisper server on/off (async - returns immediately when starting)
 */
-export async function toggleWhisperServer(): Promise<{ enabled: boolean; success: boolean }> {
+export async function toggleWhisperServer(): Promise<{ enabled: boolean; success: boolean; starting: boolean }> {
  // Prevent toggle while starting
  if (state.starting) {
    return { enabled: false, success: false, starting: true }
  }
  if (state.enabled && state.running) {
    const success = stopWhisperServer()
-    return { enabled: false, success }
+    return { enabled: false, success, starting: false }
  } else {
-    const success = await startWhisperServer()
+    // Start server in background - don't await
-    return { enabled: success, success }
+    startWhisperServer().catch(err => {
      console.error('[Whisper] Start error:', err)
      state.starting = false
    })
    // Return immediately - frontend will poll for status
    return { enabled: false, success: true, starting: true }
  }
 }
@@ -177,26 +202,30 @@ export async function toggleWhisperServer(): Promise<{ enabled: boolean; success
 export async function getWhisperState(): Promise<{
  enabled: boolean
  running: boolean
  starting: boolean
  port: number
  model: string
  device: string
 }> {
-  // Check if port is actually listening
+  // Check if port is actually listening (skip if starting to avoid interference)
-  const isListening = await checkPort(WHISPER_PORT)
+  if (!state.starting) {
    const isListening = await checkPort(WHISPER_PORT)
-  // Sync state with reality
+    // Sync state with reality
-  if (isListening && !state.running) {
+    if (isListening && !state.running) {
-    state.running = true
+      state.running = true
-    state.enabled = true
+      state.enabled = true
-  } else if (!isListening && state.running) {
+    } else if (!isListening && state.running) {
-    state.running = false
+      state.running = false
-    state.enabled = false
+      state.enabled = false
-    state.process = null
+      state.process = null
    }
  }
  return {
    enabled: state.enabled,
    running: state.running,
    starting: state.starting,
    port: WHISPER_PORT,
    model: state.model,
    device: state.device
--- a/server/whisper_server.py
+++ b/server/whisper_server.py
@@ -79,10 +79,13 @@ def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> bytes
 # Configuration
 HOST = "localhost"
 PORT = 4104
-MODEL_SIZE = "large-v3"  # tiny, base, small, medium, large-v2, large-v3
+MODEL_SIZE = "large-v3"  # Best standard model for Spanish
 DEVICE = "cuda"  # cuda or cpu
 COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU
 # Model display name (extract from path if needed)
 MODEL_NAME = MODEL_SIZE.split("/")[-1] if "/" in MODEL_SIZE else MODEL_SIZE
 # Spanish context prompt to improve accuracy (Honduras Spanish + tech context)
 INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
 Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
@@ -109,7 +112,7 @@ async def load_model():
        return model
    model_loading = True
-    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")
+    print(f"[Whisper] Loading model '{MODEL_NAME}' on {DEVICE}...")
    try:
        # Load model - this downloads on first run
@@ -140,15 +143,11 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr
    if model is None:
        return {"error": "Model not loaded"}
    print(f"[Whisper] Received {len(audio_data)} bytes of audio data")
    # Convert WebM to WAV if needed
    if is_webm:
        print("[Whisper] Converting WebM to WAV...")
        wav_data = convert_audio_to_wav(audio_data, "webm")
        if wav_data is None:
            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
        print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
    else:
        wav_data = audio_data
@@ -159,7 +158,6 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr
    try:
        # Transcribe with optimized parameters
        print(f"[Whisper] Transcribing {temp_path}...")
        segments, info = model.transcribe(
            temp_path,
            language=language,
@@ -191,7 +189,6 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr
                "text": segment.text
            })
        print(f"[Whisper] Transcription result: '{text.strip()}'")
        return {
            "success": True,
@@ -201,7 +198,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
-            "model": MODEL_SIZE,
+            "model": MODEL_NAME,
            "device": DEVICE
        }
@@ -218,7 +215,6 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr
 async def handle_client(websocket):
    """Handle WebSocket client connection"""
    print(f"[Whisper] Client connected")
    # Ensure model is loaded
    await load_model()
@@ -226,7 +222,7 @@ async def handle_client(websocket):
    # Send ready message
    await websocket.send(json.dumps({
        "type": "ready",
-        "model": MODEL_SIZE,
+        "model": MODEL_NAME,
        "device": DEVICE
    }))
@@ -234,8 +230,6 @@ async def handle_client(websocket):
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data (likely WebM format from browser)
                print(f"[Whisper] Received {len(message)} bytes of binary audio")
                # Transcribe in thread pool to not block
                loop = asyncio.get_event_loop()
                result = await loop.run_in_executor(
@@ -260,8 +254,6 @@ async def handle_client(websocket):
                        language = cmd.get("language", "es")
                        is_partial = cmd.get("partial", False)
                        print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")
                        loop = asyncio.get_event_loop()
                        result = await loop.run_in_executor(
                            None,
@@ -283,7 +275,7 @@ async def handle_client(websocket):
                    elif cmd.get("type") == "status":
                        await websocket.send(json.dumps({
                            "type": "status",
-                            "model": MODEL_SIZE,
+                            "model": MODEL_NAME,
                            "device": DEVICE,
                            "ready": model is not None
                        }))
@@ -295,21 +287,19 @@ async def handle_client(websocket):
                    }))
    except websockets.exceptions.ConnectionClosed:
-        print("[Whisper] Client disconnected")
+        pass
    except Exception as e:
        print(f"[Whisper] Error: {e}")
 async def main():
    """Start WebSocket server"""
-    print(f"[Whisper] Starting server on ws://{HOST}:{PORT}")
+    print(f"[Whisper] Model: {MODEL_NAME} | Device: {DEVICE} | Port: {PORT}")
    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")
    # Pre-load model
    print("[Whisper] Pre-loading model...")
    await load_model()
    async with websockets.serve(handle_client, HOST, PORT):
-        print(f"[Whisper] Server ready! Listening on ws://{HOST}:{PORT}")
+        print(f"[Whisper] Ready")
        await asyncio.Future()  # Run forever
 if __name__ == "__main__":