feat: Add Whisper GPU speech-to-text with progressive transcription

- Add faster-whisper Python server for GPU-accelerated transcription
- Support dual mode: Web Speech API or Whisper GPU (toggleable)
- Progressive transcription every 3 seconds while recording
- Separate terminal server process (stable during hot-reload)
- Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal
- Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status
- Update package.json with separate api/terminal/frontend processes
This commit is contained in:
2026-02-13 23:47:52 -06:00
parent e867b7873e
commit 638e6ac8e0
10 changed files with 1009 additions and 31 deletions

218
server/services/whisper.ts Normal file
View File

@@ -0,0 +1,218 @@
/**
* Whisper Service - Manages the Python Whisper server process
* Provides GPU-accelerated speech-to-text as an alternative to Web Speech API
*/
import { join } from 'path'
import { Subprocess } from 'bun'
// TCP port the Whisper server is expected to listen on; used by the port cleanup and readiness checks in this module.
const WHISPER_PORT = 4104
// Path of the Python server script, located one directory above this module's directory.
const WHISPER_SCRIPT = join(import.meta.dir, '..', 'whisper_server.py')
/** In-memory bookkeeping for the Whisper server managed by this module. */
interface WhisperState {
// Set together with `running` once the port is observed listening; cleared when the server is stopped or found not listening.
enabled: boolean
// True while the Whisper port has been observed listening.
running: boolean
// Handle of the spawned Python child, or null when no child is tracked.
process: Subprocess | null
// Model name reported back by getWhisperState.
model: string
// Device name reported back by getWhisperState.
device: string
}
// Module-level singleton state; starts with no server tracked.
// NOTE(review): `model` and `device` are only reported by getWhisperState in the code visible here and are
// not passed to the Python process — presumably they mirror whisper_server.py's own defaults; confirm.
const state: WhisperState = {
enabled: false,
running: false,
process: null,
model: 'medium',
device: 'cuda'
}
/**
 * Best-effort cleanup of whatever currently owns the given local TCP port.
 *
 * Runs a PowerShell one-liner that force-stops the owning process of every
 * connection on `port`, then pauses for one second so the port can be
 * released. Any failure (including PowerShell being unavailable) is swallowed
 * on purpose: callers treat this as an optional pre-step.
 *
 * @param port Local TCP port to free.
 */
async function killProcessOnPort(port: number): Promise<void> {
  const killCommand = `Get-NetTCPConnection -LocalPort ${port} -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }`
  const portReleaseDelay = () => new Promise(resolve => setTimeout(resolve, 1000))

  try {
    const killer = Bun.spawn(['powershell', '-Command', killCommand], {
      stdout: 'ignore',
      stderr: 'ignore'
    })
    await killer.exited
    // Give the OS a moment to actually release the port.
    await portReleaseDelay()
  } catch {
    // Deliberately ignored: cleanup is best-effort.
  }
}
/**
 * Start the Whisper Python server and wait until its port is listening.
 *
 * Sequence: terminate any child still tracked from an earlier attempt, free
 * WHISPER_PORT, spawn `python <WHISPER_SCRIPT>` with inherited stdout/stderr,
 * then poll the port every 3 seconds (one initial check plus up to 30 more,
 * roughly 93 seconds in total) so a slow model load is tolerated.
 *
 * @returns true if the server was already running or became ready in time;
 *          false if spawning failed, the process exited while starting, or
 *          the port never started listening within the polling window.
 */
export async function startWhisperServer(): Promise<boolean> {
  if (state.running && state.process) {
    console.log('[Whisper] Server already running')
    return true
  }

  console.log('[Whisper] ====== STARTING (v3) ======')
  console.log('[Whisper] Script:', WHISPER_SCRIPT)

  // A handle can still be tracked here with running=false when an earlier
  // start timed out. The port cleanup below only catches processes that have
  // already bound the port, so terminate the tracked child explicitly;
  // otherwise it would be orphaned once state.process is overwritten.
  const leftover = state.process
  if (leftover) {
    state.process = null
    if (leftover.exitCode === null) {
      try {
        leftover.kill()
      } catch (err: unknown) {
        console.error('[Whisper] Failed to kill previous process:', err)
      }
    }
  }

  // Kill any existing process on the port
  console.log('[Whisper] Cleaning up port', WHISPER_PORT)
  await killProcessOnPort(WHISPER_PORT)

  const pollIntervalMs = 3000
  const extraPolls = 30
  const pause = (): Promise<void> =>
    new Promise<void>(resolve => setTimeout(resolve, pollIntervalMs))

  try {
    // Inherit stdio so the Python server's logs show up directly in the console.
    const proc = Bun.spawn(['python', WHISPER_SCRIPT], {
      cwd: join(import.meta.dir, '..'),
      stdout: 'inherit',
      stderr: 'inherit',
      env: { ...process.env }
    })
    state.process = proc

    // Only drop the handle if it is still this call's child, so a newer
    // start attempt's handle is never cleared by an older one.
    const releaseHandle = (): void => {
      if (state.process === proc) {
        state.process = null
      }
    }
    const markReady = (): void => {
      state.running = true
      state.enabled = true
    }

    // First check after a short startup delay.
    await pause()
    if (proc.exitCode !== null) {
      console.error('[Whisper] Process exited with code:', proc.exitCode)
      releaseHandle()
      return false
    }
    if (await checkPort(WHISPER_PORT)) {
      console.log('[Whisper] Server started successfully on port', WHISPER_PORT)
      markReady()
      return true
    }

    // Keep polling while the model is still loading.
    console.log('[Whisper] Waiting for model to load...')
    for (let attempt = 0; attempt < extraPolls; attempt++) {
      await pause()
      if (proc.exitCode !== null) {
        console.error('[Whisper] Process died while loading')
        releaseHandle()
        return false
      }
      if (await checkPort(WHISPER_PORT)) {
        console.log('[Whisper] Server ready!')
        markReady()
        return true
      }
    }

    // The child is left running and tracked (unchanged behavior); the next
    // start attempt terminates it before spawning a replacement.
    console.log('[Whisper] Timeout waiting for server')
    return false
  } catch (err: unknown) {
    // Narrow instead of assuming an Error: anything can be thrown.
    const reason = err instanceof Error ? err.message : String(err)
    console.error('[Whisper] Failed to start:', reason)
    state.process = null
    return false
  }
}
/**
 * Report whether something is listening on the given local TCP port.
 *
 * Delegates to PowerShell: the probe exits with 0 when a connection in the
 * Listen state exists for `port`, and 1 otherwise.
 *
 * @param port Local TCP port to probe.
 * @returns true only when the probe exits with code 0; false on any other
 *          exit code or if the probe could not be run at all.
 */
async function checkPort(port: number): Promise<boolean> {
  const probeCommand = `if (Get-NetTCPConnection -LocalPort ${port} -State Listen -ErrorAction SilentlyContinue) { exit 0 } else { exit 1 }`

  try {
    const probe = Bun.spawn(['powershell', '-Command', probeCommand], {
      stdout: 'ignore',
      stderr: 'ignore'
    })
    const status = await probe.exited
    return status === 0
  } catch {
    // Treat an unrunnable probe the same as "not listening".
    return false
  }
}
/**
 * Stop the Whisper server process tracked by this module.
 *
 * @returns true when nothing was running or the tracked child was killed and
 *          the state flags were reset; false when killing threw, or when the
 *          state says a server is running but no process handle is tracked
 *          (so this function has nothing it can kill).
 */
export function stopWhisperServer(): boolean {
  const child = state.process

  if (!child) {
    if (state.running) {
      // getWhisperState marks the server as running whenever the port is
      // listening, even when this module holds no handle for it. Claiming
      // success here would leave that server alive while callers believe it
      // was stopped, so surface the failure instead.
      console.error('[Whisper] Server is running but no process handle is tracked; cannot stop it')
      return false
    }
    console.log('[Whisper] No server running')
    return true
  }

  console.log('[Whisper] Stopping server...')
  try {
    child.kill()
    // Reset state only after kill() did not throw.
    state.process = null
    state.running = false
    state.enabled = false
    console.log('[Whisper] Server stopped')
    return true
  } catch (err) {
    console.error('[Whisper] Error stopping server:', err)
    return false
  }
}
/**
 * Toggle the Whisper server: stop it when it is enabled and running,
 * otherwise start it.
 *
 * @returns `success` tells whether the stop/start operation succeeded;
 *          `enabled` is the resulting state. After a stop attempt it is read
 *          back from the module state rather than assumed, so a failed or
 *          no-op stop is not reported as "disabled".
 */
export async function toggleWhisperServer(): Promise<{ enabled: boolean; success: boolean }> {
  if (state.enabled && state.running) {
    const success = stopWhisperServer()
    // stopWhisperServer clears both flags only when it actually stopped the
    // server, so the flags are the source of truth here.
    return { enabled: state.enabled && state.running, success }
  }

  const success = await startWhisperServer()
  return { enabled: success, success }
}
/**
 * Snapshot of the Whisper state, reconciled against the real port status.
 *
 * Probes WHISPER_PORT first. When the probe disagrees with the cached
 * `running` flag, `running` and `enabled` are both set to the probe result,
 * and the tracked process handle is dropped if the port is not listening.
 *
 * @returns The reconciled flags plus the fixed port, model and device.
 */
export async function getWhisperState(): Promise<{
  enabled: boolean
  running: boolean
  port: number
  model: string
  device: string
}> {
  const listening = await checkPort(WHISPER_PORT)

  // Only touch the state when the cached flag is out of date.
  if (listening !== state.running) {
    state.running = listening
    state.enabled = listening
    if (!listening) {
      state.process = null
    }
  }

  const { enabled, running, model, device } = state
  return { enabled, running, port: WHISPER_PORT, model, device }
}
/**
 * Synchronous check of the cached flags (no port probe is performed).
 *
 * @returns true only when the state is both enabled and running.
 */
export function isWhisperEnabled(): boolean {
  const { enabled, running } = state
  return enabled && running
}
// WebSocket server for Whisper (proxies to Python server or handles directly)
// NOTE(review): nothing in the code visible here assigns or reads this variable — confirm whether it is still needed.
let whisperWsServer: any = null
/**
 * Expose the fixed port used for the Whisper server.
 *
 * @returns The value of WHISPER_PORT.
 */
export function getWhisperPort(): number {
  const port = WHISPER_PORT
  return port
}