feat: Add Whisper GPU speech-to-text with progressive transcription
- Add faster-whisper Python server for GPU-accelerated transcription
- Support dual mode: Web Speech API or Whisper GPU (toggleable)
- Progressive transcription every 3 seconds while recording
- Separate terminal server process (stable during hot-reload)
- Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal
- Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status
- Update package.json with separate api/terminal/frontend processes
This commit is contained in:
@@ -250,13 +250,38 @@ function initTerminal() {
|
||||
}
|
||||
})
|
||||
|
||||
// Capture Ctrl+E even when terminal has focus
|
||||
// Capture Ctrl+E and Ctrl+V when terminal has focus
|
||||
terminal.attachCustomKeyEventHandler((e) => {
|
||||
// Ctrl+E: Toggle terminal
|
||||
if (e.ctrlKey && e.key === 'e') {
|
||||
e.preventDefault()
|
||||
toggleTerminal()
|
||||
return false // Prevent terminal from processing
|
||||
return false
|
||||
}
|
||||
|
||||
// Ctrl+V: Paste from clipboard
|
||||
if (e.ctrlKey && e.key === 'v' && e.type === 'keydown') {
|
||||
e.preventDefault()
|
||||
navigator.clipboard.readText().then((text) => {
|
||||
if (text && socket && socket.readyState === WebSocket.OPEN) {
|
||||
socket.send(JSON.stringify({ type: 'input', data: text }))
|
||||
}
|
||||
}).catch((err) => {
|
||||
console.error('[Terminal] Clipboard read failed:', err)
|
||||
})
|
||||
return false
|
||||
}
|
||||
|
||||
// Ctrl+C: Copy selection (if any)
|
||||
if (e.ctrlKey && e.key === 'c' && e.type === 'keydown') {
|
||||
const selection = terminal?.getSelection()
|
||||
if (selection) {
|
||||
navigator.clipboard.writeText(selection).catch(console.error)
|
||||
return false
|
||||
}
|
||||
// If no selection, let Ctrl+C pass through as SIGINT
|
||||
}
|
||||
|
||||
return true // Let terminal handle other keys
|
||||
})
|
||||
}
|
||||
|
||||
@@ -30,19 +30,32 @@ const isDragging = ref(false)
|
||||
const dragOffset = ref({ x: 0, y: 0 })
|
||||
const containerRef = ref<HTMLElement | null>(null)
|
||||
|
||||
// Speech recognition
|
||||
// Speech recognition (Web Speech API)
|
||||
let recognition: SpeechRecognition | null = null
|
||||
|
||||
// WebSocket connection (own session)
|
||||
// WebSocket connection to terminal
|
||||
const WS_URL = `ws://${window.location.hostname}:4103`
|
||||
let socket: WebSocket | null = null
|
||||
const connected = ref(false)
|
||||
|
||||
// Push-to-talk state (Ctrl+S)
|
||||
// Push-to-talk state (Ctrl+Space)
|
||||
let keyDownTime = 0
|
||||
let holdTimeout: number | null = null
|
||||
const isPushToTalk = ref(false)
|
||||
|
||||
// ============ WHISPER MODE ============
|
||||
const useWhisper = ref(false)
|
||||
const whisperReady = ref(false)
|
||||
const whisperLoading = ref(false)
|
||||
const WHISPER_WS_URL = `ws://${window.location.hostname}:4104`
|
||||
let whisperSocket: WebSocket | null = null
|
||||
let mediaRecorder: MediaRecorder | null = null
|
||||
let audioChunks: Blob[] = []
|
||||
let lastTranscriptLength = 0 // Track length of last transcription to show only new text
|
||||
let chunkInterval: number | null = null
|
||||
const CHUNK_INTERVAL_MS = 3000 // Send audio every 3 seconds
|
||||
let mediaStream: MediaStream | null = null
|
||||
|
||||
const displayText = computed(() => {
|
||||
if (interimTranscript.value) {
|
||||
return transcript.value + ' ' + interimTranscript.value
|
||||
@@ -73,7 +86,7 @@ function initRecognition() {
|
||||
const rec = new SpeechRecognition()
|
||||
rec.continuous = true
|
||||
rec.interimResults = true
|
||||
rec.lang = 'es-ES'
|
||||
rec.lang = 'es-419' // Latin American Spanish (better for accents)
|
||||
|
||||
rec.onresult = (event: SpeechRecognitionEvent) => {
|
||||
let interim = ''
|
||||
@@ -105,7 +118,7 @@ function initRecognition() {
|
||||
}
|
||||
|
||||
rec.onend = () => {
|
||||
if (isRecording.value) {
|
||||
if (isRecording.value && !useWhisper.value) {
|
||||
// Restart if still recording (browser stops after silence)
|
||||
rec.start()
|
||||
}
|
||||
@@ -114,6 +127,215 @@ function initRecognition() {
|
||||
return rec
|
||||
}
|
||||
|
||||
// ============ WHISPER FUNCTIONS ============
|
||||
|
||||
/**
 * Asks the backend for the current Whisper state and mirrors it into the
 * `useWhisper` and `whisperReady` refs.
 *
 * @returns The parsed status payload, or `null` when the request fails, the
 *          server answers with a non-2xx status, or the body cannot be parsed.
 *          In every failure case both refs are reset to `false`.
 */
async function checkWhisperStatus() {
  try {
    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/status`)
    if (!res.ok) {
      // Treat an error response like a network failure instead of reading flags from it.
      throw new Error(`Whisper status request failed with HTTP ${res.status}`)
    }

    const data = await res.json()
    // Coerce so a missing field can never store `undefined` in a boolean ref.
    useWhisper.value = Boolean(data.enabled)
    whisperReady.value = Boolean(data.running)
    return data
  } catch {
    useWhisper.value = false
    whisperReady.value = false
    return null
  }
}
|
||||
|
||||
/**
 * Switches between Whisper GPU and Web Speech API transcription by calling the
 * backend toggle endpoint, then connects or disconnects the Whisper WebSocket
 * to match the new mode.
 *
 * While the request is in flight `whisperLoading` is true; a second call made
 * during that window is ignored so two toggles cannot cancel each other out.
 * On failure `error` is set and the current mode is left untouched.
 */
async function toggleWhisperMode() {
  if (whisperLoading.value) return

  whisperLoading.value = true
  error.value = ''

  try {
    // stopRecording() chooses which backend to stop from useWhisper, so a capture
    // still running when the mode flips would be stopped through the wrong path
    // and keep the microphone or recognizer alive. End it before switching.
    if (isRecording.value) {
      stopRecording()
    }

    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/toggle`, {
      method: 'POST'
    })
    if (!res.ok) {
      throw new Error(`Whisper toggle request failed with HTTP ${res.status}`)
    }
    const data = await res.json()

    // Coerce so a missing field can never store `undefined` in a boolean ref.
    useWhisper.value = Boolean(data.enabled)
    whisperReady.value = Boolean(data.running)

    if (useWhisper.value) {
      canvasStore.showNotification('Whisper GPU enabled', 'success')
      connectWhisperSocket()
    } else {
      canvasStore.showNotification('Using Web Speech API', 'info')
      disconnectWhisperSocket()
    }
  } catch (e: unknown) {
    error.value = 'Failed to toggle Whisper'
    console.error('[Voice] Whisper toggle error:', e)
  } finally {
    whisperLoading.value = false
  }
}
|
||||
|
||||
/**
 * Opens the WebSocket to the Whisper transcription server and wires its events
 * into the component state (`whisperReady`, `transcript`, `interimTranscript`,
 * `error`).
 *
 * Does nothing when a socket is already open or still connecting. Events that
 * arrive from a socket that is no longer the current `whisperSocket` are
 * ignored so a replaced connection cannot overwrite newer state.
 */
function connectWhisperSocket() {
  // Reuse a socket that is still connecting too; opening a second one would
  // leak the first and let both sets of handlers write to the same refs.
  const state = whisperSocket?.readyState
  if (state === WebSocket.OPEN || state === WebSocket.CONNECTING) return

  console.log('[Voice] Connecting to Whisper server...')
  const ws = new WebSocket(WHISPER_WS_URL)
  whisperSocket = ws

  const isCurrent = () => whisperSocket === ws

  ws.onopen = () => {
    console.log('[Voice] Whisper WebSocket connected')
    if (isCurrent()) whisperReady.value = true
  }

  ws.onmessage = (event) => {
    if (!isCurrent()) return

    try {
      const msg = JSON.parse(event.data)

      if (msg.type === 'ready') {
        console.log('[Voice] Whisper ready:', msg.model, msg.device)
        whisperReady.value = true
        return
      }

      if (msg.type !== 'transcription') return

      if (msg.success && msg.text) {
        const fullText = msg.text.trim()

        if (msg.partial) {
          // Partial result: show only the text beyond what the previous
          // partial already covered; it is replaced by the next result.
          const newText = fullText.substring(lastTranscriptLength).trim()
          if (newText) {
            interimTranscript.value = newText
            console.log(`[Voice] 🔄 WHISPER partial:`, newText)
          }
          lastTranscriptLength = fullText.length
        } else {
          // Final result replaces everything. The offset goes back to zero and
          // must stay there; it was previously overwritten right after this branch.
          transcript.value = fullText + ' '
          interimTranscript.value = ''
          lastTranscriptLength = 0
          console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
        }
      } else if (msg.error) {
        error.value = msg.error
        console.error('[Voice] Whisper error:', msg.error)
      }
    } catch (e) {
      console.error('[Voice] Whisper message error:', e)
    }
  }

  ws.onclose = () => {
    console.log('[Voice] Whisper WebSocket closed')
    if (isCurrent()) whisperReady.value = false
  }

  ws.onerror = (e) => {
    console.error('[Voice] Whisper WebSocket error:', e)
    if (isCurrent()) whisperReady.value = false
  }
}
|
||||
|
||||
/**
 * Closes the Whisper WebSocket if one exists, forgets the reference, and marks
 * Whisper as not ready.
 */
function disconnectWhisperSocket() {
  const activeSocket = whisperSocket
  whisperSocket = null
  activeSocket?.close()

  whisperReady.value = false
}
|
||||
|
||||
/**
 * Starts a Whisper capture session: requests the microphone, records it as
 * WebM/Opus in 100 ms slices, and every `CHUNK_INTERVAL_MS` sends the audio
 * accumulated so far for a partial transcription.
 *
 * On any failure (permission denied, unsupported mime type, recorder error)
 * the microphone is released again, `error` is set, and no recording state is
 * left behind.
 */
async function startWhisperRecording() {
  let stream: MediaStream | null = null

  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true })

    const recorder = new MediaRecorder(stream, {
      mimeType: 'audio/webm;codecs=opus'
    })

    // Reset state for the new recording.
    audioChunks = []
    lastTranscriptLength = 0

    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        audioChunks.push(event.data)
      }
    }

    recorder.start(100) // Collect data every 100ms

    // Publish the session only once the recorder is actually running.
    mediaStream = stream
    mediaRecorder = recorder
    isRecording.value = true
    interimTranscript.value = 'Listening (Whisper GPU)...'

    // Never leave an earlier timer running next to the new one.
    if (chunkInterval !== null) {
      clearInterval(chunkInterval)
    }

    // Send chunks periodically for progressive transcription.
    chunkInterval = window.setInterval(() => {
      if (audioChunks.length > 0 && whisperSocket?.readyState === WebSocket.OPEN) {
        sendAudioChunk(false) // false = partial, don't clear
      }
    }, CHUNK_INTERVAL_MS)
  } catch (e: unknown) {
    // The stream may already be live if the recorder setup failed; release it
    // so the browser stops showing the microphone as in use.
    stream?.getTracks().forEach(track => track.stop())

    const reason = e instanceof Error ? e.message : String(e)
    error.value = `Microphone error: ${reason}`
    console.error('[Voice] Microphone error:', e)
  }
}
|
||||
|
||||
/**
 * Sends all audio recorded so far to the Whisper server as one base64-encoded
 * WebM blob.
 *
 * @param isFinal `true` for the last send of a recording: the chunk buffer and
 *                the partial-text offset are cleared and the request is marked
 *                non-partial. `false` keeps the buffer so the next send again
 *                contains the full recording.
 */
function sendAudioChunk(isFinal: boolean) {
  if (audioChunks.length === 0) return

  // Always send ALL accumulated audio (webm needs header from first chunk).
  const chunkCount = audioChunks.length
  const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })

  // Clear chunks only if final.
  if (isFinal) {
    audioChunks = []
    lastTranscriptLength = 0
  }

  const reader = new FileReader()

  reader.onerror = () => {
    console.error('[Voice] Failed to read recorded audio:', reader.error)
  }

  // `load` fires only on success, so `result` is the data URL here; `loadend`
  // would also run after a failed read, where `result` is null.
  reader.onload = () => {
    const dataUrl = reader.result
    if (typeof dataUrl !== 'string') return

    const base64 = dataUrl.split(',')[1]
    if (!base64) return

    if (whisperSocket?.readyState === WebSocket.OPEN) {
      if (!isFinal) {
        interimTranscript.value = 'Processing...'
      }
      whisperSocket.send(JSON.stringify({
        type: 'transcribe',
        audio: base64,
        language: 'es',
        partial: !isFinal
      }))
      // Use the count captured above: the buffer is already empty for final sends.
      console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
    }
  }

  reader.readAsDataURL(audioBlob)
}
|
||||
|
||||
/**
 * Ends the Whisper capture session: stops the periodic partial sends, stops
 * the recorder, sends the complete recording for a final transcription, and
 * releases the microphone.
 *
 * The final send happens from the recorder's `stop` event so that the audio
 * still buffered inside the recorder is included; sending before stopping
 * would drop the tail of the recording.
 */
function stopWhisperRecording() {
  // Clear the chunk interval.
  if (chunkInterval !== null) {
    clearInterval(chunkInterval)
    chunkInterval = null
  }

  const sendFinal = () => {
    if (audioChunks.length > 0) {
      sendAudioChunk(true) // true = final
    }
  }

  const recorder = mediaRecorder
  if (recorder && recorder.state !== 'inactive') {
    // stop() delivers one last `dataavailable` and then `stop`; wait for it.
    recorder.onstop = sendFinal
    recorder.stop()
  } else {
    // Nothing left to flush, so send whatever was already collected.
    sendFinal()
  }

  // Stop media stream.
  if (mediaStream) {
    mediaStream.getTracks().forEach(track => track.stop())
    mediaStream = null
  }

  isRecording.value = false
}
|
||||
|
||||
function toggleRecording() {
|
||||
if (isRecording.value) {
|
||||
stopRecording()
|
||||
@@ -124,24 +346,35 @@ function toggleRecording() {
|
||||
|
||||
function startRecording() {
|
||||
error.value = ''
|
||||
if (!recognition) {
|
||||
recognition = initRecognition()
|
||||
}
|
||||
if (recognition) {
|
||||
try {
|
||||
recognition.start()
|
||||
isRecording.value = true
|
||||
} catch (e) {
|
||||
console.error('[Voice] Failed to start:', e)
|
||||
|
||||
if (useWhisper.value && whisperReady.value) {
|
||||
// Use Whisper GPU mode
|
||||
startWhisperRecording()
|
||||
} else {
|
||||
// Use Web Speech API
|
||||
if (!recognition) {
|
||||
recognition = initRecognition()
|
||||
}
|
||||
if (recognition) {
|
||||
try {
|
||||
recognition.start()
|
||||
isRecording.value = true
|
||||
} catch (e) {
|
||||
console.error('[Voice] Failed to start:', e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function stopRecording() {
|
||||
if (recognition) {
|
||||
recognition.stop()
|
||||
if (useWhisper.value) {
|
||||
stopWhisperRecording()
|
||||
} else {
|
||||
if (recognition) {
|
||||
recognition.stop()
|
||||
}
|
||||
isRecording.value = false
|
||||
}
|
||||
isRecording.value = false
|
||||
interimTranscript.value = ''
|
||||
}
|
||||
|
||||
@@ -209,6 +442,7 @@ function sendTranscript() {
|
||||
|
||||
function close() {
|
||||
stopRecording()
|
||||
clearTranscript()
|
||||
isOpen.value = false
|
||||
}
|
||||
|
||||
@@ -349,17 +583,28 @@ function sendTranscriptAndClose() {
|
||||
typeChar()
|
||||
}
|
||||
|
||||
onMounted(() => {
|
||||
onMounted(async () => {
|
||||
recognition = initRecognition()
|
||||
// Use capture phase to intercept before terminal or other elements
|
||||
document.addEventListener('keydown', handleKeyDown, { capture: true })
|
||||
document.addEventListener('keyup', handleKeyUp, { capture: true })
|
||||
|
||||
// Check Whisper status on mount
|
||||
await checkWhisperStatus()
|
||||
if (useWhisper.value) {
|
||||
connectWhisperSocket()
|
||||
}
|
||||
})
|
||||
|
||||
onBeforeUnmount(() => {
|
||||
stopRecording()
|
||||
recognition = null
|
||||
disconnectSocket()
|
||||
disconnectWhisperSocket()
|
||||
if (chunkInterval) clearInterval(chunkInterval)
|
||||
if (mediaStream) {
|
||||
mediaStream.getTracks().forEach(track => track.stop())
|
||||
}
|
||||
document.removeEventListener('keydown', handleKeyDown, { capture: true })
|
||||
document.removeEventListener('keyup', handleKeyUp, { capture: true })
|
||||
document.removeEventListener('mousemove', onDrag)
|
||||
@@ -408,8 +653,23 @@ defineExpose({
|
||||
</svg>
|
||||
<span>Voice</span>
|
||||
<i class="dot" :class="{ recording: isRecording, ptt: isPushToTalk }"></i>
|
||||
<span class="mode-badge" :class="{ gpu: useWhisper }">
|
||||
{{ useWhisper ? 'GPU' : 'Web' }}
|
||||
</span>
|
||||
</div>
|
||||
<div class="window-controls">
|
||||
<button
|
||||
class="whisper-toggle"
|
||||
:class="{ active: useWhisper, loading: whisperLoading }"
|
||||
@click.stop="toggleWhisperMode"
|
||||
:title="useWhisper ? 'Using Whisper GPU - Click to use Web Speech' : 'Using Web Speech - Click to use Whisper GPU'"
|
||||
>
|
||||
<svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
||||
<rect x="4" y="4" width="16" height="16" rx="2"/>
|
||||
<line x1="9" y1="9" x2="9" y2="15"/>
|
||||
<line x1="15" y1="9" x2="15" y2="15"/>
|
||||
</svg>
|
||||
</button>
|
||||
<button class="x" @click="close" title="Close">
|
||||
<svg width="8" height="8" viewBox="0 0 10 10">
|
||||
<line x1="0" y1="0" x2="10" y2="10" stroke="currentColor" stroke-width="1.5"/>
|
||||
@@ -545,6 +805,53 @@ defineExpose({
|
||||
box-shadow: 0 0 6px #f90;
|
||||
}
|
||||
|
||||
.mode-badge {
|
||||
font-size: 8px;
|
||||
padding: 1px 4px;
|
||||
border-radius: 3px;
|
||||
background: rgba(0, 0, 0, 0.2);
|
||||
color: #555;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
|
||||
.mode-badge.gpu {
|
||||
background: linear-gradient(135deg, #10b981, #059669);
|
||||
color: #fff;
|
||||
box-shadow: 0 0 4px rgba(16, 185, 129, 0.5);
|
||||
}
|
||||
|
||||
.whisper-toggle {
|
||||
width: 20px;
|
||||
height: 18px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
background: rgba(255, 255, 255, 0.3);
|
||||
border: 1px solid rgba(0, 0, 0, 0.1);
|
||||
border-radius: 3px;
|
||||
color: #666;
|
||||
cursor: pointer;
|
||||
transition: all 0.15s;
|
||||
}
|
||||
|
||||
.whisper-toggle:hover {
|
||||
background: rgba(255, 255, 255, 0.5);
|
||||
}
|
||||
|
||||
.whisper-toggle.active {
|
||||
background: linear-gradient(180deg, #10b981 0%, #059669 100%);
|
||||
border-color: #047857;
|
||||
color: #fff;
|
||||
}
|
||||
|
||||
.whisper-toggle.loading {
|
||||
animation: pulse 0.6s infinite;
|
||||
background: linear-gradient(180deg, #f59e0b 0%, #d97706 100%);
|
||||
border-color: #b45309;
|
||||
color: #fff;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
0%, 100% { opacity: 1; }
|
||||
50% { opacity: 0.5; }
|
||||
|
||||
@@ -223,6 +223,112 @@ export function createGlobalHandlers(callbacks: ToolManagementCallbacks): ToolCo
|
||||
}, 100)
|
||||
return 'Recargando pagina...'
|
||||
}
|
||||
},
|
||||
{
  name: 'whisper_status',
  description: 'Obtiene el estado del servidor Whisper GPU para speech-to-text.',
  category: 'global',
  schema: {
    type: 'object',
    properties: {}
  },
  // Fetches the Whisper status from the API and formats it as a text report.
  // Never throws: every failure is returned as an error string.
  handler: async () => {
    try {
      const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/status`)
      if (!res.ok) {
        // Do not format fields from an error response as if they were a status.
        return `Error checking Whisper status: HTTP ${res.status}`
      }
      const data = await res.json()
      return `Whisper GPU Status:\n` +
        ` Enabled: ${data.enabled ? 'Yes' : 'No'}\n` +
        ` Running: ${data.running ? 'Yes' : 'No'}\n` +
        ` Model: ${data.model}\n` +
        ` Device: ${data.device}\n` +
        ` Port: ${data.port}`
    } catch (e: unknown) {
      // Anything can be thrown; only Error instances carry a message.
      const reason = e instanceof Error ? e.message : String(e)
      return `Error checking Whisper status: ${reason}`
    }
  }
},
|
||||
{
  name: 'whisper_toggle',
  description: 'Activa o desactiva Whisper GPU para speech-to-text. Cuando esta activo usa la GPU para transcribir voz con mejor precision para acentos latinos.',
  category: 'global',
  schema: {
    type: 'object',
    properties: {}
  },
  // Calls the toggle endpoint and reports which speech-to-text mode is now active.
  // Never throws: every failure is returned as an error string.
  handler: async () => {
    try {
      const host = window.location.hostname
      const res = await fetch(`http://${host}:4100/api/whisper/toggle`, {
        method: 'POST'
      })
      if (!res.ok) {
        return `Error toggling Whisper: HTTP ${res.status}`
      }
      const data = await res.json()

      if (data.enabled) {
        // Report the host the request actually went to rather than a fixed
        // "localhost", which is wrong when the page is served from another machine.
        return `Whisper GPU ENABLED\n` +
          ` Model: ${data.model}\n` +
          ` Device: ${data.device}\n` +
          ` Port: ws://${host}:${data.port}\n\n` +
          `Voice input will now use GPU-accelerated transcription.`
      }

      return `Whisper GPU DISABLED\n\n` +
        `Voice input will use Web Speech API (browser native).`
    } catch (e: unknown) {
      // Anything can be thrown; only Error instances carry a message.
      const reason = e instanceof Error ? e.message : String(e)
      return `Error toggling Whisper: ${reason}`
    }
  }
},
|
||||
{
  name: 'whisper_start',
  description: 'Inicia el servidor Whisper GPU si no esta corriendo.',
  category: 'global',
  schema: {
    type: 'object',
    properties: {}
  },
  // Asks the API to start the Whisper server and reports the outcome as text.
  // Never throws: every failure is returned as an error string.
  handler: async () => {
    try {
      const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/start`, {
        method: 'POST'
      })
      const data = await res.json()

      if (data.success) {
        return `Whisper server started!\n` +
          ` Model: ${data.model}\n` +
          ` Device: ${data.device}\n` +
          ` Ready: ${data.running ? 'Yes' : 'Loading...'}`
      }

      // Fall back to the HTTP status so the report never reads "undefined".
      return `Failed to start Whisper server: ${data.message ?? `HTTP ${res.status}`}`
    } catch (e: unknown) {
      // Anything can be thrown; only Error instances carry a message.
      const reason = e instanceof Error ? e.message : String(e)
      return `Error starting Whisper: ${reason}`
    }
  }
},
|
||||
{
  name: 'whisper_stop',
  description: 'Detiene el servidor Whisper GPU para liberar memoria de la GPU.',
  category: 'global',
  schema: {
    type: 'object',
    properties: {}
  },
  // Asks the API to stop the Whisper server and reports the outcome as text.
  // Never throws: every failure is returned as an error string.
  handler: async () => {
    try {
      const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/stop`, {
        method: 'POST'
      })
      const data = await res.json()

      if (data.success) {
        return `Whisper server stopped. GPU memory released.`
      }

      // Fall back to the HTTP status so the report never reads "undefined".
      return `Failed to stop Whisper server: ${data.message ?? `HTTP ${res.status}`}`
    } catch (e: unknown) {
      // Anything can be thrown; only Error instances carry a message.
      const reason = e instanceof Error ? e.message : String(e)
      return `Error stopping Whisper: ${reason}`
    }
  }
}
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user