Compare commits

..

5 Commits

Author SHA1 Message Date
853aea6eb5 chore: Remove dev-dist from git tracking 2026-02-14 01:03:16 -06:00
5be0fb91ab fix: Improve Whisper server startup with async polling and reduce logs
- Make server startup async to avoid Bun's 10s timeout
- Add frontend polling to detect when server is ready
- Use PowerShell Get-NetTCPConnection for reliable port detection
- Add starting state to prevent multiple simultaneous starts
- Reduce verbose logging, keep only essential info
- Add dev-dist and nul to gitignore
2026-02-14 01:03:02 -06:00
9f1e10b8d5 feat: Add typing animation to voice transcription
- Text appears letter by letter (15-25ms per character)
- Blinking cursor shows while text is animating
- Animation continues from last position for new chunks
- Smooth visual feedback for transcription progress
2026-02-14 00:28:26 -06:00
ac17a9f292 fix: Improve Whisper transcription with WebM to WAV conversion
- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
2026-02-14 00:16:01 -06:00
638e6ac8e0 feat: Add Whisper GPU speech-to-text with progressive transcription
- Add faster-whisper Python server for GPU-accelerated transcription
- Support dual mode: Web Speech API or Whisper GPU (toggleable)
- Progressive transcription every 3 seconds while recording
- Separate terminal server process (stable during hot-reload)
- Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal
- Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status
- Update package.json with separate api/terminal/frontend processes
2026-02-13 23:47:52 -06:00
20 changed files with 1315 additions and 8080 deletions

View File

@@ -50,7 +50,9 @@
"mcp__agent-ui__localhost_4100-notificar",
"mcp__agent-ui__localhost_4100-enviar_al_panel",
"mcp__agent-ui__localhost_4100-render_html",
"mcp__agent-ui__localhost_4100-load_vue_component"
"mcp__agent-ui__localhost_4100-load_vue_component",
"mcp__agent-ui__localhost_4100-page_refresh",
"WebFetch(domain:docs.anthropic.com)"
]
},
"enableAllProjectMcpServers": true,

2
.gitignore vendored
View File

@@ -3,3 +3,5 @@ frontend/node_modules/
.env
*.log
dist/
frontend/dev-dist/
nul

View File

@@ -1 +0,0 @@
if('serviceWorker' in navigator) navigator.serviceWorker.register('/dev-sw.js?dev-sw', { scope: '/', type: 'classic' })

View File

@@ -1,94 +0,0 @@
/**
* Copyright 2018 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// If the loader is already loaded, just stop.
if (!self.define) {
// Cache of module promises keyed by absolute module URL; each one resolves to that module's `exports` object.
let registry = {};
// Used for `eval` and `importScripts` where we can't get script URL by other means.
// In both cases, it's safe to use a global var because those functions are synchronous.
let nextDefineUri;
// Resolves `uri` (with ".js" appended) against `parentUri` and returns the registered module promise,
// loading the script first when nothing is registered for that URL yet.
const singleRequire = (uri, parentUri) => {
uri = new URL(uri + ".js", parentUri).href;
return registry[uri] || (
new Promise(resolve => {
if ("document" in self) {
// A `document` exists: load through a <script> tag.
// Only `onload` is wired up; no `onerror` handler is attached, so a failed load never settles this promise.
const script = document.createElement("script");
script.src = uri;
script.onload = resolve;
document.head.appendChild(script);
} else {
// No `document`: stash the URL so the define() call made by the imported script can pick it up.
nextDefineUri = uri;
importScripts(uri);
resolve();
}
})
.then(() => {
// By now the loaded script is expected to have called define() and filled the registry entry.
let promise = registry[uri];
if (!promise) {
throw new Error(`Module ${uri} didnt register its module`);
}
return promise;
})
);
};
// Registers the calling script as a module: `depsNames` are resolved first, then `factory` runs with them.
self.define = (depsNames, factory) => {
// Module URL, in priority order: the URL stashed by singleRequire, the current <script> src, the page/worker location.
const uri = nextDefineUri || ("document" in self ? document.currentScript.src : "") || location.href;
if (registry[uri]) {
// Module is already loading or loaded.
return;
}
let exports = {};
// Dependencies are resolved relative to this module's own URL.
const require = depUri => singleRequire(depUri, uri);
// Dependency names answered locally instead of being loaded as scripts.
const specialDeps = {
module: { uri },
exports,
require
};
// The registry entry resolves to `exports` once every dependency is available and the factory has run.
registry[uri] = Promise.all(depsNames.map(
depName => specialDeps[depName] || require(depName)
)).then(deps => {
factory(...deps);
return exports;
});
};
}
// Service-worker entry module: depends on the sibling './workbox-5a5d9309' bundle, which is passed in as `workbox`.
define(['./workbox-5a5d9309'], (function (workbox) { 'use strict';
// Do not wait for existing clients to close before this worker version activates.
self.skipWaiting();
workbox.clientsClaim();
/**
* The precacheAndRoute() method efficiently caches and responds to
* requests for URLs in the manifest.
* See https://goo.gl/S9QRab
*/
workbox.precacheAndRoute([{
"url": "suppress-warnings.js",
"revision": "d41d8cd98f00b204e9800998ecf8427e"
}, {
"url": "index.html",
"revision": "0.24e3u5ntq78"
}], {});
workbox.cleanupOutdatedCaches();
// Navigation requests are answered with the precached index.html:
// the allowlist matches only the root path "/", and paths starting with "/api/" are denied.
workbox.registerRoute(new workbox.NavigationRoute(workbox.createHandlerBoundToURL("index.html"), {
allowlist: [/^\/$/],
denylist: [/^\/api\//]
}));
}));
//# sourceMappingURL=sw.js.map

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -250,13 +250,38 @@ function initTerminal() {
}
})
// Capture Ctrl+E even when terminal has focus
// Capture Ctrl+E and Ctrl+V when terminal has focus
terminal.attachCustomKeyEventHandler((e) => {
// Ctrl+E: Toggle terminal
if (e.ctrlKey && e.key === 'e') {
e.preventDefault()
toggleTerminal()
return false // Prevent terminal from processing
return false
}
// Ctrl+V: Paste from clipboard
if (e.ctrlKey && e.key === 'v' && e.type === 'keydown') {
e.preventDefault()
navigator.clipboard.readText().then((text) => {
if (text && socket && socket.readyState === WebSocket.OPEN) {
socket.send(JSON.stringify({ type: 'input', data: text }))
}
}).catch((err) => {
console.error('[Terminal] Clipboard read failed:', err)
})
return false
}
// Ctrl+C: Copy selection (if any)
if (e.ctrlKey && e.key === 'c' && e.type === 'keydown') {
const selection = terminal?.getSelection()
if (selection) {
navigator.clipboard.writeText(selection).catch(console.error)
return false
}
// If no selection, let Ctrl+C pass through as SIGINT
}
return true // Let terminal handle other keys
})
}

View File

@@ -23,6 +23,11 @@ const transcript = ref('')
const interimTranscript = ref('')
const error = ref('')
// Typing animation state
const animatedTranscript = ref('')
let typingTimeout: number | null = null
let lastAnimatedLength = 0
// Position and drag state
const position = ref({ x: 0, y: 0 })
const hasCustomPosition = ref(false)
@@ -30,18 +35,32 @@ const isDragging = ref(false)
const dragOffset = ref({ x: 0, y: 0 })
const containerRef = ref<HTMLElement | null>(null)
// Speech recognition
// Speech recognition (Web Speech API)
let recognition: SpeechRecognition | null = null
// WebSocket connection (own session)
// WebSocket connection to terminal
const WS_URL = `ws://${window.location.hostname}:4103`
let socket: WebSocket | null = null
const connected = ref(false)
// Push-to-talk state (Ctrl+S)
// Push-to-talk state (Ctrl+Space)
let keyDownTime = 0
let holdTimeout: number | null = null
const isPushToTalk = ref(false)
let pendingWhisperSend = false // Flag to send transcript when Whisper responds
// ============ WHISPER MODE ============
const useWhisper = ref(false)
const whisperReady = ref(false)
const whisperLoading = ref(false)
const WHISPER_WS_URL = `ws://${window.location.hostname}:4104`
let whisperSocket: WebSocket | null = null
let mediaRecorder: MediaRecorder | null = null
let audioChunks: Blob[] = []
let lastTranscriptLength = 0 // Track length of last transcription to show only new text
let chunkInterval: number | null = null
const CHUNK_INTERVAL_MS = 3000 // Send audio every 3 seconds
let mediaStream: MediaStream | null = null
const displayText = computed(() => {
if (interimTranscript.value) {
@@ -73,7 +92,7 @@ function initRecognition() {
const rec = new SpeechRecognition()
rec.continuous = true
rec.interimResults = true
rec.lang = 'es-ES'
rec.lang = 'es-419' // Latin American Spanish (better for accents)
rec.onresult = (event: SpeechRecognitionEvent) => {
let interim = ''
@@ -105,7 +124,7 @@ function initRecognition() {
}
rec.onend = () => {
if (isRecording.value) {
if (isRecording.value && !useWhisper.value) {
// Restart if still recording (browser stops after silence)
rec.start()
}
@@ -114,6 +133,307 @@ function initRecognition() {
return rec
}
// ============ WHISPER FUNCTIONS ============
/**
 * Fetch the Whisper server state from the API and mirror it into the local refs.
 *
 * Any failure (network error, non-2xx response, invalid JSON) is treated as
 * "Whisper unavailable": the refs are reset to false and `null` is returned.
 *
 * @param updateLoading When true (default) `whisperLoading` is synced with the
 *   server's `starting` flag. Pass false while another routine (e.g. polling)
 *   owns the loading indicator.
 * @returns The parsed status payload, or `null` when the status could not be read.
 */
async function checkWhisperStatus(updateLoading = true) {
  try {
    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/status`)
    // An error response may still carry a JSON body; never read state flags from it.
    if (!res.ok) {
      throw new Error(`Whisper status request failed: HTTP ${res.status}`)
    }
    const data = await res.json()
    // Coerce so a missing field can never put `undefined` into a boolean ref.
    useWhisper.value = Boolean(data.enabled)
    whisperReady.value = Boolean(data.running)
    if (updateLoading) {
      whisperLoading.value = Boolean(data.starting)
    }
    return data
  } catch {
    useWhisper.value = false
    whisperReady.value = false
    if (updateLoading) {
      whisperLoading.value = false
    }
    return null
  }
}
/**
 * Switch between Whisper GPU and the Web Speech API by calling the toggle endpoint.
 *
 * While a toggle is in flight `whisperLoading` is true and further calls are ignored.
 * If the API answers that the server is still starting, this waits for
 * `pollWhisperStatus()` to finish instead of applying the response directly.
 */
async function toggleWhisperMode() {
  // Re-entrancy guard: one toggle at a time
  if (whisperLoading.value) {
    console.log('[Voice] Toggle already in progress, ignoring')
    return
  }

  whisperLoading.value = true
  error.value = ''

  // Turning Whisper ON can take a while, so tell the user right away
  const turningOn = !useWhisper.value
  if (turningOn) {
    canvasStore.showNotification('Starting Whisper GPU server...', 'info', 10000)
  }

  try {
    const response = await fetch(`http://${window.location.hostname}:4100/api/whisper/toggle`, {
      method: 'POST'
    })
    const payload = await response.json()

    if (payload.starting) {
      // Startup continues in the background; polling reports the outcome
      console.log('[Voice] Server starting, polling for status...')
      await pollWhisperStatus()
    } else {
      useWhisper.value = payload.enabled
      whisperReady.value = payload.running

      if (!payload.enabled) {
        canvasStore.showNotification('Using Web Speech API', 'info')
        disconnectWhisperSocket()
      } else {
        canvasStore.showNotification('Whisper GPU ready!', 'success')
        connectWhisperSocket()
      }
    }
  } catch (e: any) {
    error.value = 'Failed to toggle Whisper'
    canvasStore.showNotification('Error starting Whisper server', 'error')
    console.error('[Voice] Whisper toggle error:', e)
  } finally {
    whisperLoading.value = false
  }
}
/**
 * Poll the status endpoint every 2 seconds (at most 60 times, i.e. about 2 minutes)
 * until the Whisper server reports ready or failed.
 *
 * On ready it connects the Whisper socket. On failure or timeout it shows an
 * error notification. In all three outcomes `whisperLoading` is cleared.
 */
async function pollWhisperStatus() {
  const pollDelayMs = 2000
  const maxPolls = 60

  for (let poll = 1; poll <= maxPolls; poll++) {
    await new Promise(resolve => setTimeout(resolve, pollDelayMs))

    try {
      // false: the loading flag belongs to this loop, not to the status check
      const status = await checkWhisperStatus(false)

      if (!status) {
        console.log('[Voice] Failed to get status')
        continue
      }

      if (status.starting) {
        console.log(`[Voice] Still starting... (${poll * 2}s)`)
        continue
      }

      // No longer starting: the server either came up or gave up
      if (status.running && status.enabled) {
        console.log('[Voice] Server ready!')
        canvasStore.showNotification('Whisper GPU ready!', 'success')
        connectWhisperSocket()
      } else {
        console.log('[Voice] Server failed to start')
        canvasStore.showNotification('Whisper server failed to start', 'error')
      }
      whisperLoading.value = false
      return
    } catch (e) {
      console.error('[Voice] Polling error:', e)
    }
  }

  // Ran out of attempts
  canvasStore.showNotification('Whisper server timeout', 'error')
  whisperLoading.value = false
}
/**
 * Open the WebSocket to the Whisper server and wire up its handlers.
 *
 * Does nothing when a socket is already open or still connecting. Incoming
 * messages are JSON objects:
 *  - `{ type: 'ready', model, device }` marks Whisper as ready.
 *  - `{ type: 'transcription', success, text, partial, error, model, device }`
 *    replaces the transcript with the full accumulated text. A final
 *    (non-partial) answer also completes a pending push-to-talk send.
 */
function connectWhisperSocket() {
  // Reuse a socket that is open OR still connecting. Checking only OPEN would
  // create a second socket when this is called twice in quick succession.
  const currentState = whisperSocket?.readyState
  if (currentState === WebSocket.OPEN || currentState === WebSocket.CONNECTING) return

  console.log('[Voice] Connecting to Whisper server...')
  const ws = new WebSocket(WHISPER_WS_URL)
  whisperSocket = ws

  // Events from a socket that was replaced or disconnected must not touch shared state.
  const isCurrent = () => whisperSocket === ws

  // Complete a push-to-talk send that was waiting for the final transcription.
  const finishPendingSend = () => {
    if (!pendingWhisperSend) return
    pendingWhisperSend = false
    console.log('[Voice] Whisper response received, sending transcript')
    if (transcript.value.trim()) {
      sendTranscriptAndClose()
    } else {
      isPushToTalk.value = false
      close()
    }
  }

  ws.onopen = () => {
    if (!isCurrent()) return
    console.log('[Voice] Whisper WebSocket connected')
    whisperReady.value = true
  }

  ws.onmessage = (event) => {
    if (!isCurrent()) return
    try {
      const msg = JSON.parse(event.data)

      if (msg.type === 'ready') {
        console.log('[Voice] Whisper ready:', msg.model, msg.device)
        whisperReady.value = true
        return
      }
      if (msg.type !== 'transcription') return

      if (msg.success && msg.text) {
        const fullText = msg.text.trim()

        // Partial and final results both carry the full accumulated transcription
        transcript.value = fullText + ' '
        interimTranscript.value = ''

        if (msg.partial) {
          console.log(`[Voice] 🔄 WHISPER partial:`, fullText)
        } else {
          console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
          // Auto-send if push-to-talk was waiting for this
          finishPendingSend()
        }

        // Update last transcript length for next partial
        lastTranscriptLength = fullText.length
      } else if (msg.error) {
        error.value = msg.error
        console.error('[Voice] Whisper error:', msg.error)
        // Clear pending send on error
        if (pendingWhisperSend) {
          pendingWhisperSend = false
          isPushToTalk.value = false
        }
      } else if (!msg.partial) {
        // Final answer with no text (e.g. silence): without this branch a
        // pending push-to-talk send would wait forever and the panel would stay open.
        finishPendingSend()
      }
    } catch (e) {
      console.error('[Voice] Whisper message error:', e)
    }
  }

  ws.onclose = () => {
    console.log('[Voice] Whisper WebSocket closed')
    if (isCurrent()) whisperReady.value = false
  }

  ws.onerror = (e) => {
    console.error('[Voice] Whisper WebSocket error:', e)
    if (isCurrent()) whisperReady.value = false
  }
}
/** Close the Whisper WebSocket (if there is one), drop the reference and mark Whisper as not ready. */
function disconnectWhisperSocket() {
  whisperSocket?.close()
  whisperSocket = null
  whisperReady.value = false
}
/**
 * Start capturing microphone audio for Whisper.
 *
 * Audio is recorded as WebM/Opus in 100 ms slices that accumulate in
 * `audioChunks`. Every `CHUNK_INTERVAL_MS` the accumulated audio is sent as a
 * partial transcription request. On failure the error is shown to the user
 * and a microphone stream acquired by this call is released again.
 */
async function startWhisperRecording() {
  // Kept local until recording really started, so the failure path can
  // release exactly the stream this call acquired.
  let stream: MediaStream | null = null
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true })
    const recorder = new MediaRecorder(stream, {
      mimeType: 'audio/webm;codecs=opus'
    })

    // Reset state for new recording
    audioChunks = []
    lastTranscriptLength = 0

    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        audioChunks.push(event.data)
      }
    }

    // Start recording
    recorder.start(100) // Collect data every 100ms

    mediaStream = stream
    mediaRecorder = recorder
    isRecording.value = true

    // Never leave an interval from an earlier recording running next to the new one
    if (chunkInterval) {
      clearInterval(chunkInterval)
    }

    // Send chunks periodically for progressive transcription
    chunkInterval = window.setInterval(() => {
      if (audioChunks.length > 0 && whisperSocket?.readyState === WebSocket.OPEN) {
        sendAudioChunk(false) // false = partial, don't clear
      }
    }, CHUNK_INTERVAL_MS)

  } catch (e: any) {
    // MediaRecorder construction/start can throw (e.g. unsupported mimeType)
    // after the microphone was already opened; stop its tracks so the mic is released.
    if (stream) {
      stream.getTracks().forEach(track => track.stop())
    }
    error.value = `Microphone error: ${e.message}`
    console.error('[Voice] Microphone error:', e)
  }
}
/**
 * Send everything recorded so far to the Whisper server as one base64 WebM blob.
 *
 * The whole accumulated recording is sent each time, because only the first
 * chunk carries the WebM header. Blobs under 5000 bytes are skipped.
 *
 * NOTE(review): when a FINAL blob is skipped for being too small, nothing is
 * sent, so no final response arrives for it. Confirm callers waiting on the
 * final transcription handle that case.
 *
 * @param isFinal true for the last request of a recording (the buffer is
 *   cleared); false for a progressive partial request (the buffer is kept).
 */
function sendAudioChunk(isFinal: boolean) {
  const pendingCount = audioChunks.length
  if (pendingCount === 0) return

  const payload = new Blob(audioChunks, { type: 'audio/webm' })

  // The WebM header alone is ~1-2KB; tiny blobs are not worth sending
  if (payload.size < 5000) {
    console.log(`[Voice] Skipping small chunk (${payload.size} bytes)`)
    if (isFinal) {
      audioChunks = []
    }
    return
  }

  // A final send ends the recording, so the buffer is emptied
  if (isFinal) {
    audioChunks = []
    lastTranscriptLength = 0
  }

  const reader = new FileReader()
  reader.onloadend = () => {
    const ws = whisperSocket
    if (!ws || ws.readyState !== WebSocket.OPEN) return

    // Data URL looks like "data:<mime>;base64,<payload>": keep only the payload
    const encoded = (reader.result as string).split(',')[1]
    const request = {
      type: 'transcribe',
      audio: encoded,
      language: 'es',
      partial: !isFinal
    }
    ws.send(JSON.stringify(request))
    console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${pendingCount} chunks, ${payload.size} bytes)`)
  }
  reader.readAsDataURL(payload)
}
/**
 * Stop a Whisper recording: cancel the periodic partial sends, send the final
 * audio, stop the recorder, release the microphone and clear `isRecording`.
 */
function stopWhisperRecording() {
  // No more progressive sends
  if (chunkInterval) {
    clearInterval(chunkInterval)
    chunkInterval = null
  }

  // Whatever has been buffered goes out as the final request
  if (audioChunks.length > 0) {
    sendAudioChunk(true)
  }

  const recorder = mediaRecorder
  if (recorder && recorder.state !== 'inactive') {
    recorder.stop()
  }

  // Stopping the tracks releases the microphone
  if (mediaStream) {
    for (const track of mediaStream.getTracks()) {
      track.stop()
    }
    mediaStream = null
  }

  isRecording.value = false
}
function toggleRecording() {
if (isRecording.value) {
stopRecording()
@@ -124,30 +444,47 @@ function toggleRecording() {
function startRecording() {
error.value = ''
if (!recognition) {
recognition = initRecognition()
}
if (recognition) {
try {
recognition.start()
isRecording.value = true
} catch (e) {
console.error('[Voice] Failed to start:', e)
if (useWhisper.value && whisperReady.value) {
// Use Whisper GPU mode
startWhisperRecording()
} else {
// Use Web Speech API
if (!recognition) {
recognition = initRecognition()
}
if (recognition) {
try {
recognition.start()
isRecording.value = true
} catch (e) {
console.error('[Voice] Failed to start:', e)
}
}
}
}
function stopRecording() {
if (recognition) {
recognition.stop()
if (useWhisper.value) {
stopWhisperRecording()
} else {
if (recognition) {
recognition.stop()
}
isRecording.value = false
}
isRecording.value = false
interimTranscript.value = ''
}
/** Reset all transcript text (final, interim and animated) and cancel any running typing animation. */
function clearTranscript() {
  for (const textRef of [transcript, interimTranscript, animatedTranscript]) {
    textRef.value = ''
  }
  lastAnimatedLength = 0

  // A queued animation step would otherwise write stale text back
  if (typingTimeout) {
    clearTimeout(typingTimeout)
    typingTimeout = null
  }
}
function connectSocket() {
@@ -209,6 +546,7 @@ function sendTranscript() {
// Close the voice panel: stop any active recording, discard the transcript, then hide the panel.
function close() {
stopRecording()
clearTranscript()
isOpen.value = false
}
@@ -290,21 +628,31 @@ function handleKeyUp(e: KeyboardEvent) {
holdTimeout = null
}
// If was push-to-talk recording, stop and send after 1200ms
// If was push-to-talk recording, continue recording for 1.5s buffer then stop
if (isPushToTalk.value && isRecording.value) {
console.log('[Voice] Stopping recording, will send in 1200ms')
stopRecording()
console.log('[Voice] Key released, continuing recording for 1.5s buffer...')
// Keep recording for 1.5s more (UX buffer for trailing words)
setTimeout(() => {
console.log('[Voice] Sending transcript:', transcript.value.trim())
console.log('[Voice] Socket state:', socket?.readyState)
if (transcript.value.trim()) {
sendTranscriptAndClose()
console.log('[Voice] Buffer complete, stopping recording')
stopRecording()
if (useWhisper.value) {
// For Whisper: wait for server response (handled in onmessage)
console.log('[Voice] Waiting for Whisper transcription...')
pendingWhisperSend = true
} else {
// No transcript, just close
isPushToTalk.value = false
close()
// For Web Speech API: send after short delay for final results
setTimeout(() => {
if (transcript.value.trim()) {
sendTranscriptAndClose()
} else {
isPushToTalk.value = false
close()
}
}, 300)
}
}, 1200)
}, 1500)
}
keyDownTime = 0
@@ -349,17 +697,74 @@ function sendTranscriptAndClose() {
typeChar()
}
onMounted(() => {
/**
 * Reveal `targetText` in `animatedTranscript` one character at a time.
 *
 * The animation resumes from `lastAnimatedLength`, so text appended to an
 * already displayed transcript only animates the new part. If the target is
 * shorter than what is currently shown, it is displayed immediately.
 *
 * @param targetText Full text that should end up displayed.
 */
function animateTyping(targetText: string) {
  // Only one animation chain may be alive at a time
  if (typingTimeout) {
    clearTimeout(typingTimeout)
    typingTimeout = null
  }

  // Shrinking text (cleared or corrected): show it as-is, nothing to type out
  if (targetText.length < animatedTranscript.value.length) {
    animatedTranscript.value = targetText
    lastAnimatedLength = targetText.length
    return
  }

  // Show the first `count` characters, then queue the next character if any remain
  const revealUpTo = (count: number) => {
    if (count > targetText.length) return

    animatedTranscript.value = targetText.substring(0, count)
    lastAnimatedLength = count

    if (count === targetText.length) return

    // 15-25ms per character
    const delay = 15 + Math.random() * 10
    typingTimeout = window.setTimeout(() => revealUpTo(count + 1), delay)
  }

  // Continue from where the previous animation stopped
  revealUpTo(lastAnimatedLength)
}
// Watch transcript changes for typing animation:
// every new transcript value becomes the target text of animateTyping().
watch(transcript, (newVal) => {
animateTyping(newVal)
})
onMounted(async () => {
  recognition = initRecognition()

  // Capture phase: the shortcuts must be seen before the terminal or other elements handle the keys
  const captureOpts = { capture: true }
  document.addEventListener('keydown', handleKeyDown, captureOpts)
  document.addEventListener('keyup', handleKeyUp, captureOpts)

  // Sync Whisper state with the server
  const status = await checkWhisperStatus()

  if (status?.starting) {
    // The page was reloaded while the server was starting: keep waiting for it.
    // Not awaited on purpose; polling reports its own outcome.
    console.log('[Voice] Server is starting, resuming polling...')
    pollWhisperStatus()
    return
  }

  if (useWhisper.value) {
    connectWhisperSocket()
  }
})
onBeforeUnmount(() => {
stopRecording()
recognition = null
disconnectSocket()
disconnectWhisperSocket()
if (chunkInterval) clearInterval(chunkInterval)
if (typingTimeout) clearTimeout(typingTimeout)
if (mediaStream) {
mediaStream.getTracks().forEach(track => track.stop())
}
document.removeEventListener('keydown', handleKeyDown, { capture: true })
document.removeEventListener('keyup', handleKeyUp, { capture: true })
document.removeEventListener('mousemove', onDrag)
@@ -408,8 +813,24 @@ defineExpose({
</svg>
<span>Voice</span>
<i class="dot" :class="{ recording: isRecording, ptt: isPushToTalk }"></i>
<span class="mode-badge" :class="{ gpu: useWhisper }">
{{ useWhisper ? 'GPU' : 'Web' }}
</span>
</div>
<div class="window-controls">
<button
class="whisper-toggle"
:class="{ active: useWhisper, loading: whisperLoading }"
:disabled="whisperLoading"
@click.stop="toggleWhisperMode"
:title="whisperLoading ? 'Starting Whisper server...' : (useWhisper ? 'Using Whisper GPU - Click to use Web Speech' : 'Using Web Speech - Click to use Whisper GPU')"
>
<svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<rect x="4" y="4" width="16" height="16" rx="2"/>
<line x1="9" y1="9" x2="9" y2="15"/>
<line x1="15" y1="9" x2="15" y2="15"/>
</svg>
</button>
<button class="x" @click="close" title="Close">
<svg width="8" height="8" viewBox="0 0 10 10">
<line x1="0" y1="0" x2="10" y2="10" stroke="currentColor" stroke-width="1.5"/>
@@ -421,10 +842,10 @@ defineExpose({
<!-- Content -->
<div class="content">
<div class="transcript" :class="{ empty: !transcript && !interimTranscript }">
<span class="final">{{ transcript }}</span>
<div class="transcript" :class="{ empty: !animatedTranscript && !interimTranscript }">
<span class="final">{{ animatedTranscript }}</span><span class="cursor" v-if="animatedTranscript && animatedTranscript.length < transcript.length">|</span>
<span class="interim">{{ interimTranscript }}</span>
<span v-if="!transcript && !interimTranscript" class="placeholder">
<span v-if="!animatedTranscript && !interimTranscript" class="placeholder">
Presiona el micrófono o mantén Ctrl+Space...
</span>
</div>
@@ -545,6 +966,58 @@ defineExpose({
box-shadow: 0 0 6px #f90;
}
.mode-badge {
font-size: 8px;
padding: 1px 4px;
border-radius: 3px;
background: rgba(0, 0, 0, 0.2);
color: #555;
font-weight: 600;
text-transform: uppercase;
}
.mode-badge.gpu {
background: linear-gradient(135deg, #10b981, #059669);
color: #fff;
box-shadow: 0 0 4px rgba(16, 185, 129, 0.5);
}
.whisper-toggle {
width: 20px;
height: 18px;
display: flex;
align-items: center;
justify-content: center;
background: rgba(255, 255, 255, 0.3);
border: 1px solid rgba(0, 0, 0, 0.1);
border-radius: 3px;
color: #666;
cursor: pointer;
transition: all 0.15s;
}
.whisper-toggle:hover:not(:disabled) {
background: rgba(255, 255, 255, 0.5);
}
.whisper-toggle:disabled {
cursor: not-allowed;
opacity: 0.6;
}
.whisper-toggle.active {
background: linear-gradient(180deg, #10b981 0%, #059669 100%);
border-color: #047857;
color: #fff;
}
.whisper-toggle.loading {
animation: pulse 0.6s infinite;
background: linear-gradient(180deg, #f59e0b 0%, #d97706 100%);
border-color: #b45309;
color: #fff;
}
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.5; }
@@ -601,6 +1074,17 @@ defineExpose({
font-style: italic;
}
.transcript .cursor {
color: #4a9;
font-weight: bold;
animation: blink 0.6s infinite;
}
@keyframes blink {
0%, 50% { opacity: 1; }
51%, 100% { opacity: 0; }
}
.transcript .placeholder {
color: #888;
}

View File

@@ -223,6 +223,112 @@ export function createGlobalHandlers(callbacks: ToolManagementCallbacks): ToolCo
}, 100)
return 'Recargando pagina...'
}
},
{
  name: 'whisper_status',
  description: 'Obtiene el estado del servidor Whisper GPU para speech-to-text.',
  category: 'global',
  schema: {
    type: 'object',
    properties: {}
  },
  // Reads GET /api/whisper/status and formats it as a plain-text report.
  // Errors are returned as text instead of being thrown.
  handler: async () => {
    try {
      const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/status`)
      const data = await res.json()
      // "Starting" is reported so a server that is still loading can be told
      // apart from one that is stopped (both show Running: No).
      return `Whisper GPU Status:\n` +
        ` Enabled: ${data.enabled ? 'Yes' : 'No'}\n` +
        ` Running: ${data.running ? 'Yes' : 'No'}\n` +
        ` Starting: ${data.starting ? 'Yes' : 'No'}\n` +
        ` Model: ${data.model}\n` +
        ` Device: ${data.device}\n` +
        ` Port: ${data.port}`
    } catch (e: any) {
      return `Error checking Whisper status: ${e.message}`
    }
  }
},
{
  name: 'whisper_toggle',
  description: 'Activa o desactiva Whisper GPU para speech-to-text. Cuando esta activo usa la GPU para transcribir voz con mejor precision para acentos latinos.',
  category: 'global',
  schema: {
    type: 'object',
    properties: {}
  },
  // Calls POST /api/whisper/toggle and reports the resulting mode as text.
  // Errors are returned as text instead of being thrown.
  handler: async () => {
    try {
      const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/toggle`, {
        method: 'POST'
      })
      const data = await res.json()
      // The server may still be loading when the toggle call returns (the voice
      // panel polls on this same flag). Reporting "DISABLED" here would be wrong.
      if (data.starting) {
        return `Whisper GPU STARTING\n\n` +
          `The server is still loading. Use whisper_status to check when it is ready.`
      }
      if (data.enabled) {
        return `Whisper GPU ENABLED\n` +
          ` Model: ${data.model}\n` +
          ` Device: ${data.device}\n` +
          ` Port: ws://localhost:${data.port}\n\n` +
          `Voice input will now use GPU-accelerated transcription.`
      } else {
        return `Whisper GPU DISABLED\n\n` +
          `Voice input will use Web Speech API (browser native).`
      }
    } catch (e: any) {
      return `Error toggling Whisper: ${e.message}`
    }
  }
},
{
  name: 'whisper_start',
  description: 'Inicia el servidor Whisper GPU si no esta corriendo.',
  category: 'global',
  schema: {
    type: 'object',
    properties: {}
  },
  // Calls POST /api/whisper/start and reports the outcome as text.
  // Errors are returned as text instead of being thrown.
  handler: async () => {
    try {
      const startUrl = `http://${window.location.hostname}:4100/api/whisper/start`
      const response = await fetch(startUrl, { method: 'POST' })
      const data = await response.json()

      if (!data.success) {
        return `Failed to start Whisper server: ${data.message}`
      }

      const readiness = data.running ? 'Yes' : 'Loading...'
      return [
        `Whisper server started!`,
        ` Model: ${data.model}`,
        ` Device: ${data.device}`,
        ` Ready: ${readiness}`
      ].join('\n')
    } catch (e: any) {
      return `Error starting Whisper: ${e.message}`
    }
  }
},
{
  name: 'whisper_stop',
  description: 'Detiene el servidor Whisper GPU para liberar memoria de la GPU.',
  category: 'global',
  schema: {
    type: 'object',
    properties: {}
  },
  // Calls POST /api/whisper/stop and reports the outcome as text.
  // Errors are returned as text instead of being thrown.
  handler: async () => {
    try {
      const stopUrl = `http://${window.location.hostname}:4100/api/whisper/stop`
      const response = await fetch(stopUrl, { method: 'POST' })
      const data = await response.json()

      return data.success
        ? `Whisper server stopped. GPU memory released.`
        : `Failed to stop Whisper server: ${data.message}`
    } catch (e: any) {
      return `Error stopping Whisper: ${e.message}`
    }
  }
}
]
}

View File

@@ -3,7 +3,10 @@
"version": "1.0.0",
"description": "Dynamic canvas for Claude Code interaction",
"scripts": {
"start": "concurrently -n server,frontend -c blue,green \"cd server && bun --watch run index.ts\" \"cd frontend && bun run dev --host\""
"start": "concurrently -n api,terminal,frontend -c blue,yellow,green \"cd server && bun --watch run index.ts\" \"cd server && bun run terminal.ts\" \"cd frontend && bun run dev --host\"",
"start:api": "cd server && bun --watch run index.ts",
"start:terminal": "cd server && bun run terminal.ts",
"start:frontend": "cd frontend && bun run dev --host"
},
"devDependencies": {
"concurrently": "^9.2.1"

View File

@@ -1,7 +1,6 @@
import { PORT_HTTP, WORKING_DIR } from './config'
import { initDatabase } from './db'
import { handleRequest } from './routes'
import { startTerminalServer } from './services/terminal'
// Initialize database
initDatabase()
@@ -12,18 +11,10 @@ Bun.serve({
fetch: handleRequest
})
console.log(`[HTTP] API running at http://localhost:${PORT_HTTP}`)
// Start Terminal WebSocket server
startTerminalServer()
// Startup summary
console.log('')
console.log('='.repeat(50))
console.log('Agent UI Server started')
console.log('Agent UI API Server (hot-reload enabled)')
console.log(` API: http://localhost:${PORT_HTTP}`)
console.log(` Terminal: ws://localhost:4103`)
console.log(` Working Dir: ${WORKING_DIR}`)
console.log('')
console.log('WebMCP starts separately with Claude Code MCP')
console.log('='.repeat(50))

View File

@@ -7,6 +7,7 @@ import { handleThemes, handleActiveTheme, handleDesignTokens, handleThemeById, h
import { handleCanvas, handleCanvasById, handleToolbarCanvas, handleDefaultCanvas, handleCanvasComponents, handleCanvasComponentById } from './canvas'
import { handleGiteaRepo, handleGiteaTree, handleGiteaFile } from './gitea'
import { handleTables, handleStats, handleTableSchema, handleTableData, handleQuery } from './database'
import { handleWhisperRoutes } from './whisper'
export async function handleRequest(req: Request): Promise<Response> {
const url = new URL(req.url)
@@ -168,5 +169,11 @@ export async function handleRequest(req: Request): Promise<Response> {
return handleQuery(req)
}
// Whisper (GPU speech-to-text)
if (path.startsWith('/api/whisper/')) {
const res = await handleWhisperRoutes(req)
if (res) return res
}
return notFoundResponse()
}

66
server/routes/whisper.ts Normal file
View File

@@ -0,0 +1,66 @@
/**
* Whisper API routes
* Control the local GPU-accelerated speech-to-text server
*/
import {
startWhisperServer,
stopWhisperServer,
toggleWhisperServer,
getWhisperState,
getWhisperPort
} from '../services/whisper'
/**
 * Dispatch a request under /api/whisper/ to the matching Whisper action.
 *
 * Routes: GET status, POST start, POST stop, POST toggle, GET port.
 *
 * @param req Incoming HTTP request.
 * @returns A JSON response for a known method + path pair, or `null` when
 *   nothing matches.
 */
export async function handleWhisperRoutes(req: Request): Promise<Response | null> {
  const { pathname } = new URL(req.url)

  switch (`${req.method} ${pathname}`) {
    // Current state
    case 'GET /api/whisper/status': {
      return Response.json(await getWhisperState())
    }

    // Start the server, then report the state it ended up in
    case 'POST /api/whisper/start': {
      const success = await startWhisperServer()
      const state = await getWhisperState()
      const message = success ? 'Whisper server started' : 'Failed to start Whisper server'
      return Response.json({ success, ...state, message })
    }

    // Stop the server, then report the state it ended up in
    case 'POST /api/whisper/stop': {
      const success = stopWhisperServer()
      const state = await getWhisperState()
      const message = success ? 'Whisper server stopped' : 'Failed to stop Whisper server'
      return Response.json({ success, ...state, message })
    }

    // Flip Whisper on/off; fields of the fresh state override those of the toggle result
    case 'POST /api/whisper/toggle': {
      const result = await toggleWhisperServer()
      const state = await getWhisperState()
      const message = state.enabled
        ? 'Whisper enabled (GPU)'
        : 'Whisper disabled (using Web Speech API)'
      return Response.json({ ...result, ...state, message })
    }

    // Where the Whisper WebSocket listens
    case 'GET /api/whisper/port': {
      return Response.json({
        port: getWhisperPort(),
        url: `ws://localhost:${getWhisperPort()}`
      })
    }

    default:
      return null
  }
}

247
server/services/whisper.ts Normal file
View File

@@ -0,0 +1,247 @@
/**
* Whisper Service - Manages the Python Whisper server process
* Provides GPU-accelerated speech-to-text as an alternative to Web Speech API
*/
import { join } from 'path'
import { Subprocess } from 'bun'
const WHISPER_PORT = 4104
const WHISPER_SCRIPT = join(import.meta.dir, '..', 'whisper_server.py')
// In-memory bookkeeping for the Whisper Python server managed by this module.
interface WhisperState {
// Set to true together with `running` once the server port is found listening.
enabled: boolean
// True once the spawned server was detected listening on the Whisper port.
running: boolean
starting: boolean // Prevents multiple simultaneous start attempts
// Handle of the spawned Python process; null when none was spawned or it exited during startup.
process: Subprocess | null
// Model name; logged when the server is started.
model: string
// NOTE(review): presumably the compute device the Python server uses; it is not read in the visible code. Confirm.
device: string
}
// Module-level singleton: Whisper starts out disabled and not running, with no process attached.
const state: WhisperState = {
enabled: false,
running: false,
starting: false,
process: null,
model: 'large-v3',
device: 'cuda'
}
/**
 * Kill any process using the given local TCP port (best effort).
 *
 * Uses PowerShell (`Get-NetTCPConnection` + `Stop-Process`), so it only has an
 * effect where `powershell` is available. Failures never propagate: they are
 * logged as a warning and the caller continues.
 *
 * @param port Local TCP port to free.
 */
async function killProcessOnPort(port: number): Promise<void> {
  try {
    // -NoProfile (as in checkPort): skip loading the user profile, which only slows the call down
    const proc = Bun.spawn(['powershell', '-NoProfile', '-Command',
      `Get-NetTCPConnection -LocalPort ${port} -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }`
    ], { stdout: 'ignore', stderr: 'ignore' })
    await proc.exited
    // Wait a moment for port to be released
    await new Promise(resolve => setTimeout(resolve, 1000))
  } catch (err: any) {
    // Best effort: e.g. PowerShell is not installed. Leave a trace instead of failing silently.
    console.warn(`[Whisper] Could not free port ${port}:`, err?.message ?? err)
  }
}
/**
 * Start the Whisper Python server and wait until it is listening.
 *
 * The port is probed 2 seconds after spawning and then every 3 seconds, up to
 * 40 more times (about 120 seconds, enough for large models to load).
 *
 * @returns true when the server is (or already was) listening. Returns false
 *   when another start is in progress, the process exited, spawning failed, or
 *   the server did not start listening in time. On timeout the spawned
 *   process is killed so it is not left running unreferenced.
 */
export async function startWhisperServer(): Promise<boolean> {
  // Prevent multiple simultaneous start attempts
  if (state.starting) {
    return false
  }

  if (state.running && state.process) {
    return true
  }

  state.starting = true
  console.log(`[Whisper] Starting (${state.model})...`)

  const sleep = (ms: number) => new Promise(resolve => setTimeout(resolve, ms))
  const FIRST_PROBE_DELAY_MS = 2000
  const PROBE_INTERVAL_MS = 3000
  const MAX_EXTRA_PROBES = 40

  let proc: Subprocess | null = null

  try {
    // Kill any existing process on the port
    await killProcessOnPort(WHISPER_PORT)

    // stdout/stderr are inherited so the server's logs show up in this console.
    // -u and PYTHONUNBUFFERED disable Python output buffering for real-time logs.
    proc = Bun.spawn(['python', '-u', WHISPER_SCRIPT], {
      cwd: join(import.meta.dir, '..'),
      stdout: 'inherit',
      stderr: 'inherit',
      env: { ...process.env, PYTHONUNBUFFERED: '1' }
    })
    state.process = proc

    for (let probe = 0; probe <= MAX_EXTRA_PROBES; probe++) {
      await sleep(probe === 0 ? FIRST_PROBE_DELAY_MS : PROBE_INTERVAL_MS)

      // The process may die while loading (missing dependency, no GPU, ...)
      if (proc.exitCode !== null) {
        console.error('[Whisper] Process exited with code:', proc.exitCode)
        state.process = null
        return false
      }

      if (await checkPort(WHISPER_PORT)) {
        console.log('[Whisper] Ready')
        state.running = true
        state.enabled = true
        return true
      }
    }

    // Still not listening: stop the process instead of leaving it running
    // while this function reports failure.
    console.error('[Whisper] Timeout (120s)')
    proc.kill()
    state.process = null
    return false

  } catch (err: any) {
    console.error('[Whisper] Error:', err.message)
    // If the failure happened after spawning, do not leave the child behind
    if (proc && proc.exitCode === null) {
      proc.kill()
    }
    state.process = null
    return false
  } finally {
    // Single place that releases the "starting" guard, whatever the outcome
    state.starting = false
  }
}
/**
 * Report whether something is listening on `port`.
 *
 * Asks PowerShell (Get-NetTCPConnection with -State Listen) and treats the
 * literal output `LISTENING` as success. Any failure to run the probe counts
 * as "not listening".
 *
 * @param port Local TCP port to probe.
 * @returns `true` only when the probe printed `LISTENING`.
 */
async function checkPort(port: number): Promise<boolean> {
  const query = `$c = Get-NetTCPConnection -LocalPort ${port} -State Listen -ErrorAction SilentlyContinue; if ($c) { Write-Output 'LISTENING' }`
  try {
    const probe = Bun.spawn(['powershell', '-NoProfile', '-Command', query], {
      stdout: 'pipe',
      stderr: 'ignore'
    })
    // Drain stdout fully before waiting for the process to exit.
    const printed = await new Response(probe.stdout).text()
    await probe.exited
    return printed.trim() === 'LISTENING'
  } catch {
    return false
  }
}
/**
 * Stop the tracked Whisper child process, if any.
 *
 * @returns `true` when there was nothing to stop or the kill call succeeded;
 *   `false` when killing the process threw.
 */
export function stopWhisperServer(): boolean {
  const child = state.process
  if (!child) {
    return true
  }

  try {
    child.kill()
    // Clear the handle and both flags together.
    Object.assign(state, { process: null, running: false, enabled: false })
    console.log('[Whisper] Stopped')
    return true
  } catch (stopError) {
    console.error('[Whisper] Stop error:', stopError)
    return false
  }
}
/**
 * Toggle the Whisper server on or off.
 *
 * Turning off is synchronous. Turning on only kicks off startWhisperServer in
 * the background and returns at once with `starting: true`; the caller is
 * expected to poll for status afterwards.
 *
 * @returns `enabled` is always `false` in the immediate response; `success`
 *   and `starting` describe what was initiated.
 */
export async function toggleWhisperServer(): Promise<{ enabled: boolean; success: boolean; starting: boolean }> {
  // A start is already in flight: refuse to toggle.
  if (state.starting) {
    return { enabled: false, success: false, starting: true }
  }

  const isActive = state.enabled && state.running
  if (!isActive) {
    // Fire and forget - the start may take a long time, so it is not awaited.
    startWhisperServer().catch(err => {
      console.error('[Whisper] Start error:', err)
      state.starting = false
    })
    return { enabled: false, success: true, starting: true }
  }

  const success = stopWhisperServer()
  return { enabled: false, success, starting: false }
}
/**
 * Snapshot of the Whisper state, reconciled with the real port status.
 *
 * Unless a start is in progress, the port is probed and the `running` /
 * `enabled` flags are aligned with the result; when the port turned out not
 * to be listening, the tracked process handle is dropped as well.
 */
export async function getWhisperState(): Promise<{
  enabled: boolean
  running: boolean
  starting: boolean
  port: number
  model: string
  device: string
}> {
  // Skip the probe while starting to avoid interfering with the start loop.
  if (!state.starting) {
    const listening = await checkPort(WHISPER_PORT)
    if (listening !== state.running) {
      // Sync state with reality
      state.running = listening
      state.enabled = listening
      if (!listening) {
        state.process = null
      }
    }
  }

  const { enabled, running, starting, model, device } = state
  return { enabled, running, starting, port: WHISPER_PORT, model, device }
}
/**
 * Whether Whisper is both enabled and running, based on the cached flags
 * only (no port probe is performed here).
 */
export function isWhisperEnabled(): boolean {
  const { enabled, running } = state
  return enabled && running
}
// Reserved handle for a Whisper WebSocket server (proxying to the Python
// server or handling requests directly). NOTE(review): nothing in the visible
// part of this file assigns or reads it - confirm whether it is still needed.
let whisperWsServer: any = null

/** Port number used for the Whisper WebSocket server. */
export function getWhisperPort(): number {
  return WHISPER_PORT
}

22
server/terminal.ts Normal file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bun
/**
* Terminal Server - Independent process
* This runs separately from the main server to maintain stable Claude Code sessions
* even when the main server restarts due to code changes.
*/
import { startTerminalServer } from './services/terminal'
import { WORKING_DIR } from './config'
// Startup banner, printed one line per console.log call.
const rule = '='.repeat(50)
const bannerLines = [
  '',
  rule,
  'Terminal Server (Independent Process)',
  ` WebSocket: ws://localhost:4103`,
  ` Working Dir: ${WORKING_DIR}`,
  '',
  'This process is stable and won\'t restart',
  'when the main server reloads.',
  rule,
  ''
]
for (const line of bannerLines) {
  console.log(line)
}

// Hand over to the terminal service once the banner is out.
startTerminalServer()

314
server/whisper_server.py Normal file
View File

@@ -0,0 +1,314 @@
#!/usr/bin/env python3
"""
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
WebSocket server that receives audio and returns transcriptions
"""
import asyncio
import json
import sys
import io
import wave
import tempfile
import os
import subprocess
from pathlib import Path
try:
import websockets
from faster_whisper import WhisperModel
except ImportError as e:
print(f"Missing dependency: {e}")
print("Run: pip install faster-whisper websockets")
sys.exit(1)
def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> "bytes | None":
    """
    Convert audio data to WAV format using ffmpeg.

    The data is written to a file inside a private temporary directory,
    converted by ffmpeg to 16 kHz mono 16-bit PCM, and the resulting WAV
    bytes are read back. The directory and both files are removed before
    returning, on every path.

    Args:
        input_data: Raw encoded audio bytes (browsers typically record WebM/Opus).
        input_format: File extension (without dot) that tells ffmpeg what the
            input is, e.g. "webm".

    Returns:
        The WAV file contents, or None when ffmpeg is missing, fails, times
        out (30 s), or any other error occurs. Errors are printed, not raised.
    """
    try:
        # Input and output get distinct fixed names inside a fresh directory.
        # Deriving the output name by replacing the extension in the input
        # path would make both paths equal for input_format="wav" and could
        # also rewrite a matching substring in the directory part.
        with tempfile.TemporaryDirectory(prefix="whisper_") as work_dir:
            input_path = os.path.join(work_dir, f"input.{input_format}")
            output_path = os.path.join(work_dir, "output.wav")

            with open(input_path, "wb") as in_file:
                in_file.write(input_data)

            # Use ffmpeg to convert to WAV (16kHz mono, which Whisper prefers)
            result = subprocess.run([
                "ffmpeg", "-y",        # Overwrite output
                "-i", input_path,      # Input file
                "-ar", "16000",        # Sample rate 16kHz
                "-ac", "1",            # Mono
                "-c:a", "pcm_s16le",   # PCM 16-bit little-endian
                output_path
            ], capture_output=True, text=True, timeout=30)

            if result.returncode != 0:
                print(f"[Whisper] ffmpeg error: {result.stderr}")
                return None

            # Read the converted WAV file before the directory is removed
            with open(output_path, "rb") as f:
                return f.read()
    except subprocess.TimeoutExpired:
        print("[Whisper] ffmpeg conversion timed out")
        return None
    except FileNotFoundError:
        print("[Whisper] ffmpeg not found - please install ffmpeg")
        return None
    except Exception as e:
        print(f"[Whisper] Conversion error: {e}")
        return None
# ---------------------------------------------------------------------------
# Server configuration
# ---------------------------------------------------------------------------
HOST = "localhost"
PORT = 4104
MODEL_SIZE = "large-v3"   # Best standard model for Spanish
DEVICE = "cuda"           # cuda or cpu
COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

# Display name: the part after the last "/" when MODEL_SIZE is given as a path,
# otherwise MODEL_SIZE itself.
MODEL_NAME = MODEL_SIZE.rsplit("/", 1)[-1]

# Context prompt passed to the transcriber as `initial_prompt` to improve
# accuracy (Honduran Spanish plus software-development vocabulary).
INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,
componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.
Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.
Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."""

# Process-wide model instance and a flag marking a load in progress.
model = None
model_loading = False
async def load_model():
    """
    Return the shared Whisper model, loading it on first use.

    The model is first created with the configured DEVICE / COMPUTE_TYPE. If
    that raises, a second attempt is made on CPU with int8, and DEVICE is
    updated to "cpu" so that status and transcription messages report the
    device actually in use.

    Returns:
        The loaded model instance (also stored in the module-level `model`).

    Raises:
        Whatever the CPU fallback raises when it fails too. The
        `model_loading` flag is cleared in that case as well, so callers
        waiting on it are not left spinning forever.
    """
    global model, model_loading, DEVICE

    if model is not None:
        return model

    if model_loading:
        # Wait for model to finish loading
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    download_root = str(Path.home() / ".cache" / "whisper")
    print(f"[Whisper] Loading model '{MODEL_NAME}' on {DEVICE}...")

    try:
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=download_root
            )
            print(f"[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=download_root
            )
            # Only reached when the CPU model was created successfully.
            DEVICE = "cpu"
    finally:
        # Always release the flag, even when both attempts failed.
        model_loading = False

    return model
def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
    """
    Transcribe audio data using the already-loaded Whisper model.

    Args:
        audio_data: Encoded audio bytes.
        language: Language code passed to the model.
        is_webm: When True the data is first converted from WebM to WAV via
            convert_audio_to_wav; when False it is used as-is.

    Returns:
        On success a dict with "success", "text", "language",
        "language_probability", "duration", "segments", "engine", "model"
        and "device". On failure a dict with a single "error" key (model not
        loaded, conversion failed, or the transcription raised).
    """
    if model is None:
        return {"error": "Model not loaded"}

    # Convert WebM to WAV if needed
    if is_webm:
        wav_data = convert_audio_to_wav(audio_data, "webm")
        if wav_data is None:
            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
    else:
        wav_data = audio_data

    # The model is handed a file path, so write the audio to a temp file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(wav_data)
        temp_path = f.name

    try:
        # Transcribe with optimized parameters
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            best_of=5,                       # Number of candidates when sampling
            temperature=0.0,                 # Use greedy decoding (most accurate)
            vad_filter=True,                 # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=300,  # Shorter silence detection
                speech_pad_ms=200,            # Padding around speech
                threshold=0.5                 # VAD sensitivity (lower = more sensitive)
            ),
            initial_prompt=INITIAL_PROMPT,   # Context for better Spanish transcription
            condition_on_previous_text=True,  # Use context from previous segments
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
            compression_ratio_threshold=2.4,
            word_timestamps=False            # Faster without word-level timestamps
        )

        # Consume the segments inside the try block so that errors raised
        # while iterating are reported as an "error" result too.
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list)

        return {
            "success": True,
            "text": text.strip(),
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_NAME,
            "device": DEVICE
        }
    except Exception as e:
        print(f"[Whisper] Transcription error: {e}")
        return {"error": str(e)}
    finally:
        # Cleanup temp file; only filesystem errors are ignored so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        try:
            os.unlink(temp_path)
        except OSError:
            pass
async def handle_client(websocket):
    """
    Handle one WebSocket client connection.

    After making sure the model is loaded, a "ready" message is sent and the
    incoming messages are processed until the client disconnects:

    * binary message: treated as WebM audio, transcribed in Spanish ("es");
    * JSON {"type": "transcribe", "audio": <base64>, "language"?, "partial"?}:
      transcribed in the given language; "partial": true is echoed back;
    * JSON {"type": "ping"}: answered with {"type": "pong"};
    * JSON {"type": "status"}: answered with model, device and readiness.

    Malformed input (invalid JSON, a JSON value that is not an object, or
    undecodable base64 audio) is answered with a {"type": "error"} message
    and the connection stays open. Other command types are ignored.
    """
    # Function-scope import: only needed for JSON "transcribe" commands.
    import base64

    async def send_json(payload: dict) -> None:
        await websocket.send(json.dumps(payload))

    async def send_error(text: str) -> None:
        await send_json({"type": "error", "message": text})

    async def transcribe(audio: bytes, language: str) -> dict:
        # Transcription is blocking; run it in the default thread pool so the
        # event loop keeps serving other clients.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, transcribe_audio, audio, language, True)

    # Ensure model is loaded
    await load_model()

    try:
        # Send ready message
        await send_json({
            "type": "ready",
            "model": MODEL_NAME,
            "device": DEVICE
        })

        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data (likely WebM format from browser)
                result = await transcribe(message, "es")
                await send_json({"type": "transcription", **result})
                continue

            # Text frame: JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await send_error("Invalid JSON")
                continue

            if not isinstance(cmd, dict):
                # Valid JSON but not an object (e.g. a list or a number)
                await send_error("Expected a JSON object")
                continue

            cmd_type = cmd.get("type")
            if cmd_type == "transcribe":
                # Audio data sent as base64 (WebM format from browser)
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error is a ValueError; TypeError covers
                    # non-string "audio" values.
                    await send_error("Invalid base64 audio")
                    continue

                language = cmd.get("language", "es")
                result = await transcribe(audio_data, language)

                # Add partial flag to result
                if cmd.get("partial", False):
                    result["partial"] = True

                await send_json({"type": "transcription", **result})
            elif cmd_type == "ping":
                await send_json({"type": "pong"})
            elif cmd_type == "status":
                await send_json({
                    "type": "status",
                    "model": MODEL_NAME,
                    "device": DEVICE,
                    "ready": model is not None
                })
    except websockets.exceptions.ConnectionClosed:
        pass
    except Exception as e:
        print(f"[Whisper] Error: {e}")
async def main():
    """
    Entry coroutine: log the configuration, load the model up front, then
    serve WebSocket clients on HOST:PORT until the process is stopped.
    """
    print(f"[Whisper] Model: {MODEL_NAME} | Device: {DEVICE} | Port: {PORT}")

    # Load the model before accepting connections, so "Ready" is only printed
    # once both the model and the listening socket are available.
    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print(f"[Whisper] Ready")
        # A future that is never resolved keeps the server running.
        never_done = asyncio.Future()
        await never_done
if __name__ == "__main__":
    # No "install websockets if needed" step here: the dependency guard at
    # the top of this file exits with an install hint when `websockets` or
    # `faster_whisper` cannot be imported, so by this point the import has
    # already succeeded and a pip-install fallback could never run.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to stop the server; exit without a traceback.
        pass