diff --git a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx index 747c61fd0b..fd467ac48c 100644 --- a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx +++ b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx @@ -36,6 +36,7 @@ import { useToast } from "../../ui/toast" import { useKV } from "../../context/kv" import { useTextareaKeybindings } from "../textarea-keybindings" import { DialogSkill } from "../dialog-skill" +import { VoiceInput, VoiceMetadataStore } from "@/voice" export type PromptProps = { sessionID?: string @@ -165,6 +166,9 @@ export function Prompt(props: PromptProps) { extmarkToPartIndex: Map interrupt: number placeholder: number + recording: boolean + interimText: string + voiceMeta: string }>({ placeholder: randomIndex(list().length), prompt: { @@ -174,6 +178,9 @@ export function Prompt(props: PromptProps) { mode: "normal", extmarkToPartIndex: new Map(), interrupt: 0, + recording: false, + interimText: "", + voiceMeta: "", }) createEffect( @@ -261,6 +268,11 @@ export function Prompt(props: PromptProps) { onSelect: (dialog) => { if (autocomplete.visible) return if (!input.focused) return + if (store.recording) { + voice?.stop() + dialog.clear() + return + } // TODO: this should be its own command if (store.mode === "shell") { setStore("mode", "normal") @@ -370,6 +382,16 @@ export function Prompt(props: PromptProps) { input.cursorOffset = Bun.stringWidth(content) }, }, + { + title: "Voice dictate", + value: "voice.dictate", + keybind: "voice_dictate", + category: "Prompt", + onSelect: async (dialog) => { + dialog.clear() + await toggleVoice() + }, + }, { title: "Skills", value: "prompt.skills", @@ -587,6 +609,7 @@ export function Prompt(props: PromptProps) { ]) async function submit() { + if (store.recording) await voice?.stop() if (props.disabled) return if (autocomplete?.visible) return if (!store.prompt.input) return @@ -624,6 +647,12 @@ export 
function Prompt(props: PromptProps) { const messageID = MessageID.ascending() let inputText = store.prompt.input + // Append voice metadata context if voice was used during this prompt + const voiceContext = voiceMetaStore.toMarkdown() + if (voiceContext) { + inputText = inputText + "\n\n\n" + voiceContext + "\n" + } + // Expand pasted text inline before submitting const allExtmarks = input.extmarks.getAllForTypeId(promptPartTypeId) const sortedExtmarks = allExtmarks.sort((a: { start: number }, b: { start: number }) => b.start - a.start) @@ -732,6 +761,86 @@ export function Prompt(props: PromptProps) { } const exit = useExit() + // ── Voice dictation ────────────────────────────────────────────── + let voice: VoiceInput | undefined + const voiceMetaStore = new VoiceMetadataStore() + + function getVoice(): VoiceInput { + if (!voice) { + voice = new VoiceInput() + voice.onTranscript = (seg) => { + // Ingest metadata regardless of final/partial + voiceMetaStore.ingest(seg) + setStore("voiceMeta", voiceMetaStore.shortSummary) + + if (seg.is_final && seg.text.trim()) { + if (!input || input.isDestroyed) return + input.insertText(seg.text.trim() + " ") + setStore("prompt", "input", input.plainText) + setStore("interimText", "") + setTimeout(() => { + if (!input || input.isDestroyed) return + input.getLayoutNode().markDirty() + renderer.requestRender() + }, 0) + } else if (!seg.is_final) { + setStore("interimText", seg.text) + } + } + voice.onError = (err) => { + toast.show({ + message: `Voice: ${err.message}`, + variant: "error", + duration: 3000, + }) + setStore("recording", false) + setStore("interimText", "") + setStore("voiceMeta", "") + } + voice.onStateChange = (active) => { + setStore("recording", active) + if (active) { + voiceMetaStore.startSession() + } else { + voiceMetaStore.endSession() + setStore("interimText", "") + } + } + } + return voice + } + + async function toggleVoice() { + if (props.disabled) return + if (store.mode === "shell") return + try { + 
await getVoice().toggle() + } catch (err) { + const msg = err instanceof Error ? err.message : "Voice input failed" + toast.show({ + message: msg.includes("not found") || msg.includes("ENOENT") ? "Voice requires sox: brew install sox" : `Voice: ${msg}`, + variant: "warning", + duration: 5000, + }) + setStore("recording", false) + } + } + + onCleanup(() => { + voice?.stop() + }) + + createEffect( + on( + () => props.sessionID, + () => { + voice?.stop() + }, + { defer: true }, + ), + ) + // ── End voice ──────────────────────────────────────────────────── + function pasteText(text: string, virtualText: string) { const currentOffset = input.visualCursor.offset const extmarkStart = currentOffset @@ -945,6 +1054,11 @@ export function Prompt(props: PromptProps) { setStore("extmarkToPartIndex", new Map()) return } + if (keybind.match("voice_dictate", e)) { + e.preventDefault() + toggleVoice() + return + } if (keybind.match("app_exit", e)) { if (store.prompt.input === "") { await exit() @@ -1106,6 +1220,17 @@ export function Prompt(props: PromptProps) { {local.model.variant.current()} + + · + + ● + + + {store.interimText + ? `"${store.interimText.slice(0, 40)}${store.interimText.length > 40 ? 
"…" : ""}"` + : store.voiceMeta || "listening…"} + + @@ -1245,6 +1370,11 @@ export function Prompt(props: PromptProps) { {keybind.print("command_list")} commands + + + {keybind.print("voice_dictate")} stop + + diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts index efae2ca551..024a444452 100644 --- a/packages/opencode/src/config/config.ts +++ b/packages/opencode/src/config/config.ts @@ -762,6 +762,7 @@ export namespace Config { tips_toggle: z.string().optional().default("h").describe("Toggle tips on home screen"), plugin_manager: z.string().optional().default("none").describe("Open plugin manager dialog"), display_thinking: z.string().optional().default("none").describe("Toggle thinking blocks visibility"), + voice_dictate: z.string().optional().default("alt+v").describe("Toggle voice dictation"), }) .strict() .meta({ diff --git a/packages/opencode/src/voice/asr-client.ts b/packages/opencode/src/voice/asr-client.ts new file mode 100644 index 0000000000..4f0811f8e9 --- /dev/null +++ b/packages/opencode/src/voice/asr-client.ts @@ -0,0 +1,146 @@ +export interface TranscriptSegment { + text: string + is_final: boolean + metadata?: { + emotion?: string + intent?: string + gender?: string + age?: string + } + metadata_probs?: { + emotion?: Array<{ token: string; probability: number }> + intent?: Array<{ token: string; probability: number }> + } + speech_rate?: { + words_per_minute: number + filler_count: number + filler_rate: number + pause_count: number + } +} + +export class AsrStreamClient { + private ws: WebSocket | null = null + private url: string + private language: string + private sampleRate: number + private endResolve: (() => void) | null = null + + onTranscript: ((seg: TranscriptSegment) => void) | null = null + onError: ((err: Error) => void) | null = null + + private token: string + + constructor(opts?: { url?: string; language?: string; sampleRate?: number; token?: string }) { + const base = opts?.url ?? 
process.env.WHISSLE_ASR_URL ?? "wss://api.whissle.ai/asr/stream"
+    this.token = opts?.token ?? process.env.WHISSLE_AUTH_TOKEN ?? ""
+    this.url = this.token ? `${base}?token=${encodeURIComponent(this.token)}` : base
+    this.language = opts?.language ?? process.env.WHISSLE_ASR_LANGUAGE ?? "en"
+    this.sampleRate = opts?.sampleRate ?? 16000
+  }
+
+  get connected(): boolean {
+    return this.ws?.readyState === WebSocket.OPEN
+  }
+
+  connect(): Promise<void> {
+    return new Promise<void>((resolve, reject) => {
+      if (this.connected) {
+        resolve()
+        return
+      }
+
+      const ws = new WebSocket(this.url)
+      ws.binaryType = "arraybuffer"
+
+      const timeout = setTimeout(() => {
+        reject(new Error("Voice server connection timed out"))
+        try {
+          ws.close()
+        } catch {}
+      }, 10_000)
+
+      ws.addEventListener("open", () => {
+        clearTimeout(timeout)
+        this.ws = ws
+        this.sendConfig()
+        resolve()
+      })
+
+      ws.addEventListener("message", (ev: MessageEvent) => {
+        if (typeof ev.data !== "string") return
+        try {
+          const msg = JSON.parse(ev.data)
+          if (msg.type === "transcript") {
+            this.onTranscript?.({
+              text: msg.text ?? "",
+              is_final: msg.is_final !== false,
+              metadata: msg.metadata,
+              metadata_probs: msg.metadata_probs,
+              speech_rate: msg.speech_rate,
+            })
+          } else if (msg.type === "end") {
+            this.endResolve?.()
+            this.endResolve = null
+            this.close()
+          } else if (msg.type === "error") {
+            this.onError?.(new Error(msg.message ?? "ASR error"))
+          }
+        } catch {}
+      })
+
+      ws.addEventListener("error", () => {
+        clearTimeout(timeout)
+        const err = new Error("Voice server connection failed")
+        reject(err)
+        this.onError?.(err)
+      })
+
+      ws.addEventListener("close", () => {
+        clearTimeout(timeout)
+        this.endResolve?.()
+        this.endResolve = null
+        this.ws = null
+      })
+    })
+  }
+
+  private sendConfig(): void {
+    this.ws?.send(
+      JSON.stringify({
+        type: "config",
+        language: this.language,
+        use_lm: true,
+        sample_rate: this.sampleRate,
+        metadata_prob: true,
+        word_timestamps: true,
+      }),
+    )
+  }
+
+  sendPcm(pcm: Buffer): void {
+    if (!this.connected || !this.ws) return
+    const ab = pcm.buffer.slice(pcm.byteOffset, pcm.byteOffset + pcm.byteLength)
+    this.ws.send(ab)
+  }
+
+  end(): Promise<void> {
+    if (!this.connected || !this.ws) return Promise.resolve()
+    return new Promise<void>((resolve) => {
+      this.endResolve = resolve
+      this.ws!.send(JSON.stringify({ type: "end" }))
+      setTimeout(() => {
+        this.endResolve?.()
+        this.endResolve = null
+        this.close()
+      }, 5_000)
+    })
+  }
+
+  close(): void {
+    try {
+      this.ws?.close()
+    } catch {}
+    this.ws = null
+  }
+}
diff --git a/packages/opencode/src/voice/index.ts b/packages/opencode/src/voice/index.ts
new file mode 100644
index 0000000000..c3e3aeab9a
--- /dev/null
+++ b/packages/opencode/src/voice/index.ts
@@ -0,0 +1,73 @@
+import { MicCapture } from "./mic"
+import { AsrStreamClient } from "./asr-client"
+
+export type { TranscriptSegment } from "./asr-client"
+export type { VoiceMetadata } from "./metadata"
+export { VoiceMetadataStore } from "./metadata"
+
+export class VoiceInput {
+  private mic = new MicCapture()
+  private asr: AsrStreamClient
+  private _active = false
+
+  onTranscript: ((seg: {
+    text: string
+    is_final: boolean
+    metadata?: { emotion?: string; intent?: string; gender?: string; age?: string }
+    metadata_probs?: {
+      emotion?: Array<{ token: string; probability: number }>
+      intent?: Array<{ token: string; probability: number }>
+    }
+    speech_rate?: {
words_per_minute: number; filler_count: number; filler_rate: number; pause_count: number }
+  }) => void) | null = null
+  onError: ((err: Error) => void) | null = null
+  onStateChange: ((active: boolean) => void) | null = null
+
+  constructor(opts?: { url?: string; language?: string; token?: string }) {
+    this.asr = new AsrStreamClient(opts)
+  }
+
+  get active(): boolean {
+    return this._active
+  }
+
+  async start(): Promise<void> {
+    if (this._active) return
+
+    this.asr.onTranscript = (seg) => this.onTranscript?.(seg)
+    this.asr.onError = (err) => {
+      this.onError?.(err)
+      this.stop()
+    }
+
+    await this.asr.connect()
+
+    this.mic.onData = (pcm) => this.asr.sendPcm(pcm)
+    this.mic.onError = (err) => {
+      this.onError?.(err)
+      this.stop()
+    }
+
+    await this.mic.start()
+    this._active = true
+    this.onStateChange?.(true)
+  }
+
+  async stop(): Promise<void> {
+    if (!this._active) return
+    this._active = false
+    this.mic.stop()
+    try {
+      await this.asr.end()
+    } catch {}
+    this.onStateChange?.(false)
+  }
+
+  async toggle(): Promise<void> {
+    if (this._active) {
+      await this.stop()
+    } else {
+      await this.start()
+    }
+  }
+}
diff --git a/packages/opencode/src/voice/metadata.ts b/packages/opencode/src/voice/metadata.ts
new file mode 100644
index 0000000000..5ef26b6a57
--- /dev/null
+++ b/packages/opencode/src/voice/metadata.ts
@@ -0,0 +1,214 @@
+import type { TranscriptSegment } from "./asr-client"
+
+/**
+ * Accumulated voice metadata for a session — emotion trends, intent patterns,
+ * speech rate stats. Designed to be serialized to voice-metadata.md and
+ * injected as context so the LLM can adapt to the user's vocal state.
+ */
+export interface VoiceMetadata {
+  /** Running count of voice segments processed */
+  segmentCount: number
+  /** Emotion frequency: { happy: 5, frustrated: 2, ... } */
+  emotionCounts: Record<string, number>
+  /** Intent frequency: { question: 3, command: 7, ... } */
+  intentCounts: Record<string, number>
+  /** Current/latest emotion detected */
+  currentEmotion: string
+  /** Current/latest intent detected */
+  currentIntent: string
+  /** Average words per minute across segments */
+  avgWpm: number
+  /** Total filler count (um, uh, like) */
+  totalFillers: number
+  /** Total pause count */
+  totalPauses: number
+  /** Recent emotion sequence (last 10) for trend detection */
+  recentEmotions: string[]
+  /** Timestamps of recording sessions */
+  sessions: Array<{ start: number; end?: number; segmentCount: number }>
+}
+
+/**
+ * Accumulates voice metadata across a session and produces a markdown
+ * summary suitable for injection into LLM context.
+ */
+export class VoiceMetadataStore {
+  private data: VoiceMetadata = {
+    segmentCount: 0,
+    emotionCounts: {},
+    intentCounts: {},
+    currentEmotion: "",
+    currentIntent: "",
+    avgWpm: 0,
+    totalFillers: 0,
+    totalPauses: 0,
+    recentEmotions: [],
+    sessions: [],
+  }
+
+  private wpmSamples: number[] = []
+  private sessionStart: number | null = null
+
+  get current(): Readonly<VoiceMetadata> {
+    return this.data
+  }
+
+  /** Call when recording starts */
+  startSession(): void {
+    this.sessionStart = Date.now()
+    this.data.sessions.push({ start: this.sessionStart, segmentCount: 0 })
+  }
+
+  /** Call when recording stops */
+  endSession(): void {
+    const last = this.data.sessions[this.data.sessions.length - 1]
+    if (last && !last.end) {
+      last.end = Date.now()
+    }
+    this.sessionStart = null
+  }
+
+  /** Ingest a transcript segment's metadata */
+  ingest(seg: TranscriptSegment): void {
+    if (!seg.is_final) return
+
+    this.data.segmentCount++
+
+    // Track the current session's segment count
+    const last = this.data.sessions[this.data.sessions.length - 1]
+    if (last && !last.end) last.segmentCount++
+
+    // Emotion
+    const emotion = seg.metadata?.emotion
+    if (emotion) {
+      this.data.emotionCounts[emotion] = (this.data.emotionCounts[emotion] ??
0) + 1
+      this.data.currentEmotion = emotion
+      this.data.recentEmotions.push(emotion)
+      if (this.data.recentEmotions.length > 10) {
+        this.data.recentEmotions.shift()
+      }
+    }
+
+    // Intent
+    const intent = seg.metadata?.intent
+    if (intent) {
+      this.data.intentCounts[intent] = (this.data.intentCounts[intent] ?? 0) + 1
+      this.data.currentIntent = intent
+    }
+
+    // Speech rate
+    if (seg.speech_rate) {
+      if (seg.speech_rate.words_per_minute > 0) {
+        this.wpmSamples.push(seg.speech_rate.words_per_minute)
+        this.data.avgWpm = Math.round(
+          this.wpmSamples.reduce((a, b) => a + b, 0) / this.wpmSamples.length,
+        )
+      }
+      this.data.totalFillers += seg.speech_rate.filler_count
+      this.data.totalPauses += seg.speech_rate.pause_count
+    }
+  }
+
+  /** Dominant emotion (most frequent) */
+  get dominantEmotion(): string {
+    let max = 0
+    let best = ""
+    for (const [emotion, count] of Object.entries(this.data.emotionCounts)) {
+      if (count > max) {
+        max = count
+        best = emotion
+      }
+    }
+    return best
+  }
+
+  /** Detect emotional trend from recent sequence */
+  get emotionTrend(): string {
+    const recent = this.data.recentEmotions
+    if (recent.length < 3) return ""
+    const last3 = recent.slice(-3)
+    if (last3.every((e) => e === last3[0])) return `consistently ${last3[0]}`
+    // Check for shift
+    const first = recent.slice(0, Math.floor(recent.length / 2))
+    const second = recent.slice(Math.floor(recent.length / 2))
+    const mode = (arr: string[]) => {
+      const counts: Record<string, number> = {}
+      for (const v of arr) counts[v] = (counts[v] ?? 0) + 1
+      return Object.entries(counts).sort((a, b) => b[1] - a[1])[0]?.[0] ?? ""
+    }
+    const firstMode = mode(first)
+    const secondMode = mode(second)
+    if (firstMode && secondMode && firstMode !== secondMode) {
+      return `shifting from ${firstMode} to ${secondMode}`
+    }
+    return ""
+  }
+
+  /** Short single-line summary for TUI display */
+  get shortSummary(): string {
+    const parts: string[] = []
+    if (this.data.currentEmotion) parts.push(this.data.currentEmotion)
+    if (this.data.currentIntent) parts.push(this.data.currentIntent)
+    if (this.data.avgWpm > 0) parts.push(`${this.data.avgWpm}wpm`)
+    return parts.join(" · ")
+  }
+
+  /**
+   * Generate a markdown summary for LLM context injection.
+   * Compact enough to not waste tokens, rich enough to be useful.
+   */
+  toMarkdown(): string {
+    const d = this.data
+    if (d.segmentCount === 0) return ""
+
+    const lines: string[] = ["## Voice Session Context"]
+    lines.push("")
+
+    // Current state
+    const current: string[] = []
+    if (d.currentEmotion) current.push(`**Emotion:** ${d.currentEmotion}`)
+    if (d.currentIntent) current.push(`**Intent:** ${d.currentIntent}`)
+    if (current.length) {
+      lines.push(`Current: ${current.join(", ")}`)
+    }
+
+    // Trend
+    const trend = this.emotionTrend
+    if (trend) lines.push(`Trend: ${trend}`)
+
+    // Emotion distribution (top 3)
+    const sortedEmotions = Object.entries(d.emotionCounts)
+      .sort((a, b) => b[1] - a[1])
+      .slice(0, 3)
+    if (sortedEmotions.length) {
+      const total = Object.values(d.emotionCounts).reduce((a, b) => a + b, 0)
+      const dist = sortedEmotions
+        .map(([e, c]) => `${e} ${Math.round((c / total) * 100)}%`)
+        .join(", ")
+      lines.push(`Emotions: ${dist}`)
+    }
+
+    // Intent distribution (top 3)
+    const sortedIntents = Object.entries(d.intentCounts)
+      .sort((a, b) => b[1] - a[1])
+      .slice(0, 3)
+    if (sortedIntents.length) {
+      const total = Object.values(d.intentCounts).reduce((a, b) => a + b, 0)
+      const dist = sortedIntents
+        .map(([i, c]) => `${i} ${Math.round((c / total) * 100)}%`)
+        .join(", ")
+      lines.push(`Intents: ${dist}`)
+    }
+
+    // Speech stats
const stats: string[] = []
+    if (d.avgWpm > 0) stats.push(`${d.avgWpm} wpm`)
+    if (d.totalFillers > 0) stats.push(`${d.totalFillers} fillers`)
+    if (d.totalPauses > 0) stats.push(`${d.totalPauses} pauses`)
+    if (stats.length) lines.push(`Speech: ${stats.join(", ")}`)
+
+    lines.push(`Segments: ${d.segmentCount}`)
+
+    return lines.join("\n")
+  }
+}
diff --git a/packages/opencode/src/voice/mic.ts b/packages/opencode/src/voice/mic.ts
new file mode 100644
index 0000000000..abb5084a6a
--- /dev/null
+++ b/packages/opencode/src/voice/mic.ts
@@ -0,0 +1,78 @@
+import { spawn, type ChildProcess } from "child_process"
+import which from "which"
+
+export class MicCapture {
+  private proc: ChildProcess | null = null
+
+  onData: ((pcm: Buffer) => void) | null = null
+  onError: ((err: Error) => void) | null = null
+
+  get recording(): boolean {
+    return this.proc !== null && !this.proc.killed
+  }
+
+  async start(): Promise<void> {
+    if (this.proc) return
+
+    const cmd = await MicCapture.findCommand()
+    if (!cmd) {
+      throw new Error("sox not found. Install it: brew install sox (macOS) / apt install sox (Linux)")
+    }
+
+    // Capture at device native rate, resample to 16kHz via the "rate" effect.
+    // Don't pass -r 16000 — on macOS CoreAudio it tries to set the device to
+    // 16kHz, fails, falls back to 48kHz, and outputs 48kHz raw PCM.
+    // The "rate 16000" effect after "-" does the actual resampling.
+    const args =
+      cmd === "rec"
+        ? ["-q", "-t", "raw", "-b", "16", "-c", "1", "-e", "signed-integer", "-", "rate", "16000"]
+        : ["-d", "-q", "-t", "raw", "-b", "16", "-c", "1", "-e", "signed-integer", "-", "rate", "16000"]
+
+    this.proc = spawn(cmd, args, {
+      stdio: ["ignore", "pipe", "pipe"],
+    })
+
+    this.proc.stdout!.on("data", (chunk: Buffer) => {
+      this.onData?.(chunk)
+    })
+
+    this.proc.stderr!.on("data", (data: Buffer) => {
+      const msg = data.toString().trim()
+      // sox emits WARN about sample rate when resampling — expected, not an error
+      if (msg && !msg.includes("WARN") && !msg.includes("can't set sample rate")) {
+        this.onError?.(new Error(`mic: ${msg}`))
+      }
+    })
+
+    this.proc.on("error", (err) => {
+      this.onError?.(err)
+      this.proc = null
+    })
+
+    this.proc.on("exit", () => {
+      this.proc = null
+    })
+  }
+
+  stop(): void {
+    if (!this.proc) return
+    try {
+      this.proc.kill("SIGTERM")
+    } catch {}
+    this.proc = null
+  }
+
+  private static async findCommand(): Promise<string | null> {
+    for (const cmd of ["rec", "sox"]) {
+      try {
+        await which(cmd)
+        return cmd
+      } catch {}
+    }
+    return null
+  }
+
+  static async available(): Promise<boolean> {
+    return (await MicCapture.findCommand()) !== null
+  }
+}