pull/21459/merge
Karan Singla 2026-04-07 22:50:52 -07:00 committed by GitHub
commit 970fecd8fd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 642 additions and 0 deletions

View File

@ -36,6 +36,7 @@ import { useToast } from "../../ui/toast"
import { useKV } from "../../context/kv"
import { useTextareaKeybindings } from "../textarea-keybindings"
import { DialogSkill } from "../dialog-skill"
import { VoiceInput, VoiceMetadataStore } from "@/voice"
export type PromptProps = {
sessionID?: string
@ -165,6 +166,9 @@ export function Prompt(props: PromptProps) {
extmarkToPartIndex: Map<number, number>
interrupt: number
placeholder: number
recording: boolean
interimText: string
voiceMeta: string
}>({
placeholder: randomIndex(list().length),
prompt: {
@ -174,6 +178,9 @@ export function Prompt(props: PromptProps) {
mode: "normal",
extmarkToPartIndex: new Map(),
interrupt: 0,
recording: false,
interimText: "",
voiceMeta: "",
})
createEffect(
@ -261,6 +268,11 @@ export function Prompt(props: PromptProps) {
onSelect: (dialog) => {
if (autocomplete.visible) return
if (!input.focused) return
if (store.recording) {
voice?.stop()
dialog.clear()
return
}
// TODO: this should be its own command
if (store.mode === "shell") {
setStore("mode", "normal")
@ -370,6 +382,16 @@ export function Prompt(props: PromptProps) {
input.cursorOffset = Bun.stringWidth(content)
},
},
{
title: "Voice dictate",
value: "voice.dictate",
keybind: "voice_dictate",
category: "Prompt",
onSelect: async (dialog) => {
dialog.clear()
await toggleVoice()
},
},
{
title: "Skills",
value: "prompt.skills",
@ -587,6 +609,7 @@ export function Prompt(props: PromptProps) {
])
async function submit() {
if (store.recording) await voice?.stop()
if (props.disabled) return
if (autocomplete?.visible) return
if (!store.prompt.input) return
@ -624,6 +647,12 @@ export function Prompt(props: PromptProps) {
const messageID = MessageID.ascending()
let inputText = store.prompt.input
// Append voice metadata context if voice was used during this prompt
const voiceContext = voiceMetaStore.toMarkdown()
if (voiceContext) {
inputText = inputText + "\n\n<!-- voice-context -->\n" + voiceContext + "\n<!-- /voice-context -->"
}
// Expand pasted text inline before submitting
const allExtmarks = input.extmarks.getAllForTypeId(promptPartTypeId)
const sortedExtmarks = allExtmarks.sort((a: { start: number }, b: { start: number }) => b.start - a.start)
@ -732,6 +761,86 @@ export function Prompt(props: PromptProps) {
}
const exit = useExit()
// ── Voice dictation ──────────────────────────────────────────────
let voice: VoiceInput | undefined
const voiceMetaStore = new VoiceMetadataStore()
// Lazily construct the shared VoiceInput and wire its callbacks into
// component state; subsequent calls return the same instance.
function getVoice(): VoiceInput {
  if (!voice) {
    voice = new VoiceInput()
    voice.onTranscript = (seg) => {
      // Ingest metadata regardless of final/partial
      // (the store itself skips non-final segments).
      voiceMetaStore.ingest(seg)
      setStore("voiceMeta", voiceMetaStore.shortSummary)
      if (seg.is_final && seg.text.trim()) {
        if (!input || input.isDestroyed) return
        // Final segment: commit the text into the prompt input and clear
        // any interim preview.
        input.insertText(seg.text.trim() + " ")
        setStore("prompt", "input", input.plainText)
        setStore("interimText", "")
        // Defer layout invalidation to the next tick so the inserted text
        // is measured after the current event finishes.
        setTimeout(() => {
          if (!input || input.isDestroyed) return
          input.getLayoutNode().markDirty()
          renderer.requestRender()
        }, 0)
      } else if (!seg.is_final) {
        // Partial segment: show as interim (not-yet-committed) text.
        setStore("interimText", seg.text)
      }
    }
    voice.onError = (err) => {
      toast.show({
        message: `Voice: ${err.message}`,
        variant: "error",
        duration: 3000,
      })
      // Reset all voice-related UI state on error.
      setStore("recording", false)
      setStore("interimText", "")
      setStore("voiceMeta", "")
    }
    voice.onStateChange = (active) => {
      setStore("recording", active)
      if (active) {
        voiceMetaStore.startSession()
      } else {
        // Recording ended: close the metadata session and drop any
        // interim text that was never finalized.
        voiceMetaStore.endSession()
        setStore("interimText", "")
      }
    }
  }
  return voice
}
// Start or stop voice dictation. No-op while the prompt is disabled or in
// shell mode. Failures surface as a warning toast, with a sox-install hint
// when the recorder binary is missing.
async function toggleVoice() {
  if (props.disabled) return
  if (store.mode === "shell") return
  try {
    await getVoice().toggle()
  } catch (err) {
    const msg = err instanceof Error ? err.message : "Voice input failed"
    toast.show({
      message: msg.includes("not found") || msg.includes("ENOENT") ? "Voice requires sox: brew install sox" : `Voice: ${msg}`,
      variant: "warning",
      duration: 5000,
    })
    // Defensive: make sure the indicator never stays stuck on after a failure.
    setStore("recording", false)
  }
}
onCleanup(() => {
voice?.stop()
})
createEffect(
on(
() => props.sessionID,
() => {
voice?.stop()
},
{ defer: true },
),
)
// ── End voice ────────────────────────────────────────────────────
function pasteText(text: string, virtualText: string) {
const currentOffset = input.visualCursor.offset
const extmarkStart = currentOffset
@ -945,6 +1054,11 @@ export function Prompt(props: PromptProps) {
setStore("extmarkToPartIndex", new Map())
return
}
if (keybind.match("voice_dictate", e)) {
e.preventDefault()
toggleVoice()
return
}
if (keybind.match("app_exit", e)) {
if (store.prompt.input === "") {
await exit()
@ -1106,6 +1220,17 @@ export function Prompt(props: PromptProps) {
<span style={{ fg: theme.warning, bold: true }}>{local.model.variant.current()}</span>
</text>
</Show>
<Show when={store.recording}>
<text fg={theme.textMuted}>·</text>
<text fg={theme.error}>
</text>
<text fg={theme.textMuted} wrapMode="none">
{store.interimText
? `"${store.interimText.slice(0, 40)}${store.interimText.length > 40 ? "…" : ""}"`
: store.voiceMeta || "listening…"}
</text>
</Show>
</box>
</Show>
</box>
@ -1245,6 +1370,11 @@ export function Prompt(props: PromptProps) {
<text fg={theme.text}>
{keybind.print("command_list")} <span style={{ fg: theme.textMuted }}>commands</span>
</text>
<Show when={store.recording}>
<text fg={theme.error}>
{keybind.print("voice_dictate")} <span style={{ fg: theme.textMuted }}>stop</span>
</text>
</Show>
</Match>
<Match when={store.mode === "shell"}>
<text fg={theme.text}>

View File

@ -762,6 +762,7 @@ export namespace Config {
tips_toggle: z.string().optional().default("<leader>h").describe("Toggle tips on home screen"),
plugin_manager: z.string().optional().default("none").describe("Open plugin manager dialog"),
display_thinking: z.string().optional().default("none").describe("Toggle thinking blocks visibility"),
voice_dictate: z.string().optional().default("alt+v").describe("Toggle voice dictation"),
})
.strict()
.meta({

View File

@ -0,0 +1,146 @@
/** A single transcription result from the ASR stream, partial or final. */
export interface TranscriptSegment {
  text: string
  is_final: boolean
  /** Best-guess labels for the segment's vocal characteristics. */
  metadata?: {
    emotion?: string
    intent?: string
    gender?: string
    age?: string
  }
  /** Per-label probability distributions (sent when metadata_prob is enabled). */
  metadata_probs?: {
    emotion?: Array<{ token: string; probability: number }>
    intent?: Array<{ token: string; probability: number }>
  }
  /** Pace statistics for the segment. */
  speech_rate?: {
    words_per_minute: number
    filler_count: number
    filler_rate: number
    pause_count: number
  }
}
/**
 * Streaming ASR client over a WebSocket. Call connect(), push raw PCM via
 * sendPcm(), and receive TranscriptSegment callbacks through onTranscript.
 * end() flushes the stream and resolves once the server acknowledges (or
 * after a 5 s fallback timeout).
 */
export class AsrStreamClient {
  private ws: WebSocket | null = null
  private url: string
  private language: string
  private sampleRate: number
  // Resolver for the pending end() promise, if any.
  private endResolve: (() => void) | null = null
  // Fallback timer that force-resolves end() if the server never acks.
  // Cleared in resolveEnd() so an acknowledged end() doesn't leave a live
  // timer (which kept the event loop alive and called close() redundantly).
  private endTimer: ReturnType<typeof setTimeout> | null = null
  onTranscript: ((seg: TranscriptSegment) => void) | null = null
  onError: ((err: Error) => void) | null = null
  private token: string
  constructor(opts?: { url?: string; language?: string; sampleRate?: number; token?: string }) {
    const base = opts?.url ?? process.env.WHISSLE_ASR_URL ?? "wss://api.whissle.ai/asr/stream"
    this.token = opts?.token ?? process.env.WHISSLE_AUTH_TOKEN ?? ""
    // NOTE(review): the token travels in the query string — fine over wss,
    // but confirm the server does not log full request URLs.
    this.url = this.token ? `${base}?token=${encodeURIComponent(this.token)}` : base
    this.language = opts?.language ?? process.env.WHISSLE_ASR_LANGUAGE ?? "en"
    this.sampleRate = opts?.sampleRate ?? 16000
  }
  get connected(): boolean {
    return this.ws?.readyState === WebSocket.OPEN
  }
  /** Settle the pending end() promise (if any) and cancel its fallback timer. */
  private resolveEnd(): void {
    if (this.endTimer) {
      clearTimeout(this.endTimer)
      this.endTimer = null
    }
    this.endResolve?.()
    this.endResolve = null
  }
  connect(): Promise<void> {
    return new Promise((resolve, reject) => {
      if (this.connected) {
        resolve()
        return
      }
      const ws = new WebSocket(this.url)
      ws.binaryType = "arraybuffer"
      // Fail fast if the server never completes the handshake.
      const timeout = setTimeout(() => {
        reject(new Error("Voice server connection timed out"))
        try {
          ws.close()
        } catch {}
      }, 10_000)
      ws.addEventListener("open", () => {
        clearTimeout(timeout)
        this.ws = ws
        this.sendConfig()
        resolve()
      })
      ws.addEventListener("message", (ev: MessageEvent) => {
        // The server speaks JSON text frames; ignore anything else.
        if (typeof ev.data !== "string") return
        try {
          const msg = JSON.parse(ev.data)
          if (msg.type === "transcript") {
            this.onTranscript?.({
              text: msg.text ?? "",
              is_final: msg.is_final !== false,
              metadata: msg.metadata,
              metadata_probs: msg.metadata_probs,
              speech_rate: msg.speech_rate,
            })
          } else if (msg.type === "end") {
            // Server acknowledged end-of-stream: settle end() and tear down.
            this.resolveEnd()
            this.close()
          } else if (msg.type === "error") {
            this.onError?.(new Error(msg.message ?? "ASR error"))
          }
        } catch {}
      })
      ws.addEventListener("error", () => {
        clearTimeout(timeout)
        const err = new Error("Voice server connection failed")
        reject(err)
        this.onError?.(err)
      })
      ws.addEventListener("close", () => {
        clearTimeout(timeout)
        // A close while end() is pending settles it so callers don't hang.
        this.resolveEnd()
        this.ws = null
      })
    })
  }
  /** Send the stream configuration; called once right after the socket opens. */
  private sendConfig(): void {
    this.ws?.send(
      JSON.stringify({
        type: "config",
        language: this.language,
        use_lm: true,
        sample_rate: this.sampleRate,
        metadata_prob: true,
        word_timestamps: true,
      }),
    )
  }
  /** Forward a chunk of raw PCM audio; silently dropped when not connected. */
  sendPcm(pcm: Buffer): void {
    if (!this.connected || !this.ws) return
    // Slice out exactly the Buffer's view of its backing ArrayBuffer.
    const ab = pcm.buffer.slice(pcm.byteOffset, pcm.byteOffset + pcm.byteLength)
    this.ws.send(ab)
  }
  /**
   * Signal end-of-stream and wait for the server's acknowledgement.
   * Resolves immediately when not connected; otherwise resolves on the
   * server's {type:"end"} message, on socket close, or after 5 s.
   */
  end(): Promise<void> {
    if (!this.connected || !this.ws) return Promise.resolve()
    return new Promise((resolve) => {
      this.endResolve = resolve
      this.ws!.send(JSON.stringify({ type: "end" }))
      this.endTimer = setTimeout(() => {
        this.resolveEnd()
        this.close()
      }, 5_000)
    })
  }
  /** Close the socket (best effort) and drop the reference. */
  close(): void {
    try {
      this.ws?.close()
    } catch {}
    this.ws = null
  }
}

View File

@ -0,0 +1,73 @@
import { MicCapture } from "./mic"
import { AsrStreamClient } from "./asr-client"
export type { TranscriptSegment } from "./asr-client"
export type { VoiceMetadata } from "./metadata"
export { VoiceMetadataStore } from "./metadata"
/**
 * Ties microphone capture to the streaming ASR client. start() opens the
 * ASR socket first, then the mic, and forwards PCM chunks; stop() tears
 * both down and waits for the server to flush final transcripts.
 */
export class VoiceInput {
  private mic = new MicCapture()
  private asr: AsrStreamClient
  private _active = false
  /** Fired for every transcript segment (partial and final). */
  onTranscript: ((seg: {
    text: string
    is_final: boolean
    metadata?: { emotion?: string; intent?: string; gender?: string; age?: string }
    metadata_probs?: {
      emotion?: Array<{ token: string; probability: number }>
      intent?: Array<{ token: string; probability: number }>
    }
    speech_rate?: { words_per_minute: number; filler_count: number; filler_rate: number; pause_count: number }
  }) => void) | null = null
  onError: ((err: Error) => void) | null = null
  /** Fired with true when recording starts and false when it stops. */
  onStateChange: ((active: boolean) => void) | null = null
  constructor(opts?: { url?: string; language?: string; token?: string }) {
    this.asr = new AsrStreamClient(opts)
  }
  get active(): boolean {
    return this._active
  }
  /**
   * Start dictation: connect to the ASR server, then start the mic.
   * If the mic fails after the socket opened (e.g. sox is missing), the
   * socket is closed before rethrowing so the connection is not leaked.
   */
  async start(): Promise<void> {
    if (this._active) return
    this.asr.onTranscript = (seg) => this.onTranscript?.(seg)
    this.asr.onError = (err) => {
      this.onError?.(err)
      void this.stop()
    }
    await this.asr.connect()
    this.mic.onData = (pcm) => this.asr.sendPcm(pcm)
    this.mic.onError = (err) => {
      this.onError?.(err)
      void this.stop()
    }
    try {
      await this.mic.start()
    } catch (err) {
      // Mic startup failed after connect() succeeded — close the WebSocket
      // instead of leaking an idle connection, then surface the error.
      this.asr.close()
      throw err
    }
    this._active = true
    this.onStateChange?.(true)
  }
  /** Stop dictation: kill the mic, then flush/close the ASR stream. */
  async stop(): Promise<void> {
    if (!this._active) return
    this._active = false
    this.mic.stop()
    try {
      await this.asr.end()
    } catch {}
    this.onStateChange?.(false)
  }
  /** Start if idle, stop if active. */
  async toggle(): Promise<void> {
    if (this._active) {
      await this.stop()
    } else {
      await this.start()
    }
  }
}

View File

@ -0,0 +1,214 @@
import type { TranscriptSegment } from "./asr-client"
/**
 * Accumulated voice metadata for a session: emotion trends, intent patterns,
 * speech rate stats. Designed to be serialized to voice-metadata.md and
 * injected as context so the LLM can adapt to the user's vocal state.
 */
export interface VoiceMetadata {
  /** Running count of voice segments processed */
  segmentCount: number
  /** Emotion frequency: { happy: 5, frustrated: 2, ... } */
  emotionCounts: Record<string, number>
  /** Intent frequency: { question: 3, command: 7, ... } */
  intentCounts: Record<string, number>
  /** Current/latest emotion detected */
  currentEmotion: string
  /** Current/latest intent detected */
  currentIntent: string
  /** Average words per minute across segments */
  avgWpm: number
  /** Total filler count (um, uh, like) */
  totalFillers: number
  /** Total pause count */
  totalPauses: number
  /** Recent emotion sequence (last 10) for trend detection */
  recentEmotions: string[]
  /** Timestamps of recording sessions */
  sessions: Array<{ start: number; end?: number; segmentCount: number }>
}
/**
 * Accumulates voice metadata across a session and produces a markdown
 * summary suitable for injection into LLM context.
 */
export class VoiceMetadataStore {
  private data: VoiceMetadata = {
    segmentCount: 0,
    emotionCounts: {},
    intentCounts: {},
    currentEmotion: "",
    currentIntent: "",
    avgWpm: 0,
    totalFillers: 0,
    totalPauses: 0,
    recentEmotions: [],
    sessions: [],
  }
  // Raw WPM samples kept so avgWpm can be recomputed exactly per segment.
  private wpmSamples: number[] = []
  private sessionStart: number | null = null
  get current(): Readonly<VoiceMetadata> {
    return this.data
  }
  /** Call when recording starts */
  startSession(): void {
    // Defensive: if a previous session was never closed (e.g. a missed
    // stop event), close it now so segments aren't attributed to it and
    // it doesn't linger without an `end` timestamp.
    this.endSession()
    this.sessionStart = Date.now()
    this.data.sessions.push({ start: this.sessionStart, segmentCount: 0 })
  }
  /** Call when recording stops */
  endSession(): void {
    const last = this.data.sessions[this.data.sessions.length - 1]
    if (last && !last.end) {
      last.end = Date.now()
    }
    this.sessionStart = null
  }
  /** Ingest a transcript segment's metadata; partial segments are ignored. */
  ingest(seg: TranscriptSegment): void {
    if (!seg.is_final) return
    this.data.segmentCount++
    // Track the current session's segment count
    const last = this.data.sessions[this.data.sessions.length - 1]
    if (last && !last.end) last.segmentCount++
    // Emotion
    const emotion = seg.metadata?.emotion
    if (emotion) {
      this.data.emotionCounts[emotion] = (this.data.emotionCounts[emotion] ?? 0) + 1
      this.data.currentEmotion = emotion
      this.data.recentEmotions.push(emotion)
      // Bounded window: only the last 10 emotions feed trend detection.
      if (this.data.recentEmotions.length > 10) {
        this.data.recentEmotions.shift()
      }
    }
    // Intent
    const intent = seg.metadata?.intent
    if (intent) {
      this.data.intentCounts[intent] = (this.data.intentCounts[intent] ?? 0) + 1
      this.data.currentIntent = intent
    }
    // Speech rate
    if (seg.speech_rate) {
      if (seg.speech_rate.words_per_minute > 0) {
        this.wpmSamples.push(seg.speech_rate.words_per_minute)
        this.data.avgWpm = Math.round(
          this.wpmSamples.reduce((a, b) => a + b, 0) / this.wpmSamples.length,
        )
      }
      this.data.totalFillers += seg.speech_rate.filler_count
      this.data.totalPauses += seg.speech_rate.pause_count
    }
  }
  /** Dominant emotion (most frequent); empty string when none recorded. */
  get dominantEmotion(): string {
    let max = 0
    let best = ""
    for (const [emotion, count] of Object.entries(this.data.emotionCounts)) {
      if (count > max) {
        max = count
        best = emotion
      }
    }
    return best
  }
  /** Detect emotional trend from recent sequence; empty string when unclear. */
  get emotionTrend(): string {
    const recent = this.data.recentEmotions
    if (recent.length < 3) return ""
    const last3 = recent.slice(-3)
    if (last3.every((e) => e === last3[0])) return `consistently ${last3[0]}`
    // Check for shift: compare the modal emotion of each half of the window.
    const first = recent.slice(0, Math.floor(recent.length / 2))
    const second = recent.slice(Math.floor(recent.length / 2))
    const mode = (arr: string[]) => {
      const counts: Record<string, number> = {}
      for (const v of arr) counts[v] = (counts[v] ?? 0) + 1
      return Object.entries(counts).sort((a, b) => b[1] - a[1])[0]?.[0] ?? ""
    }
    const firstMode = mode(first)
    const secondMode = mode(second)
    if (firstMode && secondMode && firstMode !== secondMode) {
      return `shifting from ${firstMode} to ${secondMode}`
    }
    return ""
  }
  /** Short single-line summary for TUI display */
  get shortSummary(): string {
    const parts: string[] = []
    if (this.data.currentEmotion) parts.push(this.data.currentEmotion)
    if (this.data.currentIntent) parts.push(this.data.currentIntent)
    if (this.data.avgWpm > 0) parts.push(`${this.data.avgWpm}wpm`)
    return parts.join(" · ")
  }
  /**
   * Generate a markdown summary for LLM context injection.
   * Compact enough to not waste tokens, rich enough to be useful.
   * Returns an empty string when no segments have been ingested.
   */
  toMarkdown(): string {
    const d = this.data
    if (d.segmentCount === 0) return ""
    const lines: string[] = ["## Voice Session Context"]
    lines.push("")
    // Current state
    const current: string[] = []
    if (d.currentEmotion) current.push(`**Emotion:** ${d.currentEmotion}`)
    if (d.currentIntent) current.push(`**Intent:** ${d.currentIntent}`)
    if (current.length) {
      lines.push(`Current: ${current.join(", ")}`)
    }
    // Trend
    const trend = this.emotionTrend
    if (trend) lines.push(`Trend: ${trend}`)
    // Emotion distribution (top 3)
    const sortedEmotions = Object.entries(d.emotionCounts)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 3)
    if (sortedEmotions.length) {
      const total = Object.values(d.emotionCounts).reduce((a, b) => a + b, 0)
      const dist = sortedEmotions
        .map(([e, c]) => `${e} ${Math.round((c / total) * 100)}%`)
        .join(", ")
      lines.push(`Emotions: ${dist}`)
    }
    // Intent distribution (top 3)
    const sortedIntents = Object.entries(d.intentCounts)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 3)
    if (sortedIntents.length) {
      const total = Object.values(d.intentCounts).reduce((a, b) => a + b, 0)
      const dist = sortedIntents
        .map(([i, c]) => `${i} ${Math.round((c / total) * 100)}%`)
        .join(", ")
      lines.push(`Intents: ${dist}`)
    }
    // Speech stats
    const stats: string[] = []
    if (d.avgWpm > 0) stats.push(`${d.avgWpm} wpm`)
    if (d.totalFillers > 0) stats.push(`${d.totalFillers} fillers`)
    if (d.totalPauses > 0) stats.push(`${d.totalPauses} pauses`)
    if (stats.length) lines.push(`Speech: ${stats.join(", ")}`)
    lines.push(`Segments: ${d.segmentCount}`)
    return lines.join("\n")
  }
}

View File

@ -0,0 +1,78 @@
import { spawn, type ChildProcess } from "child_process"
import which from "which"
/**
 * Microphone capture via sox: spawns `rec`/`sox` and streams 16 kHz mono
 * 16-bit signed PCM chunks to onData. Requires sox to be installed.
 */
export class MicCapture {
  private proc: ChildProcess | null = null
  onData: ((pcm: Buffer) => void) | null = null
  onError: ((err: Error) => void) | null = null
  get recording(): boolean {
    return this.proc !== null && !this.proc.killed
  }
  /**
   * Spawn the recorder process. Throws when sox is not installed.
   * No-op if a process is already running.
   */
  async start(): Promise<void> {
    if (this.proc) return
    const cmd = await MicCapture.findCommand()
    if (!cmd) {
      throw new Error("sox not found. Install it: brew install sox (macOS) / apt install sox (Linux)")
    }
    // Capture at device native rate, resample to 16kHz via the "rate" effect.
    // Don't pass -r 16000 — on macOS CoreAudio it tries to set the device to
    // 16kHz, fails, falls back to 48kHz, and outputs 48kHz raw PCM.
    // The "rate 16000" effect after "-" does the actual resampling.
    const args =
      cmd === "rec"
        ? ["-q", "-t", "raw", "-b", "16", "-c", "1", "-e", "signed-integer", "-", "rate", "16000"]
        : ["-d", "-q", "-t", "raw", "-b", "16", "-c", "1", "-e", "signed-integer", "-", "rate", "16000"]
    const proc = spawn(cmd, args, {
      stdio: ["ignore", "pipe", "pipe"],
    })
    this.proc = proc
    proc.stdout!.on("data", (chunk: Buffer) => {
      this.onData?.(chunk)
    })
    proc.stderr!.on("data", (data: Buffer) => {
      const msg = data.toString().trim()
      // sox emits WARN about sample rate when resampling — expected, not an error
      if (msg && !msg.includes("WARN") && !msg.includes("can't set sample rate")) {
        this.onError?.(new Error(`mic: ${msg}`))
      }
    })
    proc.on("error", (err) => {
      this.onError?.(err)
      // Only clear the reference if this is still the current process —
      // a late event from a killed process must not clobber a newer one
      // started after stop().
      if (this.proc === proc) this.proc = null
    })
    proc.on("exit", () => {
      if (this.proc === proc) this.proc = null
    })
  }
  /** Kill the recorder (best effort) and drop the reference immediately. */
  stop(): void {
    if (!this.proc) return
    try {
      this.proc.kill("SIGTERM")
    } catch {}
    this.proc = null
  }
  /** Locate a usable recorder binary: `rec` preferred, then `sox`. */
  private static async findCommand(): Promise<string | null> {
    for (const cmd of ["rec", "sox"]) {
      try {
        await which(cmd)
        return cmd
      } catch {}
    }
    return null
  }
  /** True when a recorder binary is installed on this machine. */
  static async available(): Promise<boolean> {
    return (await MicCapture.findCommand()) !== null
  }
}