pull/21459/merge
Karan Singla 2026-04-07 22:50:52 -07:00 committed by GitHub
commit 970fecd8fd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 642 additions and 0 deletions

View File

@ -36,6 +36,7 @@ import { useToast } from "../../ui/toast"
import { useKV } from "../../context/kv"
import { useTextareaKeybindings } from "../textarea-keybindings"
import { DialogSkill } from "../dialog-skill"
import { VoiceInput, VoiceMetadataStore } from "@/voice"
export type PromptProps = {
sessionID?: string
@ -165,6 +166,9 @@ export function Prompt(props: PromptProps) {
extmarkToPartIndex: Map<number, number>
interrupt: number
placeholder: number
recording: boolean
interimText: string
voiceMeta: string
}>({
placeholder: randomIndex(list().length),
prompt: {
@ -174,6 +178,9 @@ export function Prompt(props: PromptProps) {
mode: "normal",
extmarkToPartIndex: new Map(),
interrupt: 0,
recording: false,
interimText: "",
voiceMeta: "",
})
createEffect(
@ -261,6 +268,11 @@ export function Prompt(props: PromptProps) {
onSelect: (dialog) => {
if (autocomplete.visible) return
if (!input.focused) return
if (store.recording) {
voice?.stop()
dialog.clear()
return
}
// TODO: this should be its own command
if (store.mode === "shell") {
setStore("mode", "normal")
@ -370,6 +382,16 @@ export function Prompt(props: PromptProps) {
input.cursorOffset = Bun.stringWidth(content)
},
},
{
title: "Voice dictate",
value: "voice.dictate",
keybind: "voice_dictate",
category: "Prompt",
onSelect: async (dialog) => {
dialog.clear()
await toggleVoice()
},
},
{
title: "Skills",
value: "prompt.skills",
@ -587,6 +609,7 @@ export function Prompt(props: PromptProps) {
])
async function submit() {
if (store.recording) await voice?.stop()
if (props.disabled) return
if (autocomplete?.visible) return
if (!store.prompt.input) return
@ -624,6 +647,12 @@ export function Prompt(props: PromptProps) {
const messageID = MessageID.ascending()
let inputText = store.prompt.input
// Append voice metadata context if voice was used during this prompt
const voiceContext = voiceMetaStore.toMarkdown()
if (voiceContext) {
inputText = inputText + "\n\n<!-- voice-context -->\n" + voiceContext + "\n<!-- /voice-context -->"
}
// Expand pasted text inline before submitting
const allExtmarks = input.extmarks.getAllForTypeId(promptPartTypeId)
const sortedExtmarks = allExtmarks.sort((a: { start: number }, b: { start: number }) => b.start - a.start)
@ -732,6 +761,86 @@ export function Prompt(props: PromptProps) {
}
const exit = useExit()
// ── Voice dictation ──────────────────────────────────────────────
let voice: VoiceInput | undefined
const voiceMetaStore = new VoiceMetadataStore()
// Lazily construct the shared VoiceInput and wire its callbacks into
// component state; subsequent calls return the same instance.
function getVoice(): VoiceInput {
  if (!voice) {
    voice = new VoiceInput()
    voice.onTranscript = (seg) => {
      // Ingest metadata regardless of final/partial
      // (the store itself skips non-final segments).
      voiceMetaStore.ingest(seg)
      setStore("voiceMeta", voiceMetaStore.shortSummary)
      if (seg.is_final && seg.text.trim()) {
        if (!input || input.isDestroyed) return
        // Final segment: commit the text into the prompt input and clear
        // any interim preview.
        input.insertText(seg.text.trim() + " ")
        setStore("prompt", "input", input.plainText)
        setStore("interimText", "")
        // Defer layout invalidation to the next tick so the inserted text
        // is measured after the current event finishes.
        setTimeout(() => {
          if (!input || input.isDestroyed) return
          input.getLayoutNode().markDirty()
          renderer.requestRender()
        }, 0)
      } else if (!seg.is_final) {
        // Partial segment: show as interim (not-yet-committed) text.
        setStore("interimText", seg.text)
      }
    }
    voice.onError = (err) => {
      toast.show({
        message: `Voice: ${err.message}`,
        variant: "error",
        duration: 3000,
      })
      // Reset all voice-related UI state on error.
      setStore("recording", false)
      setStore("interimText", "")
      setStore("voiceMeta", "")
    }
    voice.onStateChange = (active) => {
      setStore("recording", active)
      if (active) {
        voiceMetaStore.startSession()
      } else {
        // Recording ended: close the metadata session and drop any
        // interim text that was never finalized.
        voiceMetaStore.endSession()
        setStore("interimText", "")
      }
    }
  }
  return voice
}
// Start or stop voice dictation. No-op while the prompt is disabled or in
// shell mode. Failures surface as a warning toast, with a sox-install hint
// when the recorder binary is missing.
async function toggleVoice() {
  if (props.disabled) return
  if (store.mode === "shell") return
  try {
    await getVoice().toggle()
  } catch (err) {
    const msg = err instanceof Error ? err.message : "Voice input failed"
    toast.show({
      message: msg.includes("not found") || msg.includes("ENOENT") ? "Voice requires sox: brew install sox" : `Voice: ${msg}`,
      variant: "warning",
      duration: 5000,
    })
    // Defensive: make sure the indicator never stays stuck on after a failure.
    setStore("recording", false)
  }
}
onCleanup(() => {
voice?.stop()
})
createEffect(
on(
() => props.sessionID,
() => {
voice?.stop()
},
{ defer: true },
),
)
// ── End voice ────────────────────────────────────────────────────
function pasteText(text: string, virtualText: string) {
const currentOffset = input.visualCursor.offset
const extmarkStart = currentOffset
@ -945,6 +1054,11 @@ export function Prompt(props: PromptProps) {
setStore("extmarkToPartIndex", new Map())
return
}
if (keybind.match("voice_dictate", e)) {
e.preventDefault()
toggleVoice()
return
}
if (keybind.match("app_exit", e)) {
if (store.prompt.input === "") {
await exit()
@ -1106,6 +1220,17 @@ export function Prompt(props: PromptProps) {
<span style={{ fg: theme.warning, bold: true }}>{local.model.variant.current()}</span>
</text>
</Show>
<Show when={store.recording}>
<text fg={theme.textMuted}>·</text>
<text fg={theme.error}>
</text>
<text fg={theme.textMuted} wrapMode="none">
{store.interimText
? `"${store.interimText.slice(0, 40)}${store.interimText.length > 40 ? "…" : ""}"`
: store.voiceMeta || "listening…"}
</text>
</Show>
</box>
</Show>
</box>
@ -1245,6 +1370,11 @@ export function Prompt(props: PromptProps) {
<text fg={theme.text}>
{keybind.print("command_list")} <span style={{ fg: theme.textMuted }}>commands</span>
</text>
<Show when={store.recording}>
<text fg={theme.error}>
{keybind.print("voice_dictate")} <span style={{ fg: theme.textMuted }}>stop</span>
</text>
</Show>
</Match>
<Match when={store.mode === "shell"}>
<text fg={theme.text}>

View File

@ -762,6 +762,7 @@ export namespace Config {
tips_toggle: z.string().optional().default("<leader>h").describe("Toggle tips on home screen"),
plugin_manager: z.string().optional().default("none").describe("Open plugin manager dialog"),
display_thinking: z.string().optional().default("none").describe("Toggle thinking blocks visibility"),
voice_dictate: z.string().optional().default("alt+v").describe("Toggle voice dictation"),
})
.strict()
.meta({

View File

@ -0,0 +1,146 @@
/** A single transcription result from the ASR stream, partial or final. */
export interface TranscriptSegment {
  text: string
  is_final: boolean
  /** Best-guess labels for the segment's vocal characteristics. */
  metadata?: {
    emotion?: string
    intent?: string
    gender?: string
    age?: string
  }
  /** Per-label probability distributions (sent when metadata_prob is enabled). */
  metadata_probs?: {
    emotion?: Array<{ token: string; probability: number }>
    intent?: Array<{ token: string; probability: number }>
  }
  /** Pace statistics for the segment. */
  speech_rate?: {
    words_per_minute: number
    filler_count: number
    filler_rate: number
    pause_count: number
  }
}
/**
 * Streaming ASR client over a WebSocket. Call connect(), push raw PCM via
 * sendPcm(), and receive TranscriptSegment callbacks through onTranscript.
 * end() flushes the stream and resolves once the server acknowledges (or
 * after a 5 s fallback timeout).
 */
export class AsrStreamClient {
  private ws: WebSocket | null = null
  private url: string
  private language: string
  private sampleRate: number
  // Resolver for the pending end() promise, if any.
  private endResolve: (() => void) | null = null
  // Fallback timer that force-resolves end() if the server never acks.
  // Cleared in resolveEnd() so an acknowledged end() doesn't leave a live
  // timer (which kept the event loop alive and called close() redundantly).
  private endTimer: ReturnType<typeof setTimeout> | null = null
  onTranscript: ((seg: TranscriptSegment) => void) | null = null
  onError: ((err: Error) => void) | null = null
  private token: string
  constructor(opts?: { url?: string; language?: string; sampleRate?: number; token?: string }) {
    const base = opts?.url ?? process.env.WHISSLE_ASR_URL ?? "wss://api.whissle.ai/asr/stream"
    this.token = opts?.token ?? process.env.WHISSLE_AUTH_TOKEN ?? ""
    // NOTE(review): the token travels in the query string — fine over wss,
    // but confirm the server does not log full request URLs.
    this.url = this.token ? `${base}?token=${encodeURIComponent(this.token)}` : base
    this.language = opts?.language ?? process.env.WHISSLE_ASR_LANGUAGE ?? "en"
    this.sampleRate = opts?.sampleRate ?? 16000
  }
  get connected(): boolean {
    return this.ws?.readyState === WebSocket.OPEN
  }
  /** Settle the pending end() promise (if any) and cancel its fallback timer. */
  private resolveEnd(): void {
    if (this.endTimer) {
      clearTimeout(this.endTimer)
      this.endTimer = null
    }
    this.endResolve?.()
    this.endResolve = null
  }
  connect(): Promise<void> {
    return new Promise((resolve, reject) => {
      if (this.connected) {
        resolve()
        return
      }
      const ws = new WebSocket(this.url)
      ws.binaryType = "arraybuffer"
      // Fail fast if the server never completes the handshake.
      const timeout = setTimeout(() => {
        reject(new Error("Voice server connection timed out"))
        try {
          ws.close()
        } catch {}
      }, 10_000)
      ws.addEventListener("open", () => {
        clearTimeout(timeout)
        this.ws = ws
        this.sendConfig()
        resolve()
      })
      ws.addEventListener("message", (ev: MessageEvent) => {
        // The server speaks JSON text frames; ignore anything else.
        if (typeof ev.data !== "string") return
        try {
          const msg = JSON.parse(ev.data)
          if (msg.type === "transcript") {
            this.onTranscript?.({
              text: msg.text ?? "",
              is_final: msg.is_final !== false,
              metadata: msg.metadata,
              metadata_probs: msg.metadata_probs,
              speech_rate: msg.speech_rate,
            })
          } else if (msg.type === "end") {
            // Server acknowledged end-of-stream: settle end() and tear down.
            this.resolveEnd()
            this.close()
          } else if (msg.type === "error") {
            this.onError?.(new Error(msg.message ?? "ASR error"))
          }
        } catch {}
      })
      ws.addEventListener("error", () => {
        clearTimeout(timeout)
        const err = new Error("Voice server connection failed")
        reject(err)
        this.onError?.(err)
      })
      ws.addEventListener("close", () => {
        clearTimeout(timeout)
        // A close while end() is pending settles it so callers don't hang.
        this.resolveEnd()
        this.ws = null
      })
    })
  }
  /** Send the stream configuration; called once right after the socket opens. */
  private sendConfig(): void {
    this.ws?.send(
      JSON.stringify({
        type: "config",
        language: this.language,
        use_lm: true,
        sample_rate: this.sampleRate,
        metadata_prob: true,
        word_timestamps: true,
      }),
    )
  }
  /** Forward a chunk of raw PCM audio; silently dropped when not connected. */
  sendPcm(pcm: Buffer): void {
    if (!this.connected || !this.ws) return
    // Slice out exactly the Buffer's view of its backing ArrayBuffer.
    const ab = pcm.buffer.slice(pcm.byteOffset, pcm.byteOffset + pcm.byteLength)
    this.ws.send(ab)
  }
  /**
   * Signal end-of-stream and wait for the server's acknowledgement.
   * Resolves immediately when not connected; otherwise resolves on the
   * server's {type:"end"} message, on socket close, or after 5 s.
   */
  end(): Promise<void> {
    if (!this.connected || !this.ws) return Promise.resolve()
    return new Promise((resolve) => {
      this.endResolve = resolve
      this.ws!.send(JSON.stringify({ type: "end" }))
      this.endTimer = setTimeout(() => {
        this.resolveEnd()
        this.close()
      }, 5_000)
    })
  }
  /** Close the socket (best effort) and drop the reference. */
  close(): void {
    try {
      this.ws?.close()
    } catch {}
    this.ws = null
  }
}

View File

@ -0,0 +1,73 @@
import { MicCapture } from "./mic"
import { AsrStreamClient } from "./asr-client"
export type { TranscriptSegment } from "./asr-client"
export type { VoiceMetadata } from "./metadata"
export { VoiceMetadataStore } from "./metadata"
/**
 * Ties microphone capture to the streaming ASR client. start() opens the
 * ASR socket first, then the mic, and forwards PCM chunks; stop() tears
 * both down and waits for the server to flush final transcripts.
 */
export class VoiceInput {
  private mic = new MicCapture()
  private asr: AsrStreamClient
  private _active = false
  /** Fired for every transcript segment (partial and final). */
  onTranscript: ((seg: {
    text: string
    is_final: boolean
    metadata?: { emotion?: string; intent?: string; gender?: string; age?: string }
    metadata_probs?: {
      emotion?: Array<{ token: string; probability: number }>
      intent?: Array<{ token: string; probability: number }>
    }
    speech_rate?: { words_per_minute: number; filler_count: number; filler_rate: number; pause_count: number }
  }) => void) | null = null
  onError: ((err: Error) => void) | null = null
  /** Fired with true when recording starts and false when it stops. */
  onStateChange: ((active: boolean) => void) | null = null
  constructor(opts?: { url?: string; language?: string; token?: string }) {
    this.asr = new AsrStreamClient(opts)
  }
  get active(): boolean {
    return this._active
  }
  /**
   * Start dictation: connect to the ASR server, then start the mic.
   * If the mic fails after the socket opened (e.g. sox is missing), the
   * socket is closed before rethrowing so the connection is not leaked.
   */
  async start(): Promise<void> {
    if (this._active) return
    this.asr.onTranscript = (seg) => this.onTranscript?.(seg)
    this.asr.onError = (err) => {
      this.onError?.(err)
      void this.stop()
    }
    await this.asr.connect()
    this.mic.onData = (pcm) => this.asr.sendPcm(pcm)
    this.mic.onError = (err) => {
      this.onError?.(err)
      void this.stop()
    }
    try {
      await this.mic.start()
    } catch (err) {
      // Mic startup failed after connect() succeeded — close the WebSocket
      // instead of leaking an idle connection, then surface the error.
      this.asr.close()
      throw err
    }
    this._active = true
    this.onStateChange?.(true)
  }
  /** Stop dictation: kill the mic, then flush/close the ASR stream. */
  async stop(): Promise<void> {
    if (!this._active) return
    this._active = false
    this.mic.stop()
    try {
      await this.asr.end()
    } catch {}
    this.onStateChange?.(false)
  }
  /** Start if idle, stop if active. */
  async toggle(): Promise<void> {
    if (this._active) {
      await this.stop()
    } else {
      await this.start()
    }
  }
}

View File

@ -0,0 +1,214 @@
import type { TranscriptSegment } from "./asr-client"
/**
 * Accumulated voice metadata for a session: emotion trends, intent patterns,
 * speech rate stats. Designed to be serialized to voice-metadata.md and
 * injected as context so the LLM can adapt to the user's vocal state.
 */
export interface VoiceMetadata {
  /** Running count of voice segments processed */
  segmentCount: number
  /** Emotion frequency: { happy: 5, frustrated: 2, ... } */
  emotionCounts: Record<string, number>
  /** Intent frequency: { question: 3, command: 7, ... } */
  intentCounts: Record<string, number>
  /** Current/latest emotion detected */
  currentEmotion: string
  /** Current/latest intent detected */
  currentIntent: string
  /** Average words per minute across segments */
  avgWpm: number
  /** Total filler count (um, uh, like) */
  totalFillers: number
  /** Total pause count */
  totalPauses: number
  /** Recent emotion sequence (last 10) for trend detection */
  recentEmotions: string[]
  /** Timestamps of recording sessions */
  sessions: Array<{ start: number; end?: number; segmentCount: number }>
}
/**
 * Accumulates voice metadata across a session and produces a markdown
 * summary suitable for injection into LLM context.
 */
export class VoiceMetadataStore {
  private data: VoiceMetadata = {
    segmentCount: 0,
    emotionCounts: {},
    intentCounts: {},
    currentEmotion: "",
    currentIntent: "",
    avgWpm: 0,
    totalFillers: 0,
    totalPauses: 0,
    recentEmotions: [],
    sessions: [],
  }
  // Raw WPM samples kept so avgWpm can be recomputed exactly per segment.
  private wpmSamples: number[] = []
  private sessionStart: number | null = null
  get current(): Readonly<VoiceMetadata> {
    return this.data
  }
  /** Call when recording starts */
  startSession(): void {
    // Defensive: if a previous session was never closed (e.g. a missed
    // stop event), close it now so segments aren't attributed to it and
    // it doesn't linger without an `end` timestamp.
    this.endSession()
    this.sessionStart = Date.now()
    this.data.sessions.push({ start: this.sessionStart, segmentCount: 0 })
  }
  /** Call when recording stops */
  endSession(): void {
    const last = this.data.sessions[this.data.sessions.length - 1]
    if (last && !last.end) {
      last.end = Date.now()
    }
    this.sessionStart = null
  }
  /** Ingest a transcript segment's metadata; partial segments are ignored. */
  ingest(seg: TranscriptSegment): void {
    if (!seg.is_final) return
    this.data.segmentCount++
    // Track the current session's segment count
    const last = this.data.sessions[this.data.sessions.length - 1]
    if (last && !last.end) last.segmentCount++
    // Emotion
    const emotion = seg.metadata?.emotion
    if (emotion) {
      this.data.emotionCounts[emotion] = (this.data.emotionCounts[emotion] ?? 0) + 1
      this.data.currentEmotion = emotion
      this.data.recentEmotions.push(emotion)
      // Bounded window: only the last 10 emotions feed trend detection.
      if (this.data.recentEmotions.length > 10) {
        this.data.recentEmotions.shift()
      }
    }
    // Intent
    const intent = seg.metadata?.intent
    if (intent) {
      this.data.intentCounts[intent] = (this.data.intentCounts[intent] ?? 0) + 1
      this.data.currentIntent = intent
    }
    // Speech rate
    if (seg.speech_rate) {
      if (seg.speech_rate.words_per_minute > 0) {
        this.wpmSamples.push(seg.speech_rate.words_per_minute)
        this.data.avgWpm = Math.round(
          this.wpmSamples.reduce((a, b) => a + b, 0) / this.wpmSamples.length,
        )
      }
      this.data.totalFillers += seg.speech_rate.filler_count
      this.data.totalPauses += seg.speech_rate.pause_count
    }
  }
  /** Dominant emotion (most frequent); empty string when none recorded. */
  get dominantEmotion(): string {
    let max = 0
    let best = ""
    for (const [emotion, count] of Object.entries(this.data.emotionCounts)) {
      if (count > max) {
        max = count
        best = emotion
      }
    }
    return best
  }
  /** Detect emotional trend from recent sequence; empty string when unclear. */
  get emotionTrend(): string {
    const recent = this.data.recentEmotions
    if (recent.length < 3) return ""
    const last3 = recent.slice(-3)
    if (last3.every((e) => e === last3[0])) return `consistently ${last3[0]}`
    // Check for shift: compare the modal emotion of each half of the window.
    const first = recent.slice(0, Math.floor(recent.length / 2))
    const second = recent.slice(Math.floor(recent.length / 2))
    const mode = (arr: string[]) => {
      const counts: Record<string, number> = {}
      for (const v of arr) counts[v] = (counts[v] ?? 0) + 1
      return Object.entries(counts).sort((a, b) => b[1] - a[1])[0]?.[0] ?? ""
    }
    const firstMode = mode(first)
    const secondMode = mode(second)
    if (firstMode && secondMode && firstMode !== secondMode) {
      return `shifting from ${firstMode} to ${secondMode}`
    }
    return ""
  }
  /** Short single-line summary for TUI display */
  get shortSummary(): string {
    const parts: string[] = []
    if (this.data.currentEmotion) parts.push(this.data.currentEmotion)
    if (this.data.currentIntent) parts.push(this.data.currentIntent)
    if (this.data.avgWpm > 0) parts.push(`${this.data.avgWpm}wpm`)
    return parts.join(" · ")
  }
  /**
   * Generate a markdown summary for LLM context injection.
   * Compact enough to not waste tokens, rich enough to be useful.
   * Returns an empty string when no segments have been ingested.
   */
  toMarkdown(): string {
    const d = this.data
    if (d.segmentCount === 0) return ""
    const lines: string[] = ["## Voice Session Context"]
    lines.push("")
    // Current state
    const current: string[] = []
    if (d.currentEmotion) current.push(`**Emotion:** ${d.currentEmotion}`)
    if (d.currentIntent) current.push(`**Intent:** ${d.currentIntent}`)
    if (current.length) {
      lines.push(`Current: ${current.join(", ")}`)
    }
    // Trend
    const trend = this.emotionTrend
    if (trend) lines.push(`Trend: ${trend}`)
    // Emotion distribution (top 3)
    const sortedEmotions = Object.entries(d.emotionCounts)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 3)
    if (sortedEmotions.length) {
      const total = Object.values(d.emotionCounts).reduce((a, b) => a + b, 0)
      const dist = sortedEmotions
        .map(([e, c]) => `${e} ${Math.round((c / total) * 100)}%`)
        .join(", ")
      lines.push(`Emotions: ${dist}`)
    }
    // Intent distribution (top 3)
    const sortedIntents = Object.entries(d.intentCounts)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 3)
    if (sortedIntents.length) {
      const total = Object.values(d.intentCounts).reduce((a, b) => a + b, 0)
      const dist = sortedIntents
        .map(([i, c]) => `${i} ${Math.round((c / total) * 100)}%`)
        .join(", ")
      lines.push(`Intents: ${dist}`)
    }
    // Speech stats
    const stats: string[] = []
    if (d.avgWpm > 0) stats.push(`${d.avgWpm} wpm`)
    if (d.totalFillers > 0) stats.push(`${d.totalFillers} fillers`)
    if (d.totalPauses > 0) stats.push(`${d.totalPauses} pauses`)
    if (stats.length) lines.push(`Speech: ${stats.join(", ")}`)
    lines.push(`Segments: ${d.segmentCount}`)
    return lines.join("\n")
  }
}

View File

@ -0,0 +1,78 @@
import { spawn, type ChildProcess } from "child_process"
import which from "which"
/**
 * Microphone capture via sox: spawns `rec`/`sox` and streams 16 kHz mono
 * 16-bit signed PCM chunks to onData. Requires sox to be installed.
 */
export class MicCapture {
  private proc: ChildProcess | null = null
  onData: ((pcm: Buffer) => void) | null = null
  onError: ((err: Error) => void) | null = null
  get recording(): boolean {
    return this.proc !== null && !this.proc.killed
  }
  /**
   * Spawn the recorder process. Throws when sox is not installed.
   * No-op if a process is already running.
   */
  async start(): Promise<void> {
    if (this.proc) return
    const cmd = await MicCapture.findCommand()
    if (!cmd) {
      throw new Error("sox not found. Install it: brew install sox (macOS) / apt install sox (Linux)")
    }
    // Capture at device native rate, resample to 16kHz via the "rate" effect.
    // Don't pass -r 16000 — on macOS CoreAudio it tries to set the device to
    // 16kHz, fails, falls back to 48kHz, and outputs 48kHz raw PCM.
    // The "rate 16000" effect after "-" does the actual resampling.
    const args =
      cmd === "rec"
        ? ["-q", "-t", "raw", "-b", "16", "-c", "1", "-e", "signed-integer", "-", "rate", "16000"]
        : ["-d", "-q", "-t", "raw", "-b", "16", "-c", "1", "-e", "signed-integer", "-", "rate", "16000"]
    const proc = spawn(cmd, args, {
      stdio: ["ignore", "pipe", "pipe"],
    })
    this.proc = proc
    proc.stdout!.on("data", (chunk: Buffer) => {
      this.onData?.(chunk)
    })
    proc.stderr!.on("data", (data: Buffer) => {
      const msg = data.toString().trim()
      // sox emits WARN about sample rate when resampling — expected, not an error
      if (msg && !msg.includes("WARN") && !msg.includes("can't set sample rate")) {
        this.onError?.(new Error(`mic: ${msg}`))
      }
    })
    proc.on("error", (err) => {
      this.onError?.(err)
      // Only clear the reference if this is still the current process —
      // a late event from a killed process must not clobber a newer one
      // started after stop().
      if (this.proc === proc) this.proc = null
    })
    proc.on("exit", () => {
      if (this.proc === proc) this.proc = null
    })
  }
  /** Kill the recorder (best effort) and drop the reference immediately. */
  stop(): void {
    if (!this.proc) return
    try {
      this.proc.kill("SIGTERM")
    } catch {}
    this.proc = null
  }
  /** Locate a usable recorder binary: `rec` preferred, then `sox`. */
  private static async findCommand(): Promise<string | null> {
    for (const cmd of ["rec", "sox"]) {
      try {
        await which(cmd)
        return cmd
      } catch {}
    }
    return null
  }
  /** True when a recorder binary is installed on this machine. */
  static async available(): Promise<boolean> {
    return (await MicCapture.findCommand()) !== null
  }
}