Merge 04ce2751a6 into ae614d919f
commit
970fecd8fd
|
|
@ -36,6 +36,7 @@ import { useToast } from "../../ui/toast"
|
|||
import { useKV } from "../../context/kv"
|
||||
import { useTextareaKeybindings } from "../textarea-keybindings"
|
||||
import { DialogSkill } from "../dialog-skill"
|
||||
import { VoiceInput, VoiceMetadataStore } from "@/voice"
|
||||
|
||||
export type PromptProps = {
|
||||
sessionID?: string
|
||||
|
|
@ -165,6 +166,9 @@ export function Prompt(props: PromptProps) {
|
|||
extmarkToPartIndex: Map<number, number>
|
||||
interrupt: number
|
||||
placeholder: number
|
||||
recording: boolean
|
||||
interimText: string
|
||||
voiceMeta: string
|
||||
}>({
|
||||
placeholder: randomIndex(list().length),
|
||||
prompt: {
|
||||
|
|
@ -174,6 +178,9 @@ export function Prompt(props: PromptProps) {
|
|||
mode: "normal",
|
||||
extmarkToPartIndex: new Map(),
|
||||
interrupt: 0,
|
||||
recording: false,
|
||||
interimText: "",
|
||||
voiceMeta: "",
|
||||
})
|
||||
|
||||
createEffect(
|
||||
|
|
@ -261,6 +268,11 @@ export function Prompt(props: PromptProps) {
|
|||
onSelect: (dialog) => {
|
||||
if (autocomplete.visible) return
|
||||
if (!input.focused) return
|
||||
if (store.recording) {
|
||||
voice?.stop()
|
||||
dialog.clear()
|
||||
return
|
||||
}
|
||||
// TODO: this should be its own command
|
||||
if (store.mode === "shell") {
|
||||
setStore("mode", "normal")
|
||||
|
|
@ -370,6 +382,16 @@ export function Prompt(props: PromptProps) {
|
|||
input.cursorOffset = Bun.stringWidth(content)
|
||||
},
|
||||
},
|
||||
{
|
||||
title: "Voice dictate",
|
||||
value: "voice.dictate",
|
||||
keybind: "voice_dictate",
|
||||
category: "Prompt",
|
||||
onSelect: async (dialog) => {
|
||||
dialog.clear()
|
||||
await toggleVoice()
|
||||
},
|
||||
},
|
||||
{
|
||||
title: "Skills",
|
||||
value: "prompt.skills",
|
||||
|
|
@ -587,6 +609,7 @@ export function Prompt(props: PromptProps) {
|
|||
])
|
||||
|
||||
async function submit() {
|
||||
if (store.recording) await voice?.stop()
|
||||
if (props.disabled) return
|
||||
if (autocomplete?.visible) return
|
||||
if (!store.prompt.input) return
|
||||
|
|
@ -624,6 +647,12 @@ export function Prompt(props: PromptProps) {
|
|||
const messageID = MessageID.ascending()
|
||||
let inputText = store.prompt.input
|
||||
|
||||
// Append voice metadata context if voice was used during this prompt
|
||||
const voiceContext = voiceMetaStore.toMarkdown()
|
||||
if (voiceContext) {
|
||||
inputText = inputText + "\n\n<!-- voice-context -->\n" + voiceContext + "\n<!-- /voice-context -->"
|
||||
}
|
||||
|
||||
// Expand pasted text inline before submitting
|
||||
const allExtmarks = input.extmarks.getAllForTypeId(promptPartTypeId)
|
||||
const sortedExtmarks = allExtmarks.sort((a: { start: number }, b: { start: number }) => b.start - a.start)
|
||||
|
|
@ -732,6 +761,86 @@ export function Prompt(props: PromptProps) {
|
|||
}
|
||||
const exit = useExit()
|
||||
|
||||
// ── Voice dictation ──────────────────────────────────────────────
|
||||
let voice: VoiceInput | undefined
|
||||
const voiceMetaStore = new VoiceMetadataStore()
|
||||
|
||||
// Lazily construct the shared VoiceInput and wire its callbacks into the
// prompt's store. Built once on first use and reused across recordings.
function getVoice(): VoiceInput {
  if (!voice) {
    voice = new VoiceInput()
    voice.onTranscript = (seg) => {
      // Ingest metadata regardless of final/partial
      voiceMetaStore.ingest(seg)
      setStore("voiceMeta", voiceMetaStore.shortSummary)

      if (seg.is_final && seg.text.trim()) {
        // The editor widget may have been torn down mid-recording.
        if (!input || input.isDestroyed) return
        // Final text is committed into the prompt input with a trailing space.
        input.insertText(seg.text.trim() + " ")
        setStore("prompt", "input", input.plainText)
        setStore("interimText", "")
        // Defer a layout/render nudge to the next tick so the inserted text
        // paints promptly; re-check destruction since this runs async.
        setTimeout(() => {
          if (!input || input.isDestroyed) return
          input.getLayoutNode().markDirty()
          renderer.requestRender()
        }, 0)
      } else if (!seg.is_final) {
        // Partial transcript: display-only interim text; may be revised later.
        setStore("interimText", seg.text)
      }
    }
    voice.onError = (err) => {
      toast.show({
        message: `Voice: ${err.message}`,
        variant: "error",
        duration: 3000,
      })
      // Reset all voice UI state so the recording indicator doesn't stick.
      setStore("recording", false)
      setStore("interimText", "")
      setStore("voiceMeta", "")
    }
    voice.onStateChange = (active) => {
      setStore("recording", active)
      if (active) {
        // New recording: open a metadata session for trend tracking.
        voiceMetaStore.startSession()
      } else {
        voiceMetaStore.endSession()
        setStore("interimText", "")
      }
    }
  }
  return voice
}
|
||||
|
||||
async function toggleVoice() {
|
||||
if (props.disabled) return
|
||||
if (store.mode === "shell") return
|
||||
try {
|
||||
await getVoice().toggle()
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : "Voice input failed"
|
||||
toast.show({
|
||||
message: msg.includes("not found") || msg.includes("ENOENT") ? "Voice requires sox: brew install sox" : `Voice: ${msg}`,
|
||||
variant: "warning",
|
||||
duration: 5000,
|
||||
})
|
||||
setStore("recording", false)
|
||||
}
|
||||
}
|
||||
|
||||
// Stop any in-flight recording when the prompt component unmounts.
onCleanup(() => {
  voice?.stop()
})

// Stop recording when the user switches sessions; `defer` skips the
// initial run so mounting doesn't immediately stop a fresh recording.
createEffect(
  on(
    () => props.sessionID,
    () => {
      voice?.stop()
    },
    { defer: true },
  ),
)
|
||||
// ── End voice ────────────────────────────────────────────────────
|
||||
|
||||
function pasteText(text: string, virtualText: string) {
|
||||
const currentOffset = input.visualCursor.offset
|
||||
const extmarkStart = currentOffset
|
||||
|
|
@ -945,6 +1054,11 @@ export function Prompt(props: PromptProps) {
|
|||
setStore("extmarkToPartIndex", new Map())
|
||||
return
|
||||
}
|
||||
if (keybind.match("voice_dictate", e)) {
|
||||
e.preventDefault()
|
||||
toggleVoice()
|
||||
return
|
||||
}
|
||||
if (keybind.match("app_exit", e)) {
|
||||
if (store.prompt.input === "") {
|
||||
await exit()
|
||||
|
|
@ -1106,6 +1220,17 @@ export function Prompt(props: PromptProps) {
|
|||
<span style={{ fg: theme.warning, bold: true }}>{local.model.variant.current()}</span>
|
||||
</text>
|
||||
</Show>
|
||||
<Show when={store.recording}>
|
||||
<text fg={theme.textMuted}>·</text>
|
||||
<text fg={theme.error}>
|
||||
●
|
||||
</text>
|
||||
<text fg={theme.textMuted} wrapMode="none">
|
||||
{store.interimText
|
||||
? `"${store.interimText.slice(0, 40)}${store.interimText.length > 40 ? "…" : ""}"`
|
||||
: store.voiceMeta || "listening…"}
|
||||
</text>
|
||||
</Show>
|
||||
</box>
|
||||
</Show>
|
||||
</box>
|
||||
|
|
@ -1245,6 +1370,11 @@ export function Prompt(props: PromptProps) {
|
|||
<text fg={theme.text}>
|
||||
{keybind.print("command_list")} <span style={{ fg: theme.textMuted }}>commands</span>
|
||||
</text>
|
||||
<Show when={store.recording}>
|
||||
<text fg={theme.error}>
|
||||
{keybind.print("voice_dictate")} <span style={{ fg: theme.textMuted }}>stop</span>
|
||||
</text>
|
||||
</Show>
|
||||
</Match>
|
||||
<Match when={store.mode === "shell"}>
|
||||
<text fg={theme.text}>
|
||||
|
|
|
|||
|
|
@ -762,6 +762,7 @@ export namespace Config {
|
|||
tips_toggle: z.string().optional().default("<leader>h").describe("Toggle tips on home screen"),
|
||||
plugin_manager: z.string().optional().default("none").describe("Open plugin manager dialog"),
|
||||
display_thinking: z.string().optional().default("none").describe("Toggle thinking blocks visibility"),
|
||||
voice_dictate: z.string().optional().default("alt+v").describe("Toggle voice dictation"),
|
||||
})
|
||||
.strict()
|
||||
.meta({
|
||||
|
|
|
|||
|
|
@ -0,0 +1,146 @@
|
|||
export interface TranscriptSegment {
|
||||
text: string
|
||||
is_final: boolean
|
||||
metadata?: {
|
||||
emotion?: string
|
||||
intent?: string
|
||||
gender?: string
|
||||
age?: string
|
||||
}
|
||||
metadata_probs?: {
|
||||
emotion?: Array<{ token: string; probability: number }>
|
||||
intent?: Array<{ token: string; probability: number }>
|
||||
}
|
||||
speech_rate?: {
|
||||
words_per_minute: number
|
||||
filler_count: number
|
||||
filler_rate: number
|
||||
pause_count: number
|
||||
}
|
||||
}
|
||||
|
||||
export class AsrStreamClient {
|
||||
private ws: WebSocket | null = null
|
||||
private url: string
|
||||
private language: string
|
||||
private sampleRate: number
|
||||
private endResolve: (() => void) | null = null
|
||||
|
||||
onTranscript: ((seg: TranscriptSegment) => void) | null = null
|
||||
onError: ((err: Error) => void) | null = null
|
||||
|
||||
private token: string
|
||||
|
||||
constructor(opts?: { url?: string; language?: string; sampleRate?: number; token?: string }) {
|
||||
const base = opts?.url ?? process.env.WHISSLE_ASR_URL ?? "wss://api.whissle.ai/asr/stream"
|
||||
this.token = opts?.token ?? process.env.WHISSLE_AUTH_TOKEN ?? ""
|
||||
this.url = this.token ? `${base}?token=${encodeURIComponent(this.token)}` : base
|
||||
this.language = opts?.language ?? process.env.WHISSLE_ASR_LANGUAGE ?? "en"
|
||||
this.sampleRate = opts?.sampleRate ?? 16000
|
||||
}
|
||||
|
||||
get connected(): boolean {
|
||||
return this.ws?.readyState === WebSocket.OPEN
|
||||
}
|
||||
|
||||
connect(): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
if (this.connected) {
|
||||
resolve()
|
||||
return
|
||||
}
|
||||
|
||||
const ws = new WebSocket(this.url)
|
||||
ws.binaryType = "arraybuffer"
|
||||
|
||||
const timeout = setTimeout(() => {
|
||||
reject(new Error("Voice server connection timed out"))
|
||||
try {
|
||||
ws.close()
|
||||
} catch {}
|
||||
}, 10_000)
|
||||
|
||||
ws.addEventListener("open", () => {
|
||||
clearTimeout(timeout)
|
||||
this.ws = ws
|
||||
this.sendConfig()
|
||||
resolve()
|
||||
})
|
||||
|
||||
ws.addEventListener("message", (ev: MessageEvent) => {
|
||||
if (typeof ev.data !== "string") return
|
||||
try {
|
||||
const msg = JSON.parse(ev.data)
|
||||
if (msg.type === "transcript") {
|
||||
this.onTranscript?.({
|
||||
text: msg.text ?? "",
|
||||
is_final: msg.is_final !== false,
|
||||
metadata: msg.metadata,
|
||||
metadata_probs: msg.metadata_probs,
|
||||
speech_rate: msg.speech_rate,
|
||||
})
|
||||
} else if (msg.type === "end") {
|
||||
this.endResolve?.()
|
||||
this.endResolve = null
|
||||
this.close()
|
||||
} else if (msg.type === "error") {
|
||||
this.onError?.(new Error(msg.message ?? "ASR error"))
|
||||
}
|
||||
} catch {}
|
||||
})
|
||||
|
||||
ws.addEventListener("error", () => {
|
||||
clearTimeout(timeout)
|
||||
const err = new Error("Voice server connection failed")
|
||||
reject(err)
|
||||
this.onError?.(err)
|
||||
})
|
||||
|
||||
ws.addEventListener("close", () => {
|
||||
clearTimeout(timeout)
|
||||
this.endResolve?.()
|
||||
this.endResolve = null
|
||||
this.ws = null
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
private sendConfig(): void {
|
||||
this.ws?.send(
|
||||
JSON.stringify({
|
||||
type: "config",
|
||||
language: this.language,
|
||||
use_lm: true,
|
||||
sample_rate: this.sampleRate,
|
||||
metadata_prob: true,
|
||||
word_timestamps: true,
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
sendPcm(pcm: Buffer): void {
|
||||
if (!this.connected || !this.ws) return
|
||||
const ab = pcm.buffer.slice(pcm.byteOffset, pcm.byteOffset + pcm.byteLength)
|
||||
this.ws.send(ab)
|
||||
}
|
||||
|
||||
end(): Promise<void> {
|
||||
if (!this.connected || !this.ws) return Promise.resolve()
|
||||
return new Promise((resolve) => {
|
||||
this.endResolve = resolve
|
||||
this.ws!.send(JSON.stringify({ type: "end" }))
|
||||
setTimeout(() => {
|
||||
this.endResolve?.()
|
||||
this.endResolve = null
|
||||
this.close()
|
||||
}, 5_000)
|
||||
})
|
||||
}
|
||||
|
||||
close(): void {
|
||||
try {
|
||||
this.ws?.close()
|
||||
} catch {}
|
||||
this.ws = null
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
import { MicCapture } from "./mic"
|
||||
import { AsrStreamClient } from "./asr-client"
|
||||
|
||||
export type { TranscriptSegment } from "./asr-client"
|
||||
export type { VoiceMetadata } from "./metadata"
|
||||
export { VoiceMetadataStore } from "./metadata"
|
||||
|
||||
export class VoiceInput {
|
||||
private mic = new MicCapture()
|
||||
private asr: AsrStreamClient
|
||||
private _active = false
|
||||
|
||||
onTranscript: ((seg: {
|
||||
text: string
|
||||
is_final: boolean
|
||||
metadata?: { emotion?: string; intent?: string; gender?: string; age?: string }
|
||||
metadata_probs?: {
|
||||
emotion?: Array<{ token: string; probability: number }>
|
||||
intent?: Array<{ token: string; probability: number }>
|
||||
}
|
||||
speech_rate?: { words_per_minute: number; filler_count: number; filler_rate: number; pause_count: number }
|
||||
}) => void) | null = null
|
||||
onError: ((err: Error) => void) | null = null
|
||||
onStateChange: ((active: boolean) => void) | null = null
|
||||
|
||||
constructor(opts?: { url?: string; language?: string; token?: string }) {
|
||||
this.asr = new AsrStreamClient(opts)
|
||||
}
|
||||
|
||||
get active(): boolean {
|
||||
return this._active
|
||||
}
|
||||
|
||||
async start(): Promise<void> {
|
||||
if (this._active) return
|
||||
|
||||
this.asr.onTranscript = (seg) => this.onTranscript?.(seg)
|
||||
this.asr.onError = (err) => {
|
||||
this.onError?.(err)
|
||||
this.stop()
|
||||
}
|
||||
|
||||
await this.asr.connect()
|
||||
|
||||
this.mic.onData = (pcm) => this.asr.sendPcm(pcm)
|
||||
this.mic.onError = (err) => {
|
||||
this.onError?.(err)
|
||||
this.stop()
|
||||
}
|
||||
|
||||
await this.mic.start()
|
||||
this._active = true
|
||||
this.onStateChange?.(true)
|
||||
}
|
||||
|
||||
async stop(): Promise<void> {
|
||||
if (!this._active) return
|
||||
this._active = false
|
||||
this.mic.stop()
|
||||
try {
|
||||
await this.asr.end()
|
||||
} catch {}
|
||||
this.onStateChange?.(false)
|
||||
}
|
||||
|
||||
async toggle(): Promise<void> {
|
||||
if (this._active) {
|
||||
await this.stop()
|
||||
} else {
|
||||
await this.start()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,214 @@
|
|||
import type { TranscriptSegment } from "./asr-client"
|
||||
|
||||
/**
|
||||
* Accumulated voice metadata for a session — emotion trends, intent patterns,
|
||||
* speech rate stats. Designed to be serialized to voice-metadata.md and
|
||||
* injected as context so the LLM can adapt to the user's vocal state.
|
||||
*/
|
||||
export interface VoiceMetadata {
|
||||
/** Running count of voice segments processed */
|
||||
segmentCount: number
|
||||
/** Emotion frequency: { happy: 5, frustrated: 2, ... } */
|
||||
emotionCounts: Record<string, number>
|
||||
/** Intent frequency: { question: 3, command: 7, ... } */
|
||||
intentCounts: Record<string, number>
|
||||
/** Current/latest emotion detected */
|
||||
currentEmotion: string
|
||||
/** Current/latest intent detected */
|
||||
currentIntent: string
|
||||
/** Average words per minute across segments */
|
||||
avgWpm: number
|
||||
/** Total filler count (um, uh, like) */
|
||||
totalFillers: number
|
||||
/** Total pause count */
|
||||
totalPauses: number
|
||||
/** Recent emotion sequence (last 10) for trend detection */
|
||||
recentEmotions: string[]
|
||||
/** Timestamps of recording sessions */
|
||||
sessions: Array<{ start: number; end?: number; segmentCount: number }>
|
||||
}
|
||||
|
||||
/**
|
||||
* Accumulates voice metadata across a session and produces a markdown
|
||||
* summary suitable for injection into LLM context.
|
||||
*/
|
||||
export class VoiceMetadataStore {
|
||||
private data: VoiceMetadata = {
|
||||
segmentCount: 0,
|
||||
emotionCounts: {},
|
||||
intentCounts: {},
|
||||
currentEmotion: "",
|
||||
currentIntent: "",
|
||||
avgWpm: 0,
|
||||
totalFillers: 0,
|
||||
totalPauses: 0,
|
||||
recentEmotions: [],
|
||||
sessions: [],
|
||||
}
|
||||
|
||||
private wpmSamples: number[] = []
|
||||
private sessionStart: number | null = null
|
||||
|
||||
get current(): Readonly<VoiceMetadata> {
|
||||
return this.data
|
||||
}
|
||||
|
||||
/** Call when recording starts */
|
||||
startSession(): void {
|
||||
this.sessionStart = Date.now()
|
||||
this.data.sessions.push({ start: this.sessionStart, segmentCount: 0 })
|
||||
}
|
||||
|
||||
/** Call when recording stops */
|
||||
endSession(): void {
|
||||
const last = this.data.sessions[this.data.sessions.length - 1]
|
||||
if (last && !last.end) {
|
||||
last.end = Date.now()
|
||||
}
|
||||
this.sessionStart = null
|
||||
}
|
||||
|
||||
/** Ingest a transcript segment's metadata */
|
||||
ingest(seg: TranscriptSegment): void {
|
||||
if (!seg.is_final) return
|
||||
|
||||
this.data.segmentCount++
|
||||
|
||||
// Track the current session's segment count
|
||||
const last = this.data.sessions[this.data.sessions.length - 1]
|
||||
if (last && !last.end) last.segmentCount++
|
||||
|
||||
// Emotion
|
||||
const emotion = seg.metadata?.emotion
|
||||
if (emotion) {
|
||||
this.data.emotionCounts[emotion] = (this.data.emotionCounts[emotion] ?? 0) + 1
|
||||
this.data.currentEmotion = emotion
|
||||
this.data.recentEmotions.push(emotion)
|
||||
if (this.data.recentEmotions.length > 10) {
|
||||
this.data.recentEmotions.shift()
|
||||
}
|
||||
}
|
||||
|
||||
// Intent
|
||||
const intent = seg.metadata?.intent
|
||||
if (intent) {
|
||||
this.data.intentCounts[intent] = (this.data.intentCounts[intent] ?? 0) + 1
|
||||
this.data.currentIntent = intent
|
||||
}
|
||||
|
||||
// Speech rate
|
||||
if (seg.speech_rate) {
|
||||
if (seg.speech_rate.words_per_minute > 0) {
|
||||
this.wpmSamples.push(seg.speech_rate.words_per_minute)
|
||||
this.data.avgWpm = Math.round(
|
||||
this.wpmSamples.reduce((a, b) => a + b, 0) / this.wpmSamples.length,
|
||||
)
|
||||
}
|
||||
this.data.totalFillers += seg.speech_rate.filler_count
|
||||
this.data.totalPauses += seg.speech_rate.pause_count
|
||||
}
|
||||
}
|
||||
|
||||
/** Dominant emotion (most frequent) */
|
||||
get dominantEmotion(): string {
|
||||
let max = 0
|
||||
let best = ""
|
||||
for (const [emotion, count] of Object.entries(this.data.emotionCounts)) {
|
||||
if (count > max) {
|
||||
max = count
|
||||
best = emotion
|
||||
}
|
||||
}
|
||||
return best
|
||||
}
|
||||
|
||||
/** Detect emotional trend from recent sequence */
|
||||
get emotionTrend(): string {
|
||||
const recent = this.data.recentEmotions
|
||||
if (recent.length < 3) return ""
|
||||
const last3 = recent.slice(-3)
|
||||
if (last3.every((e) => e === last3[0])) return `consistently ${last3[0]}`
|
||||
// Check for shift
|
||||
const first = recent.slice(0, Math.floor(recent.length / 2))
|
||||
const second = recent.slice(Math.floor(recent.length / 2))
|
||||
const mode = (arr: string[]) => {
|
||||
const counts: Record<string, number> = {}
|
||||
for (const v of arr) counts[v] = (counts[v] ?? 0) + 1
|
||||
return Object.entries(counts).sort((a, b) => b[1] - a[1])[0]?.[0] ?? ""
|
||||
}
|
||||
const firstMode = mode(first)
|
||||
const secondMode = mode(second)
|
||||
if (firstMode && secondMode && firstMode !== secondMode) {
|
||||
return `shifting from ${firstMode} to ${secondMode}`
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
/** Short single-line summary for TUI display */
|
||||
get shortSummary(): string {
|
||||
const parts: string[] = []
|
||||
if (this.data.currentEmotion) parts.push(this.data.currentEmotion)
|
||||
if (this.data.currentIntent) parts.push(this.data.currentIntent)
|
||||
if (this.data.avgWpm > 0) parts.push(`${this.data.avgWpm}wpm`)
|
||||
return parts.join(" · ")
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a markdown summary for LLM context injection.
|
||||
* Compact enough to not waste tokens, rich enough to be useful.
|
||||
*/
|
||||
toMarkdown(): string {
|
||||
const d = this.data
|
||||
if (d.segmentCount === 0) return ""
|
||||
|
||||
const lines: string[] = ["## Voice Session Context"]
|
||||
lines.push("")
|
||||
|
||||
// Current state
|
||||
const current: string[] = []
|
||||
if (d.currentEmotion) current.push(`**Emotion:** ${d.currentEmotion}`)
|
||||
if (d.currentIntent) current.push(`**Intent:** ${d.currentIntent}`)
|
||||
if (current.length) {
|
||||
lines.push(`Current: ${current.join(", ")}`)
|
||||
}
|
||||
|
||||
// Trend
|
||||
const trend = this.emotionTrend
|
||||
if (trend) lines.push(`Trend: ${trend}`)
|
||||
|
||||
// Emotion distribution (top 3)
|
||||
const sortedEmotions = Object.entries(d.emotionCounts)
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.slice(0, 3)
|
||||
if (sortedEmotions.length) {
|
||||
const total = Object.values(d.emotionCounts).reduce((a, b) => a + b, 0)
|
||||
const dist = sortedEmotions
|
||||
.map(([e, c]) => `${e} ${Math.round((c / total) * 100)}%`)
|
||||
.join(", ")
|
||||
lines.push(`Emotions: ${dist}`)
|
||||
}
|
||||
|
||||
// Intent distribution (top 3)
|
||||
const sortedIntents = Object.entries(d.intentCounts)
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.slice(0, 3)
|
||||
if (sortedIntents.length) {
|
||||
const total = Object.values(d.intentCounts).reduce((a, b) => a + b, 0)
|
||||
const dist = sortedIntents
|
||||
.map(([i, c]) => `${i} ${Math.round((c / total) * 100)}%`)
|
||||
.join(", ")
|
||||
lines.push(`Intents: ${dist}`)
|
||||
}
|
||||
|
||||
// Speech stats
|
||||
const stats: string[] = []
|
||||
if (d.avgWpm > 0) stats.push(`${d.avgWpm} wpm`)
|
||||
if (d.totalFillers > 0) stats.push(`${d.totalFillers} fillers`)
|
||||
if (d.totalPauses > 0) stats.push(`${d.totalPauses} pauses`)
|
||||
if (stats.length) lines.push(`Speech: ${stats.join(", ")}`)
|
||||
|
||||
lines.push(`Segments: ${d.segmentCount}`)
|
||||
|
||||
return lines.join("\n")
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
import { spawn, type ChildProcess } from "child_process"
|
||||
import which from "which"
|
||||
|
||||
/**
 * Captures microphone audio by spawning sox (`rec` or `sox -d`) and reading
 * raw PCM from its stdout. Chunks are delivered via onData; spawn/stream
 * failures via onError. Requires sox to be installed on the host.
 */
export class MicCapture {
  // Child sox process; null whenever capture is not running.
  private proc: ChildProcess | null = null

  // Raw PCM chunks: signed 16-bit, mono, resampled to 16 kHz (see args below).
  onData: ((pcm: Buffer) => void) | null = null
  onError: ((err: Error) => void) | null = null

  get recording(): boolean {
    return this.proc !== null && !this.proc.killed
  }

  /**
   * Spawn the capture process. No-op if already running.
   * Throws when neither `rec` nor `sox` is found on PATH.
   */
  async start(): Promise<void> {
    if (this.proc) return

    const cmd = await MicCapture.findCommand()
    if (!cmd) {
      throw new Error("sox not found. Install it: brew install sox (macOS) / apt install sox (Linux)")
    }

    // Capture at device native rate, resample to 16kHz via the "rate" effect.
    // Don't pass -r 16000 — on macOS CoreAudio it tries to set the device to
    // 16kHz, fails, falls back to 48kHz, and outputs 48kHz raw PCM.
    // The "rate 16000" effect after "-" does the actual resampling.
    // `rec` implies the default input device; plain `sox` needs -d for it.
    const args =
      cmd === "rec"
        ? ["-q", "-t", "raw", "-b", "16", "-c", "1", "-e", "signed-integer", "-", "rate", "16000"]
        : ["-d", "-q", "-t", "raw", "-b", "16", "-c", "1", "-e", "signed-integer", "-", "rate", "16000"]

    this.proc = spawn(cmd, args, {
      // stdin unused; stdout carries PCM; stderr carries sox diagnostics.
      stdio: ["ignore", "pipe", "pipe"],
    })

    this.proc.stdout!.on("data", (chunk: Buffer) => {
      this.onData?.(chunk)
    })

    this.proc.stderr!.on("data", (data: Buffer) => {
      const msg = data.toString().trim()
      // sox emits WARN about sample rate when resampling — expected, not an error
      if (msg && !msg.includes("WARN") && !msg.includes("can't set sample rate")) {
        this.onError?.(new Error(`mic: ${msg}`))
      }
    })

    // Spawn failure (e.g. binary vanished between which() and spawn).
    this.proc.on("error", (err) => {
      this.onError?.(err)
      this.proc = null
    })

    this.proc.on("exit", () => {
      this.proc = null
    })
  }

  /** Terminate the capture process; safe to call when not recording. */
  stop(): void {
    if (!this.proc) return
    try {
      this.proc.kill("SIGTERM")
    } catch {}
    this.proc = null
  }

  // Prefer `rec` (sox's record alias) and fall back to `sox`; null if neither
  // is on PATH.
  private static async findCommand(): Promise<string | null> {
    for (const cmd of ["rec", "sox"]) {
      try {
        await which(cmd)
        return cmd
      } catch {}
    }
    return null
  }

  /** True when a usable sox binary is installed. */
  static async available(): Promise<boolean> {
    return (await MicCapture.findCommand()) !== null
  }
}
|
||||
Loading…
Reference in New Issue