diff --git a/bun.lock b/bun.lock
index df16de854a..79ff4ee161 100644
--- a/bun.lock
+++ b/bun.lock
@@ -317,6 +317,7 @@
"name": "mobile-voice",
"version": "1.0.0",
"dependencies": {
+ "@fugood/react-native-audio-pcm-stream": "1.1.4",
"@react-navigation/bottom-tabs": "^7.15.5",
"@react-navigation/elements": "^2.9.10",
"@react-navigation/native": "^7.1.33",
@@ -345,14 +346,13 @@
"react-dom": "19.2.0",
"react-native": "0.83.4",
"react-native-audio-api": "^0.11.7",
- "react-native-executorch": "^0.8.0",
- "react-native-executorch-expo-resource-fetcher": "^0.8.0",
"react-native-gesture-handler": "~2.30.0",
"react-native-reanimated": "4.2.1",
"react-native-safe-area-context": "~5.6.2",
"react-native-screens": "~4.23.0",
"react-native-web": "~0.21.0",
"react-native-worklets": "0.7.2",
+ "whisper.rn": "0.5.5",
},
"devDependencies": {
"@types/react": "~19.2.2",
@@ -1378,6 +1378,8 @@
"@fontsource/inter": ["@fontsource/inter@5.2.8", "", {}, "sha512-P6r5WnJoKiNVV+zvW2xM13gNdFhAEpQ9dQJHt3naLvfg+LkF2ldgSLiF4T41lf1SQCM9QmkqPTn4TH568IRagg=="],
+ "@fugood/react-native-audio-pcm-stream": ["@fugood/react-native-audio-pcm-stream@1.1.4", "", {}, "sha512-M6H6ay4ea0vpioII9T/C9qXFPeGpxGN24nl0REP2/wtsorZXg3zzHjZbf3UUUwjf6lEEHMlGCJfXUsxwC/vV8w=="],
+
"@graphql-typed-document-node/core": ["@graphql-typed-document-node/core@3.2.0", "", { "peerDependencies": { "graphql": "^0.8.0 || ^0.9.0 || ^0.10.0 || ^0.11.0 || ^0.12.0 || ^0.13.0 || ^14.0.0 || ^15.0.0 || ^16.0.0 || ^17.0.0" } }, "sha512-mB9oAsNCm9aM3/SOv4YtBMqZbYj10R7dkq8byBqxGY/ncFwhf2oQzMV+LCRlWoDSEBJ3COiR1yeDvMtsoOsuFQ=="],
"@happy-dom/global-registrator": ["@happy-dom/global-registrator@20.0.11", "", { "dependencies": { "@types/node": "^20.0.0", "happy-dom": "^20.0.11" } }, "sha512-GqNqiShBT/lzkHTMC/slKBrvN0DsD4Di8ssBk4aDaVgEn+2WMzE6DXxq701ndSXj7/0cJ8mNT71pM7Bnrr6JRw=="],
@@ -1396,8 +1398,6 @@
"@hono/zod-validator": ["@hono/zod-validator@0.4.2", "", { "peerDependencies": { "hono": ">=3.9.0", "zod": "^3.19.1" } }, "sha512-1rrlBg+EpDPhzOV4hT9pxr5+xDVmKuz6YJl+la7VCwK6ass5ldyKm5fD+umJdV2zhHD6jROoCCv8NbTwyfhT0g=="],
- "@huggingface/jinja": ["@huggingface/jinja@0.5.6", "", {}, "sha512-MyMWyLnjqo+KRJYSH7oWNbsOn5onuIvfXYPcc0WOGxU0eHUV7oAYUoQTl2BMdu7ml+ea/bu11UM+EshbeHwtIA=="],
-
"@ibm/plex": ["@ibm/plex@6.4.1", "", { "dependencies": { "@ibm/telemetry-js": "^1.5.1" } }, "sha512-fnsipQywHt3zWvsnlyYKMikcVI7E2fEwpiPnIHFqlbByXVfQfANAAeJk1IV4mNnxhppUIDlhU0TzwYwL++Rn2g=="],
"@ibm/telemetry-js": ["@ibm/telemetry-js@1.11.0", "", { "bin": { "ibmtelemetry": "dist/collect.js" } }, "sha512-RO/9j+URJnSfseWg9ZkEX9p+a3Ousd33DBU7rOafoZB08RqdzxFVYJ2/iM50dkBuD0o7WX7GYt1sLbNgCoE+pA=="],
@@ -3918,10 +3918,6 @@
"jsonfile": ["jsonfile@4.0.0", "", { "optionalDependencies": { "graceful-fs": "^4.1.6" } }, "sha512-m6F1R3z8jjlf2imQHS2Qez5sjKWQzbuuhuJ/FKYFRZvPE3PuHcSMVZzfsLhGVOkfd20obL5SWEBew5ShlquNxg=="],
- "jsonrepair": ["jsonrepair@3.13.3", "", { "bin": { "jsonrepair": "bin/cli.js" } }, "sha512-BTznj0owIt2CBAH/LTo7+1I5pMvl1e1033LRl/HUowlZmJOIhzC0zbX5bxMngLkfT4WnzPP26QnW5wMr2g9tsQ=="],
-
- "jsonschema": ["jsonschema@1.5.0", "", {}, "sha512-K+A9hhqbn0f3pJX17Q/7H6yQfD/5OXgdrR5UE12gMXCiN9D5Xq2o5mddV2QEcX/bjla99ASsAAQUyMCCRWAEhw=="],
-
"jsonwebtoken": ["jsonwebtoken@9.0.3", "", { "dependencies": { "jws": "^4.0.1", "lodash.includes": "^4.3.0", "lodash.isboolean": "^3.0.3", "lodash.isinteger": "^4.0.4", "lodash.isnumber": "^3.0.3", "lodash.isplainobject": "^4.0.6", "lodash.isstring": "^4.0.1", "lodash.once": "^4.0.0", "ms": "^2.1.1", "semver": "^7.5.4" } }, "sha512-MT/xP0CrubFRNLNKvxJ2BYfy53Zkm++5bX9dtuPbqAeQpTVe0MQTFhao8+Cp//EmJp244xt6Drw/GVEGCUj40g=="],
"jwa": ["jwa@2.0.1", "", { "dependencies": { "buffer-equal-constant-time": "^1.0.1", "ecdsa-sig-formatter": "1.0.11", "safe-buffer": "^5.0.1" } }, "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg=="],
@@ -4536,7 +4532,7 @@
"plist": ["plist@3.1.0", "", { "dependencies": { "@xmldom/xmldom": "^0.8.8", "base64-js": "^1.5.1", "xmlbuilder": "^15.1.1" } }, "sha512-uysumyrvkUX0rX/dEVqt8gC3sTBzd4zoWfLeS29nb53imdaXVvLINYXTI2GNqzaMuvacNx4uJQ8+b3zXR0pkgQ=="],
- "pngjs": ["pngjs@7.0.0", "", {}, "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow=="],
+ "pngjs": ["pngjs@5.0.0", "", {}, "sha512-40QW5YalBNfQo5yRYmiw7Yz6TKKVr3h6970B2YE+3fQpsWcrbj1PzJgxeJ19DRQjhMbKPIuMY8rFaXc8moolVw=="],
"poe-oauth": ["poe-oauth@0.0.3", "", {}, "sha512-KgxDylcuq/mov8URSplrBGjrIjkQwjN/Ml8BhqaGsAvHzYN3yhuROdv1sDRfwqncg7TT8XzJvMeJAWmv/4NDLw=="],
@@ -4650,10 +4646,6 @@
"react-native-audio-api": ["react-native-audio-api@0.11.7", "", { "dependencies": { "semver": "^7.7.3" }, "peerDependencies": { "react": "*", "react-native": "*" }, "bin": { "setup-rn-audio-api-web": "scripts/setup-rn-audio-api-web.js" } }, "sha512-2oIoP77Tn2nlouRVfEC3bAsuSyKU6xhGNkSnVXTLLQQZslEDoYX2cN9pVRZoWOqhFrLT8q4IZI9HaFgYL13L1A=="],
- "react-native-executorch": ["react-native-executorch@0.8.0", "", { "dependencies": { "@huggingface/jinja": "^0.5.0", "jsonrepair": "^3.12.0", "jsonschema": "^1.5.0", "pngjs": "^7.0.0", "zod": "^4.3.6" }, "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-9zRiJiCSTOYbES4htuk+yqkhgec/i4L1E63ZYgJ1AHkDbvHyoYLH3KkKjjzxDw7NYJCCOx+6vj9l9JrodoCbzg=="],
-
- "react-native-executorch-expo-resource-fetcher": ["react-native-executorch-expo-resource-fetcher@0.8.0", "", { "peerDependencies": { "expo": ">=54.0.0", "expo-asset": ">=12.0.0", "expo-file-system": ">=19.0.0", "react-native": "*", "react-native-executorch": "*" } }, "sha512-vdAne2FBL0nCQ2c2yHTSt8Uttm0Klmo/K7tirSVlKxgVtli4cmsfl+UpR5giaNtlRZ3ImMAMXNW34j0fItmRfQ=="],
-
"react-native-gesture-handler": ["react-native-gesture-handler@2.30.1", "", { "dependencies": { "@egjs/hammerjs": "^2.0.17", "hoist-non-react-statics": "^3.3.0", "invariant": "^2.2.4" }, "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-xIUBDo5ktmJs++0fZlavQNvDEE4PsihWhSeJsJtoz4Q6p0MiTM9TgrTgfEgzRR36qGPytFoeq+ShLrVwGdpUdA=="],
"react-native-is-edge-to-edge": ["react-native-is-edge-to-edge@1.3.1", "", { "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-NIXU/iT5+ORyCc7p0z2nnlkouYKX425vuU1OEm6bMMtWWR9yvb+Xg5AZmImTKoF9abxCPqrKC3rOZsKzUYgYZA=="],
@@ -5446,6 +5438,8 @@
"which-typed-array": ["which-typed-array@1.1.20", "", { "dependencies": { "available-typed-arrays": "^1.0.7", "call-bind": "^1.0.8", "call-bound": "^1.0.4", "for-each": "^0.3.5", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-tostringtag": "^1.0.2" } }, "sha512-LYfpUkmqwl0h9A2HL09Mms427Q1RZWuOHsukfVcKRq9q95iQxdw0ix1JQrqbcDR9PH1QDwf5Qo8OZb5lksZ8Xg=="],
+ "whisper.rn": ["whisper.rn@0.5.5", "", { "dependencies": { "safe-buffer": "^5.2.1" }, "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-awFE+ImMtRdGhA+hjm3GEwnSvyEVP1sdhMb+MyCa5bVdoOCpaxrwVwXDo9U46Qwkhwml3PCFaauTsGmRkTyhdw=="],
+
"why-is-node-running": ["why-is-node-running@3.2.2", "", { "bin": { "why-is-node-running": "cli.js" } }, "sha512-NKUzAelcoCXhXL4dJzKIwXeR8iEVqsA0Lq6Vnd0UXvgaKbzVo4ZTHROF2Jidrv+SgxOQ03fMinnNhzZATxOD3A=="],
"widest-line": ["widest-line@5.0.0", "", { "dependencies": { "string-width": "^7.0.0" } }, "sha512-c9bZp7b5YtRj2wOe6dlj32MK+Bx/M/d+9VB2SHM1OtsUHR0aV0tdP6DWh/iMt0kWi1t5g1Iudu6hQRNd1A4PVA=="],
@@ -5846,6 +5840,8 @@
"@jimp/core/mime": ["mime@3.0.0", "", { "bin": { "mime": "cli.js" } }, "sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A=="],
+ "@jimp/js-png/pngjs": ["pngjs@7.0.0", "", {}, "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow=="],
+
"@jimp/plugin-blit/zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],
"@jimp/plugin-circle/zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],
@@ -6556,8 +6552,6 @@
"proper-lockfile/signal-exit": ["signal-exit@3.0.7", "", {}, "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="],
- "qrcode/pngjs": ["pngjs@5.0.0", "", {}, "sha512-40QW5YalBNfQo5yRYmiw7Yz6TKKVr3h6970B2YE+3fQpsWcrbj1PzJgxeJ19DRQjhMbKPIuMY8rFaXc8moolVw=="],
-
"qrcode/yargs": ["yargs@15.4.1", "", { "dependencies": { "cliui": "^6.0.0", "decamelize": "^1.2.0", "find-up": "^4.1.0", "get-caller-file": "^2.0.1", "require-directory": "^2.1.1", "require-main-filename": "^2.0.0", "set-blocking": "^2.0.0", "string-width": "^4.2.0", "which-module": "^2.0.0", "y18n": "^4.0.0", "yargs-parser": "^18.1.2" } }, "sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A=="],
"raw-body/iconv-lite": ["iconv-lite@0.4.24", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3" } }, "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA=="],
@@ -6582,8 +6576,6 @@
"react-native/yargs": ["yargs@17.7.2", "", { "dependencies": { "cliui": "^8.0.1", "escalade": "^3.1.1", "get-caller-file": "^2.0.5", "require-directory": "^2.1.1", "string-width": "^4.2.3", "y18n": "^5.0.5", "yargs-parser": "^21.1.1" } }, "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w=="],
- "react-native-executorch/zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="],
-
"react-native-reanimated/react-native-is-edge-to-edge": ["react-native-is-edge-to-edge@1.2.1", "", { "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-FLbPWl/MyYQWz+KwqOZsSyj2JmLKglHatd3xLZWskXOpRaio4LfEDEz8E/A6uD8QoTHW6Aobw1jbEwK7KMgR7Q=="],
"react-native-reanimated/semver": ["semver@7.7.3", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q=="],
diff --git a/packages/mobile-voice/app.json b/packages/mobile-voice/app.json
index 43a9be652a..66782dd6d8 100644
--- a/packages/mobile-voice/app.json
+++ b/packages/mobile-voice/app.json
@@ -1,6 +1,6 @@
{
"expo": {
- "name": "mobile-voice",
+ "name": "Control",
"slug": "mobile-voice",
"version": "1.0.0",
"orientation": "portrait",
@@ -10,6 +10,9 @@
"ios": {
"icon": "./assets/images/icon.png",
"bundleIdentifier": "com.anomalyco.mobilevoice",
+ "entitlements": {
+ "com.apple.developer.kernel.extended-virtual-addressing": true
+ },
"infoPlist": {
"NSMicrophoneUsageDescription": "This app needs microphone access for live speech-to-text dictation.",
"NSAppTransportSecurity": {
diff --git a/packages/mobile-voice/package.json b/packages/mobile-voice/package.json
index 42e70e3b64..d4c68fa7be 100644
--- a/packages/mobile-voice/package.json
+++ b/packages/mobile-voice/package.json
@@ -13,6 +13,7 @@
"lint": "expo lint"
},
"dependencies": {
+ "@fugood/react-native-audio-pcm-stream": "1.1.4",
"@react-navigation/bottom-tabs": "^7.15.5",
"@react-navigation/elements": "^2.9.10",
"@react-navigation/native": "^7.1.33",
@@ -41,14 +42,13 @@
"react-dom": "19.2.0",
"react-native": "0.83.4",
"react-native-audio-api": "^0.11.7",
- "react-native-executorch": "^0.8.0",
- "react-native-executorch-expo-resource-fetcher": "^0.8.0",
"react-native-gesture-handler": "~2.30.0",
"react-native-reanimated": "4.2.1",
"react-native-safe-area-context": "~5.6.2",
"react-native-screens": "~4.23.0",
"react-native-web": "~0.21.0",
- "react-native-worklets": "0.7.2"
+ "react-native-worklets": "0.7.2",
+ "whisper.rn": "0.5.5"
},
"devDependencies": {
"@types/react": "~19.2.2",
diff --git a/packages/mobile-voice/src/app/_layout.tsx b/packages/mobile-voice/src/app/_layout.tsx
index a2fa275dc0..67571e0b94 100644
--- a/packages/mobile-voice/src/app/_layout.tsx
+++ b/packages/mobile-voice/src/app/_layout.tsx
@@ -1,25 +1,20 @@
-import React from 'react';
-import { Slot } from 'expo-router';
-import { LogBox } from 'react-native';
-import { initExecutorch } from 'react-native-executorch';
-import { ExpoResourceFetcher } from 'react-native-executorch-expo-resource-fetcher';
+import React from "react"
+import { Slot } from "expo-router"
+import { LogBox } from "react-native"
import {
configureNotificationBehavior,
registerBackgroundNotificationTask,
-} from '@/notifications/monitoring-notifications';
-
-// Initialize the ExecuTorch resource fetcher before any model hooks run
-initExecutorch({ resourceFetcher: ExpoResourceFetcher });
+} from "@/notifications/monitoring-notifications"
// Suppress known non-actionable warnings from third-party libs.
LogBox.ignoreLogs([
- 'RecordingNotificationManager is not implemented on iOS',
- '[React Native ExecuTorch] No content-length header',
-]);
+ "RecordingNotificationManager is not implemented on iOS",
+ "`transcribeRealtime` is deprecated, use `RealtimeTranscriber` instead",
+])
-configureNotificationBehavior();
-registerBackgroundNotificationTask().catch(() => {});
+configureNotificationBehavior()
+registerBackgroundNotificationTask().catch(() => {})
export default function RootLayout() {
- return <Slot />;
+ return <Slot />
}
diff --git a/packages/mobile-voice/src/app/index.tsx b/packages/mobile-voice/src/app/index.tsx
index 88c50ceeaf..b7083ce88a 100644
--- a/packages/mobile-voice/src/app/index.tsx
+++ b/packages/mobile-voice/src/app/index.tsx
@@ -7,6 +7,7 @@ import {
ScrollView,
Modal,
Alert,
+ ActivityIndicator,
LayoutChangeEvent,
AppState,
AppStateStatus,
@@ -25,11 +26,13 @@ import Animated, {
} from "react-native-reanimated"
import { SafeAreaView } from "react-native-safe-area-context"
import { StatusBar } from "expo-status-bar"
+import { SymbolView } from "expo-symbols"
import * as Haptics from "expo-haptics"
import { useAudioPlayer } from "expo-audio"
-import { useSpeechToText, WHISPER_BASE_EN } from "react-native-executorch"
-import { ExpoResourceFetcher } from "react-native-executorch-expo-resource-fetcher"
-import { AudioManager, AudioRecorder } from "react-native-audio-api"
+import { initWhisper, releaseAllWhisper, type WhisperContext } from "whisper.rn"
+import { RealtimeTranscriber, type RealtimeTranscribeEvent } from "whisper.rn/src/realtime-transcription"
+import { AudioPcmStreamAdapter } from "whisper.rn/src/realtime-transcription/adapters/AudioPcmStreamAdapter"
+import { AudioManager } from "react-native-audio-api"
import * as Notifications from "expo-notifications"
import * as FileSystem from "expo-file-system/legacy"
import Constants from "expo-constants"
@@ -49,8 +52,6 @@ import {
onPushTokenChange,
} from "@/notifications/monitoring-notifications"
-const SAMPLE_RATE = 16000
-const AUDIO_BUFFER_SECONDS = 0.02
const CONTROL_HEIGHT = 86
const SEND_SETTLE_MS = 240
const WAVEFORM_ROWS = 5
@@ -61,6 +62,187 @@ const DROPDOWN_VISIBLE_ROWS = 6
const TAP_THRESHOLD_MS = 300
const DEFAULT_RELAY_URL = "https://apn.dev.opencode.ai"
const SERVER_STATE_FILE = `${FileSystem.documentDirectory}mobile-voice-servers.json`
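+// Whisper model files and user settings live in the app's document directory so they survive restarts.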
+const WHISPER_SETTINGS_FILE = `${FileSystem.documentDirectory}mobile-voice-whisper-settings.json`
+const WHISPER_MODELS_DIR = `${FileSystem.documentDirectory}whisper-models`
+const WHISPER_REPO = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main"
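+// ggml checkpoints from the whisper.cpp Hugging Face repo, selectable from the in-app settings sheet.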
+const WHISPER_MODELS = [
+ "ggml-tiny.en-q5_1.bin",
+ "ggml-tiny.en-q8_0.bin",
+ "ggml-tiny.en.bin",
+ "ggml-tiny-q5_1.bin",
+ "ggml-tiny-q8_0.bin",
+ "ggml-tiny.bin",
+ "ggml-base.en-q5_1.bin",
+ "ggml-base.en-q8_0.bin",
+ "ggml-base.en.bin",
+ "ggml-base-q5_1.bin",
+ "ggml-base-q8_0.bin",
+ "ggml-base.bin",
+ "ggml-small.en-q5_1.bin",
+ "ggml-small.en-q8_0.bin",
+ "ggml-small.en.bin",
+ "ggml-small-q5_1.bin",
+ "ggml-small-q8_0.bin",
+ "ggml-small.bin",
+ "ggml-medium.en-q5_0.bin",
+ "ggml-medium.en-q8_0.bin",
+ "ggml-medium.en.bin",
+ "ggml-medium-q5_0.bin",
+ "ggml-medium-q8_0.bin",
+ "ggml-medium.bin",
+ "ggml-large-v1.bin",
+ "ggml-large-v2-q5_0.bin",
+ "ggml-large-v2-q8_0.bin",
+ "ggml-large-v2.bin",
+ "ggml-large-v3-q5_0.bin",
+ "ggml-large-v3-turbo-q5_0.bin",
+ "ggml-large-v3-turbo-q8_0.bin",
+ "ggml-large-v3-turbo.bin",
+ "ggml-large-v3.bin",
+] as const
+
+type WhisperModelID = (typeof WHISPER_MODELS)[number]
+type TranscriptionMode = "bulk" | "realtime"
+const DEFAULT_WHISPER_MODEL: WhisperModelID = "ggml-medium.bin"
+const DEFAULT_TRANSCRIPTION_MODE: TranscriptionMode = "bulk"
+
+const WHISPER_MODEL_LABELS: Record<WhisperModelID, string> = {
+ "ggml-tiny.en-q5_1.bin": "tiny.en q5_1",
+ "ggml-tiny.en-q8_0.bin": "tiny.en q8_0",
+ "ggml-tiny.en.bin": "tiny.en",
+ "ggml-tiny-q5_1.bin": "tiny q5_1",
+ "ggml-tiny-q8_0.bin": "tiny q8_0",
+ "ggml-tiny.bin": "tiny",
+ "ggml-base.en-q5_1.bin": "base.en q5_1",
+ "ggml-base.en-q8_0.bin": "base.en q8_0",
+ "ggml-base.en.bin": "base.en",
+ "ggml-base-q5_1.bin": "base q5_1",
+ "ggml-base-q8_0.bin": "base q8_0",
+ "ggml-base.bin": "base",
+ "ggml-small.en-q5_1.bin": "small.en q5_1",
+ "ggml-small.en-q8_0.bin": "small.en q8_0",
+ "ggml-small.en.bin": "small.en",
+ "ggml-small-q5_1.bin": "small q5_1",
+ "ggml-small-q8_0.bin": "small q8_0",
+ "ggml-small.bin": "small",
+ "ggml-medium.en-q5_0.bin": "medium.en q5_0",
+ "ggml-medium.en-q8_0.bin": "medium.en q8_0",
+ "ggml-medium.en.bin": "medium.en",
+ "ggml-medium-q5_0.bin": "medium q5_0",
+ "ggml-medium-q8_0.bin": "medium q8_0",
+ "ggml-medium.bin": "medium",
+ "ggml-large-v1.bin": "large-v1",
+ "ggml-large-v2-q5_0.bin": "large-v2 q5_0",
+ "ggml-large-v2-q8_0.bin": "large-v2 q8_0",
+ "ggml-large-v2.bin": "large-v2",
+ "ggml-large-v3-q5_0.bin": "large-v3 q5_0",
+ "ggml-large-v3-turbo-q5_0.bin": "large-v3 turbo q5_0",
+ "ggml-large-v3-turbo-q8_0.bin": "large-v3 turbo q8_0",
+ "ggml-large-v3-turbo.bin": "large-v3 turbo",
+ "ggml-large-v3.bin": "large-v3",
+}
+
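+// Approximate file sizes in bytes, used only to render the download-size labels in the model list.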
+const WHISPER_MODEL_SIZES: Record<WhisperModelID, number> = {
+ "ggml-tiny.en-q5_1.bin": 32166155,
+ "ggml-tiny.en-q8_0.bin": 43550795,
+ "ggml-tiny.en.bin": 77704715,
+ "ggml-tiny-q5_1.bin": 32152673,
+ "ggml-tiny-q8_0.bin": 43537433,
+ "ggml-tiny.bin": 77691713,
+ "ggml-base.en-q5_1.bin": 59721011,
+ "ggml-base.en-q8_0.bin": 81781811,
+ "ggml-base.en.bin": 147964211,
+ "ggml-base-q5_1.bin": 59707625,
+ "ggml-base-q8_0.bin": 81768585,
+ "ggml-base.bin": 147951465,
+ "ggml-small.en-q5_1.bin": 190098681,
+ "ggml-small.en-q8_0.bin": 264477561,
+ "ggml-small.en.bin": 487614201,
+ "ggml-small-q5_1.bin": 190085487,
+ "ggml-small-q8_0.bin": 264464607,
+ "ggml-small.bin": 487601967,
+ "ggml-medium.en-q5_0.bin": 539225533,
+ "ggml-medium.en-q8_0.bin": 823382461,
+ "ggml-medium.en.bin": 1533774781,
+ "ggml-medium-q5_0.bin": 539212467,
+ "ggml-medium-q8_0.bin": 823369779,
+ "ggml-medium.bin": 1533763059,
+ "ggml-large-v1.bin": 3094623691,
+ "ggml-large-v2-q5_0.bin": 1080732091,
+ "ggml-large-v2-q8_0.bin": 1656129691,
+ "ggml-large-v2.bin": 3094623691,
+ "ggml-large-v3-q5_0.bin": 1081140203,
+ "ggml-large-v3-turbo-q5_0.bin": 574041195,
+ "ggml-large-v3-turbo-q8_0.bin": 874188075,
+ "ggml-large-v3-turbo.bin": 1624555275,
+ "ggml-large-v3.bin": 3095033483,
+}
+
+function isWhisperModelID(value: unknown): value is WhisperModelID {
+ return typeof value === "string" && (WHISPER_MODELS as readonly string[]).includes(value)
+}
+
+function isEnglishOnlyWhisperModel(modelID: WhisperModelID): boolean {
+ return modelID.includes(".en")
+}
+
+function isTranscriptionMode(value: unknown): value is TranscriptionMode {
+ return value === "bulk" || value === "realtime"
+}
+
+function formatWhisperModelSize(bytes: number): string {
+ const mib = bytes / (1024 * 1024)
+ if (mib >= 1024) {
+ return `${(mib / 1024).toFixed(1)} GB`
+ }
+
+ return `${Math.round(mib)} MB`
+}
+
+function cleanTranscriptText(text: string): string {
+ return text.replace(/[ \t]+$/gm, "").trimEnd()
+}
+
+function cleanSessionText(text: string): string {
+ return cleanTranscriptText(text).trimStart()
+}
+
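+// Transcript sessions are separated by blank lines; re-clean each one so whitespace never accumulates.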
+function normalizeTranscriptSessions(text: string): string {
+ const cleaned = cleanTranscriptText(text)
+ if (!cleaned) {
+ return ""
+ }
+
+ return cleaned
+ .split(/\n\n+/)
+ .map((session) => cleanSessionText(session))
+ .filter((session) => session.length > 0)
+ .join("\n\n")
+}
+
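+// Append a realtime slice to the running session text, attaching leading punctuation without a space.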
+function mergeTranscriptChunk(previous: string, chunk: string): string {
+ const cleanPrevious = cleanTranscriptText(previous)
+ const cleanChunk = cleanSessionText(chunk)
+
+ if (!cleanChunk) {
+ return cleanPrevious
+ }
+
+ if (!cleanPrevious) {
+ return cleanChunk
+ }
+
+ if (/^[,.;:!?)]/.test(cleanChunk)) {
+ return `${cleanPrevious}${cleanChunk}`
+ }
+
+ return `${cleanPrevious} ${cleanChunk}`
+}
type ServerItem = {
id: string
@@ -135,6 +317,11 @@ type SavedState = {
activeSessionId: string | null
}
+type WhisperSavedState = {
+ defaultModel: WhisperModelID
+ mode: TranscriptionMode
+}
+
type Cam = {
CameraView: (typeof import("expo-camera"))["CameraView"]
requestCameraPermissionsAsync: (typeof import("expo-camera"))["Camera"]["requestCameraPermissionsAsync"]
@@ -245,12 +432,16 @@ function fromSaved(input: SavedState): {
export default function DictationScreen() {
const [camera, setCamera] = useState<Cam | null>(null)
- const [modelReset, setModelReset] = useState(false)
- const model = useSpeechToText({
- model: WHISPER_BASE_EN,
- preventLoad: modelReset,
- })
-
+ const [defaultWhisperModel, setDefaultWhisperModel] = useState<WhisperModelID>(DEFAULT_WHISPER_MODEL)
+ const [activeWhisperModel, setActiveWhisperModel] = useState<WhisperModelID | null>(null)
+ const [installedWhisperModels, setInstalledWhisperModels] = useState<WhisperModelID[]>([])
+ const [whisperSettingsOpen, setWhisperSettingsOpen] = useState(false)
+ const [downloadingModelID, setDownloadingModelID] = useState<WhisperModelID | null>(null)
+ const [downloadProgress, setDownloadProgress] = useState(0)
+ const [isPreparingWhisperModel, setIsPreparingWhisperModel] = useState(true)
+ const [transcriptionMode, setTranscriptionMode] = useState<TranscriptionMode>(DEFAULT_TRANSCRIPTION_MODE)
+ const [isTranscribingBulk, setIsTranscribingBulk] = useState(false)
+ const [whisperError, setWhisperError] = useState("")
const [transcribedText, setTranscribedText] = useState("")
const [isRecording, setIsRecording] = useState(false)
const [permissionGranted, setPermissionGranted] = useState(false)
@@ -283,21 +474,23 @@ export default function DictationScreen() {
const pressInTimeRef = useRef(0)
const accumulatedRef = useRef("")
const baseTextRef = useRef("")
- // Keep a ref to model so audio callbacks always use the latest hook closure
- const modelRef = useRef(model)
- modelRef.current = model
- const prewarmPromiseRef = useRef<Promise<void> | null>(null)
- const hasPrewarmedRef = useRef(false)
+ const whisperContextRef = useRef<WhisperContext | null>(null)
+ const whisperContextModelRef = useRef<WhisperModelID | null>(null)
+ const whisperTranscriberRef = useRef<RealtimeTranscriber | null>(null)
+ const bulkAudioStreamRef = useRef<AudioPcmStreamAdapter | null>(null)
+ const bulkAudioChunksRef = useRef<Uint8Array[]>([])
+ const bulkTranscriptionJobRef = useRef(0)
+ const downloadProgressRef = useRef(0)
+ const waveformPulseIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null)
const sendSettleTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null)
const foregroundMonitorAbortRef = useRef(null)
const monitorJobRef = useRef(null)
const previousPushTokenRef = useRef(null)
const scanLockRef = useRef(false)
const restoredRef = useRef(false)
+ const whisperRestoredRef = useRef(false)
const refreshSeqRef = useRef<Record<string, number>>({})
- const [recorder] = useState(() => new AudioRecorder())
-
useEffect(() => {
serversRef.current = servers
}, [servers])
@@ -338,12 +531,38 @@ export default function DictationScreen() {
monitorJobRef.current = monitorJob
}, [monitorJob])
- const ensureAudioRoute = useCallback(async () => {
- await AudioManager.setAudioSessionActivity(true)
- const devices = await AudioManager.getDevicesInfo()
- if (devices.currentInputs.length === 0 && devices.availableInputs.length > 0) {
- await AudioManager.setInputDevice(devices.availableInputs[0].id)
+ const modelPath = useCallback((modelID: WhisperModelID) => `${WHISPER_MODELS_DIR}/${modelID}`, [])
+
+ const refreshInstalledWhisperModels = useCallback(async () => {
+ const next: WhisperModelID[] = []
+
+ for (const modelID of WHISPER_MODELS) {
+ try {
+ const info = await FileSystem.getInfoAsync(modelPath(modelID))
+ if (info.exists) {
+ next.push(modelID)
+ }
+ } catch {
+ // Ignore model metadata read errors.
+ }
}
+
+ setInstalledWhisperModels(next)
+ return next
+ }, [modelPath])
+
+ const stopWaveformPulse = useCallback(() => {
+ if (waveformPulseIntervalRef.current) {
+ clearInterval(waveformPulseIntervalRef.current)
+ waveformPulseIntervalRef.current = null
+ }
+ }, [])
+
+ const clearWaveform = useCallback(() => {
+ const cleared = new Array(waveformLevelsRef.current.length).fill(0)
+ waveformLevelsRef.current = cleared
+ setWaveformLevels(cleared)
+ setWaveformTick(Date.now())
}, [])
useEffect(() => {
@@ -351,23 +570,9 @@ export default function DictationScreen() {
if (sendSettleTimeoutRef.current) {
clearTimeout(sendSettleTimeoutRef.current)
}
+ stopWaveformPulse()
}
- }, [])
-
- // Warm up the model once after load to reduce first-utterance latency.
- useEffect(() => {
- if (!model.isReady || hasPrewarmedRef.current) return
- hasPrewarmedRef.current = true
- prewarmPromiseRef.current = (async () => {
- try {
- await modelRef.current.transcribe(new Float32Array(SAMPLE_RATE / 2), {
- verbose: false,
- })
- } catch {
- // Prewarm best-effort only.
- }
- })()
- }, [model.isReady])
+ }, [stopWaveformPulse])
// Set up audio session and request permissions on mount
useEffect(() => {
@@ -411,6 +616,215 @@ export default function DictationScreen() {
})()
}, [])
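+ // (Re)load a whisper.cpp context, releasing any previous one first; GPU decoding is enabled on iOS.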
+ const loadWhisperContext = useCallback(
+ async (modelID: WhisperModelID) => {
+ if (whisperContextRef.current && whisperContextModelRef.current === modelID) {
+ setActiveWhisperModel(modelID)
+ return whisperContextRef.current
+ }
+
+ setIsPreparingWhisperModel(true)
+ setWhisperError("")
+
+ try {
+ const existing = whisperContextRef.current
+ whisperContextRef.current = null
+ whisperContextModelRef.current = null
+ if (existing) {
+ await existing.release().catch(() => {})
+ }
+
+ const context = await initWhisper({
+ filePath: modelPath(modelID),
+ useGpu: Platform.OS === "ios",
+ })
+
+ whisperContextRef.current = context
+ whisperContextModelRef.current = modelID
+ setActiveWhisperModel(modelID)
+ return context
+ } catch (error) {
+ const message = error instanceof Error ? error.message : "Failed to load Whisper model"
+ setWhisperError(message)
+ throw error
+ } finally {
+ setIsPreparingWhisperModel(false)
+ }
+ },
+ [modelPath],
+ )
+
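+ // Resumable download from the Hugging Face repo; progress is kept monotonic so the meter never moves backwards.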
+ const downloadWhisperModel = useCallback(
+ async (modelID: WhisperModelID) => {
+ if (downloadingModelID && downloadingModelID !== modelID) {
+ return false
+ }
+
+ setDownloadingModelID(modelID)
+ downloadProgressRef.current = 0
+ setDownloadProgress(0)
+ setWhisperError("")
+
+ try {
+ await FileSystem.makeDirectoryAsync(WHISPER_MODELS_DIR, { intermediates: true }).catch(() => {})
+
+ const targetPath = modelPath(modelID)
+ await FileSystem.deleteAsync(targetPath, { idempotent: true }).catch(() => {})
+
+ const download = FileSystem.createDownloadResumable(
+ `${WHISPER_REPO}/${modelID}`,
+ targetPath,
+ {},
+ (event: FileSystem.DownloadProgressData) => {
+ const total = event.totalBytesExpectedToWrite
+ if (!total) return
+ const rawProgress = Math.max(0, Math.min(1, event.totalBytesWritten / total))
+ const progress = Math.max(downloadProgressRef.current, rawProgress)
+ downloadProgressRef.current = progress
+ setDownloadProgress(progress)
+ },
+ )
+
+ const result = await download.downloadAsync()
+ if (!result?.uri) {
+ throw new Error("Whisper model download did not complete")
+ }
+
+ await refreshInstalledWhisperModels()
+ return true
+ } catch (error) {
+ const message = error instanceof Error ? error.message : "Failed to download Whisper model"
+ setWhisperError(message)
+ return false
+ } finally {
+ setDownloadingModelID((current) => (current === modelID ? null : current))
+ }
+ },
+ [downloadingModelID, modelPath, refreshInstalledWhisperModels],
+ )
+
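+ // Download the model file on demand if it is missing, then load it into a context.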
+ const ensureWhisperModelReady = useCallback(
+ async (modelID: WhisperModelID) => {
+ const info = await FileSystem.getInfoAsync(modelPath(modelID))
+ if (!info.exists) {
+ const downloaded = await downloadWhisperModel(modelID)
+ if (!downloaded) {
+ throw new Error(`Unable to download ${modelID}`)
+ }
+ }
+ return loadWhisperContext(modelID)
+ },
+ [downloadWhisperModel, loadWhisperContext, modelPath],
+ )
+
+ useEffect(() => {
+ let mounted = true
+
+ ;(async () => {
+ await FileSystem.makeDirectoryAsync(WHISPER_MODELS_DIR, { intermediates: true }).catch(() => {})
+
+ let nextDefaultModel: WhisperModelID = DEFAULT_WHISPER_MODEL
+ let nextMode: TranscriptionMode = DEFAULT_TRANSCRIPTION_MODE
+ try {
+ const data = await FileSystem.readAsStringAsync(WHISPER_SETTINGS_FILE)
+ if (data) {
+ const parsed = JSON.parse(data) as Partial<WhisperSavedState>
+ if (isWhisperModelID(parsed.defaultModel)) {
+ nextDefaultModel = parsed.defaultModel
+ }
+ if (isTranscriptionMode(parsed.mode)) {
+ nextMode = parsed.mode
+ }
+ }
+ } catch {
+ // Use default settings if state file is missing or invalid.
+ }
+
+ if (!mounted) return
+
+ whisperRestoredRef.current = true
+ setDefaultWhisperModel(nextDefaultModel)
+ setTranscriptionMode(nextMode)
+
+ await refreshInstalledWhisperModels()
+
+ try {
+ await ensureWhisperModelReady(nextDefaultModel)
+ } catch (error) {
+ console.error("[Whisper] Failed to initialize default model:", error)
+ } finally {
+ if (mounted) {
+ setIsPreparingWhisperModel(false)
+ }
+ }
+ })()
+
+ return () => {
+ mounted = false
+ }
+ }, [ensureWhisperModelReady, refreshInstalledWhisperModels])
+
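+ // Persist settings only after the initial restore has finished so defaults never clobber saved state.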
+ useEffect(() => {
+ if (!whisperRestoredRef.current) return
+ const payload: WhisperSavedState = { defaultModel: defaultWhisperModel, mode: transcriptionMode }
+ FileSystem.writeAsStringAsync(WHISPER_SETTINGS_FILE, JSON.stringify(payload)).catch(() => {})
+ }, [defaultWhisperModel, transcriptionMode])
+
+ useEffect(() => {
+ return () => {
+ const transcriber = whisperTranscriberRef.current
+ whisperTranscriberRef.current = null
+ if (transcriber) {
+ void (async () => {
+ await transcriber.stop().catch(() => {})
+ await transcriber.release().catch(() => {})
+ })()
+ }
+
+ const bulkStream = bulkAudioStreamRef.current
+ bulkAudioStreamRef.current = null
+ if (bulkStream) {
+ void (async () => {
+ await bulkStream.stop().catch(() => {})
+ await bulkStream.release().catch(() => {})
+ })()
+ }
+
+ const context = whisperContextRef.current
+ whisperContextRef.current = null
+ whisperContextModelRef.current = null
+
+ if (context) {
+ context.release().catch(() => {})
+ }
+
+ releaseAllWhisper().catch(() => {})
+ }
+ }, [])
+
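+ // Animate the level meter with decaying random values while recording (no raw-sample tap on this path).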
+ const startWaveformPulse = useCallback(() => {
+ if (waveformPulseIntervalRef.current) return
+
+ waveformPulseIntervalRef.current = setInterval(() => {
+ if (!isRecordingRef.current) return
+
+ const next = waveformLevelsRef.current.map((value) => {
+ const decay = value * 0.45
+ const lift = Math.random() * 0.95
+ return Math.max(0.08, Math.min(1, decay + lift * 0.55))
+ })
+
+ waveformLevelsRef.current = next
+
+ const now = Date.now()
+ if (now - lastWaveformCommitRef.current > 45) {
+ setWaveformLevels(next)
+ setWaveformTick(now)
+ lastWaveformCommitRef.current = now
+ }
+ }, 70)
+ }, [])
+
useEffect(() => {
const sub = AppState.addEventListener("change", (nextState) => {
setAppState(nextState)
@@ -463,192 +877,296 @@ export default function DictationScreen() {
return () => notificationSub.remove()
}, [])
- const startRecording = useCallback(async () => {
- const m = modelRef.current
- if (!m.isReady || isRecordingRef.current || isStartingRef.current) return
-
- isStartingRef.current = true
- const sessionId = Date.now()
- activeSessionRef.current = sessionId
- accumulatedRef.current = ""
- baseTextRef.current = transcribedText
- isRecordingRef.current = true
- setIsRecording(true)
- const cancelled = () => !isRecordingRef.current || activeSessionRef.current !== sessionId
-
- // If prewarm is still running, wait once here to avoid ModelGenerating race.
- if (prewarmPromiseRef.current) {
- await prewarmPromiseRef.current
- prewarmPromiseRef.current = null
- }
- if (cancelled()) {
- isStartingRef.current = false
- return
- }
-
- try {
- await ensureAudioRoute()
- } catch (e) {
- console.warn("[Dictation] Failed to ensure audio route:", e)
- }
- if (cancelled()) {
- isStartingRef.current = false
- return
- }
-
- recorder.onError((err) => {
- console.error("[Dictation] Recorder error:", err.message)
- if (activeSessionRef.current !== sessionId) return
- isRecordingRef.current = false
- activeSessionRef.current = 0
- setIsRecording(false)
- recorder.clearOnAudioReady()
- recorder.clearOnError()
- modelRef.current.streamStop()
- })
-
- const readyResult = recorder.onAudioReady(
- {
- sampleRate: SAMPLE_RATE,
- bufferLength: AUDIO_BUFFER_SECONDS * SAMPLE_RATE,
- channelCount: 1,
- },
- (chunk) => {
- if (activeSessionRef.current !== sessionId) return
- const samples = chunk.buffer.getChannelData(0)
- if (!samples || samples.length === 0) return
-
- // Defensive guard against invalid chunk data coming from unstable audio routes.
- let valid = true
- for (let i = 0; i < samples.length; i += 32) {
- if (!Number.isFinite(samples[i])) {
- valid = false
- break
- }
- }
- if (!valid) return
-
- const columns = waveformLevelsRef.current.length
- const segmentLength = Math.max(1, Math.floor(samples.length / Math.max(columns, 1)))
- const next = new Array(columns).fill(0)
-
- for (let b = 0; b < columns; b++) {
- const start = b * segmentLength
- const end = Math.min(samples.length, start + segmentLength)
-
- let sum = 0
- for (let i = start; i < end; i++) {
- const s = samples[i]
- sum += s * s
- }
-
- const rms = Math.sqrt(sum / Math.max(end - start, 1))
- const base = Math.min(1, rms * 10)
- const previous = waveformLevelsRef.current[b] ?? 0
- // Fast rise, slower decay for more natural meter behavior
- next[b] = base > previous ? base : previous * 0.82
- }
-
- waveformLevelsRef.current = next
- const now = Date.now()
- if (now - lastWaveformCommitRef.current > 45) {
- setWaveformLevels(next)
- setWaveformTick(now)
- lastWaveformCommitRef.current = now
- }
-
- // Always use the latest model ref to avoid stale closure.
- modelRef.current.streamInsert(samples)
- },
- )
-
- if (readyResult.status === "error") {
- console.error("[Dictation] onAudioReady failed:", readyResult.message)
- isRecordingRef.current = false
- activeSessionRef.current = 0
- setIsRecording(false)
- recorder.clearOnAudioReady()
- recorder.clearOnError()
- isStartingRef.current = false
- return
- }
- if (cancelled()) {
- recorder.clearOnAudioReady()
- recorder.clearOnError()
- modelRef.current.streamStop()
- isStartingRef.current = false
- return
- }
-
- // Start stream first, then begin feeding chunks from recorder.
- const streamIter = modelRef.current.stream({ verbose: false })
- let sawTextInSession = false
- const streamTask = (async () => {
- for await (const { committed, nonCommitted } of streamIter) {
- if (!isRecordingRef.current) break
-
- if (committed.text) {
- accumulatedRef.current += committed.text
- }
-
- if (committed.text || nonCommitted.text) {
- sawTextInSession = true
- }
-
- const base = baseTextRef.current
- const separator = base.length > 0 ? "\n\n" : ""
- // Whisper can emit a leading-space token at the start of each session.
- const sessionText = (accumulatedRef.current + nonCommitted.text).replace(/^\s+/, "")
- setTranscribedText(base + separator + sessionText)
- }
- })()
-
- const startResult = recorder.start()
- if (startResult.status === "error") {
- console.error("[Dictation] Recorder start failed:", startResult.message)
- modelRef.current.streamStop()
- isRecordingRef.current = false
- activeSessionRef.current = 0
- setIsRecording(false)
- recorder.clearOnAudioReady()
- recorder.clearOnError()
- isStartingRef.current = false
- return
- }
- isStartingRef.current = false
-
- try {
- await streamTask
- if (sawTextInSession) {
- setHasCompletedSession(true)
- }
- } catch (error) {
- console.error("[Dictation] Streaming error:", error)
- }
- }, [ensureAudioRoute, recorder, transcribedText])
-
- const stopRecording = useCallback(() => {
- if (!isRecordingRef.current) return
-
+ const finalizeRecordingState = useCallback(() => {
isRecordingRef.current = false
activeSessionRef.current = 0
isStartingRef.current = false
setIsRecording(false)
+ stopWaveformPulse()
+ clearWaveform()
+ }, [clearWaveform, stopWaveformPulse])
+
+ const startRecording = useCallback(async () => {
+ if (isRecordingRef.current || isStartingRef.current || downloadingModelID || isTranscribingBulk) return
+
+ isStartingRef.current = true
+ const sessionID = Date.now()
+ activeSessionRef.current = sessionID
+ accumulatedRef.current = ""
+ baseTextRef.current = normalizeTranscriptSessions(transcribedText)
+ if (baseTextRef.current !== transcribedText) {
+ setTranscribedText(baseTextRef.current)
+ }
+ isRecordingRef.current = true
+ setIsRecording(true)
+ setWhisperError("")
+
+ const cancelled = () => !isRecordingRef.current || activeSessionRef.current !== sessionID
+
+ try {
+ const context = await ensureWhisperModelReady(defaultWhisperModel)
+ if (cancelled()) {
+ isStartingRef.current = false
+ return
+ }
+
+ const previousTranscriber = whisperTranscriberRef.current
+ whisperTranscriberRef.current = null
+ if (previousTranscriber) {
+ await previousTranscriber.stop().catch(() => {})
+ await previousTranscriber.release().catch(() => {})
+ }
+
+ const previousBulkStream = bulkAudioStreamRef.current
+ bulkAudioStreamRef.current = null
+ if (previousBulkStream) {
+ await previousBulkStream.stop().catch(() => {})
+ await previousBulkStream.release().catch(() => {})
+ }
+
+ bulkAudioChunksRef.current = []
+ bulkTranscriptionJobRef.current = 0
+
+ startWaveformPulse()
+
+ const englishOnlyModel = isEnglishOnlyWhisperModel(defaultWhisperModel)
+
+ if (transcriptionMode === "bulk") {
+ const audioStream = new AudioPcmStreamAdapter()
+ audioStream.onData((packet: unknown) => {
+ if (activeSessionRef.current !== sessionID) return
+ const data = (packet as { data?: unknown }).data
+ if (!(data instanceof Uint8Array) || data.length === 0) return
+ bulkAudioChunksRef.current.push(new Uint8Array(data))
+ })
+ audioStream.onError((error: string) => {
+ if (activeSessionRef.current !== sessionID) return
+ setWhisperError(error)
+ console.error("[Dictation] Bulk audio stream error:", error)
+ })
+
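+ // 16 kHz mono 16-bit PCM matches whisper.cpp's expected input; audioSource 6 is Android's VOICE_RECOGNITION.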
+ await audioStream.initialize({
+ sampleRate: 16000,
+ channels: 1,
+ bitsPerSample: 16,
+ bufferSize: 16 * 1024,
+ audioSource: 6,
+ })
+ await audioStream.start()
+
+ bulkAudioStreamRef.current = audioStream
+
+ if (cancelled()) {
+ await audioStream.stop().catch(() => {})
+ await audioStream.release().catch(() => {})
+ if (bulkAudioStreamRef.current === audioStream) {
+ bulkAudioStreamRef.current = null
+ }
+ finalizeRecordingState()
+ return
+ }
+
+ isStartingRef.current = false
+ return
+ }
+
+ const transcriber = new RealtimeTranscriber(
+ {
+ whisperContext: context,
+ audioStream: new AudioPcmStreamAdapter(),
+ },
+ {
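+ // Transcribe roughly every 4 s of audio, waiting for at least 0.8 s, keeping at most 6 slices in memory.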
+ audioSliceSec: 4,
+ audioMinSec: 0.8,
+ maxSlicesInMemory: 6,
+ transcribeOptions: {
+ language: englishOnlyModel ? "en" : "auto",
+ translate: !englishOnlyModel,
+ maxLen: 1,
+ },
+ logger: () => {},
+ },
+ {
+ onTranscribe: (event: RealtimeTranscribeEvent) => {
+ if (activeSessionRef.current !== sessionID) return
+ if (event.type !== "transcribe") return
+
+ const nextSessionText = mergeTranscriptChunk(accumulatedRef.current, event.data?.result ?? "")
+ accumulatedRef.current = nextSessionText
+
+ const base = normalizeTranscriptSessions(baseTextRef.current)
+ const separator = base.length > 0 && nextSessionText.length > 0 ? "\n\n" : ""
+ setTranscribedText(normalizeTranscriptSessions(base + separator + nextSessionText))
+
+ if (nextSessionText.length > 0) {
+ setHasCompletedSession(true)
+ }
+ },
+ onError: (error: string) => {
+ if (activeSessionRef.current !== sessionID) return
+ console.error("[Dictation] Whisper realtime error:", error)
+ setWhisperError(error)
+ },
+ onStatusChange: (active: boolean) => {
+ if (activeSessionRef.current !== sessionID) return
+ if (!active) {
+ if (whisperTranscriberRef.current === transcriber) {
+ whisperTranscriberRef.current = null
+ }
+ finalizeRecordingState()
+ }
+ },
+ },
+ )
+
+ whisperTranscriberRef.current = transcriber
+ await transcriber.start()
+
+ if (cancelled()) {
+ await transcriber.stop().catch(() => {})
+ await transcriber.release().catch(() => {})
+ if (whisperTranscriberRef.current === transcriber) {
+ whisperTranscriberRef.current = null
+ }
+ finalizeRecordingState()
+ return
+ }
+
+ isStartingRef.current = false
+ } catch (error) {
+ console.error("[Dictation] Failed to start realtime transcription:", error)
+ const message = error instanceof Error ? error.message : "Unable to start transcription"
+ setWhisperError(message)
+
+ const activeTranscriber = whisperTranscriberRef.current
+ whisperTranscriberRef.current = null
+ if (activeTranscriber) {
+ void (async () => {
+ await activeTranscriber.stop().catch(() => {})
+ await activeTranscriber.release().catch(() => {})
+ })()
+ }
+
+ finalizeRecordingState()
+ Haptics.notificationAsync(Haptics.NotificationFeedbackType.Error).catch(() => {})
+ }
+ }, [
+ defaultWhisperModel,
+ downloadingModelID,
+ ensureWhisperModelReady,
+ finalizeRecordingState,
+ isTranscribingBulk,
+ startWaveformPulse,
+ transcriptionMode,
+ transcribedText,
+ ])
+
+ const stopRecording = useCallback(() => {
+ if (!isRecordingRef.current && !isStartingRef.current) return
+
Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Light).catch(() => {})
- recorder.stop()
- recorder.clearOnAudioReady()
- recorder.clearOnError()
- modelRef.current.streamStop()
- const cleared = new Array(waveformLevelsRef.current.length).fill(0)
- waveformLevelsRef.current = cleared
- setWaveformLevels(cleared)
- setWaveformTick(Date.now())
- }, [recorder])
+
+ const baseAtStop = normalizeTranscriptSessions(baseTextRef.current)
+ const englishOnlyModel = isEnglishOnlyWhisperModel(defaultWhisperModel)
+
+ const transcriber = whisperTranscriberRef.current
+ whisperTranscriberRef.current = null
+ if (transcriber) {
+ void (async () => {
+ await transcriber.stop().catch((error: unknown) => {
+ console.warn("[Dictation] Failed to stop realtime transcription:", error)
+ })
+ await transcriber.release().catch(() => {})
+ })()
+ }
+
+ const bulkStream = bulkAudioStreamRef.current
+ bulkAudioStreamRef.current = null
+ const bulkChunks = bulkAudioChunksRef.current
+ bulkAudioChunksRef.current = []
+
+ finalizeRecordingState()
+
+ if (transcriptionMode !== "bulk") {
+ return
+ }
+
+ const runID = Date.now()
+ bulkTranscriptionJobRef.current = runID
+
+ void (async () => {
+ if (bulkStream) {
+ await bulkStream.stop().catch((error: unknown) => {
+ console.warn("[Dictation] Failed to stop bulk audio stream:", error)
+ })
+ await bulkStream.release().catch(() => {})
+ }
+
+ if (bulkChunks.length === 0) {
+ return
+ }
+
+ const totalLength = bulkChunks.reduce((sum, chunk) => sum + chunk.length, 0)
+ if (totalLength === 0) {
+ return
+ }
+
+ const merged = new Uint8Array(totalLength)
+ let offset = 0
+ for (const chunk of bulkChunks) {
+ merged.set(chunk, offset)
+ offset += chunk.length
+ }
+
+ const context = whisperContextRef.current
+ if (!context) {
+ setWhisperError("Whisper model is not loaded")
+ return
+ }
+
+ setIsTranscribingBulk(true)
+
+ try {
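+ // One-shot pass over the full recording; multilingual models auto-detect the language and translate to English.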
+ const { promise } = context.transcribeData(merged.buffer, {
+ language: englishOnlyModel ? "en" : "auto",
+ translate: !englishOnlyModel,
+ maxLen: 1,
+ })
+
+ const result = await promise
+ if (bulkTranscriptionJobRef.current !== runID) {
+ return
+ }
+
+ const sessionText = cleanSessionText(result.result ?? "")
+ if (!sessionText) {
+ return
+ }
+
+ const separator = baseAtStop.length > 0 ? "\n\n" : ""
+ setTranscribedText(normalizeTranscriptSessions(baseAtStop + separator + sessionText))
+ setHasCompletedSession(true)
+ } catch (error) {
+ if (bulkTranscriptionJobRef.current !== runID) {
+ return
+ }
+ const message = error instanceof Error ? error.message : "Bulk transcription failed"
+ setWhisperError(message)
+ console.error("[Dictation] Bulk transcription failed:", error)
+ } finally {
+ if (bulkTranscriptionJobRef.current === runID) {
+ setIsTranscribingBulk(false)
+ }
+ }
+ })()
+ }, [defaultWhisperModel, finalizeRecordingState, transcriptionMode])
const clearIconRotation = useSharedValue(0)
const sendOutProgress = useSharedValue(0)
const handleClearTranscript = useCallback(() => {
+ Haptics.selectionAsync().catch(() => {})
+
clearIconRotation.value = withSequence(
withTiming(-30, { duration: 90 }),
withTiming(30, { duration: 120 }),
@@ -662,42 +1180,10 @@ export default function DictationScreen() {
baseTextRef.current = ""
setTranscribedText("")
setHasCompletedSession(false)
- const cleared = new Array(waveformLevelsRef.current.length).fill(0)
- waveformLevelsRef.current = cleared
- setWaveformLevels(cleared)
- setWaveformTick(Date.now())
+ clearWaveform()
sendOutProgress.value = 0
setIsSending(false)
- }, [clearIconRotation, sendOutProgress, stopRecording])
-
- const handleDeleteModel = useCallback(async () => {
- if (modelReset) return
-
- if (isRecordingRef.current) {
- stopRecording()
- }
-
- setModelReset(true)
- accumulatedRef.current = ""
- baseTextRef.current = ""
- setTranscribedText("")
- setHasCompletedSession(false)
- const cleared = new Array(waveformLevelsRef.current.length).fill(0)
- waveformLevelsRef.current = cleared
- setWaveformLevels(cleared)
- setWaveformTick(Date.now())
- sendOutProgress.value = 0
- setIsSending(false)
- Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Medium).catch(() => {})
-
- try {
- await ExpoResourceFetcher.deleteResources(WHISPER_BASE_EN.modelSource, WHISPER_BASE_EN.tokenizerSource)
- } catch (err) {
- console.error("Failed to delete model resources:", err)
- }
-
- setModelReset(false)
- }, [modelReset, sendOutProgress, stopRecording])
+ }, [clearIconRotation, clearWaveform, sendOutProgress, stopRecording])
const resetTranscriptState = useCallback(() => {
if (isRecordingRef.current) {
@@ -707,11 +1193,91 @@ export default function DictationScreen() {
baseTextRef.current = ""
setTranscribedText("")
setHasCompletedSession(false)
- const cleared = new Array(waveformLevelsRef.current.length).fill(0)
- waveformLevelsRef.current = cleared
- setWaveformLevels(cleared)
- setWaveformTick(Date.now())
- }, [stopRecording])
+ clearWaveform()
+ }, [clearWaveform, stopRecording])
+
+ const handleOpenWhisperSettings = useCallback(() => {
+ Haptics.selectionAsync().catch(() => {})
+ setDropdownMode("none")
+ setWhisperSettingsOpen(true)
+ }, [])
+
+ const handleDownloadWhisperModel = useCallback(
+ async (modelID: WhisperModelID) => {
+ const ok = await downloadWhisperModel(modelID)
+ if (ok) {
+ Haptics.selectionAsync().catch(() => {})
+ }
+ },
+ [downloadWhisperModel],
+ )
+
+ const handleSelectWhisperModel = useCallback(
+ async (modelID: WhisperModelID) => {
+ if (isRecordingRef.current || isStartingRef.current) {
+ stopRecording()
+ }
+
+ try {
+ await ensureWhisperModelReady(modelID)
+ setDefaultWhisperModel(modelID)
+ setWhisperError("")
+ Haptics.selectionAsync().catch(() => {})
+ } catch (error) {
+ const message = error instanceof Error ? error.message : "Unable to switch Whisper model"
+ setWhisperError(message)
+ }
+ },
+ [ensureWhisperModelReady, stopRecording],
+ )
+
+ const handleDeleteWhisperModel = useCallback(
+ async (modelID: WhisperModelID) => {
+ if (downloadingModelID === modelID) return
+
+ if (isRecordingRef.current || isStartingRef.current) {
+ stopRecording()
+ }
+
+ if (whisperContextModelRef.current === modelID && whisperContextRef.current) {
+ const activeContext = whisperContextRef.current
+ whisperContextRef.current = null
+ whisperContextModelRef.current = null
+ setActiveWhisperModel(null)
+ await activeContext.release().catch(() => {})
+ }
+
+ await FileSystem.deleteAsync(modelPath(modelID), { idempotent: true }).catch(() => {})
+ const nextInstalled = await refreshInstalledWhisperModels()
+
+ if (defaultWhisperModel === modelID) {
+ const fallbackModel = nextInstalled[0] ?? DEFAULT_WHISPER_MODEL
+ setDefaultWhisperModel(fallbackModel)
+ try {
+ await ensureWhisperModelReady(fallbackModel)
+ } catch {
+ // Keep UI responsive if fallback init fails.
+ }
+ } else if (activeWhisperModel == null && nextInstalled.includes(defaultWhisperModel)) {
+ try {
+ await ensureWhisperModelReady(defaultWhisperModel)
+ } catch {
+ // Keep UI responsive if default model init fails.
+ }
+ }
+
+ Haptics.selectionAsync().catch(() => {})
+ },
+ [
+ activeWhisperModel,
+ defaultWhisperModel,
+ downloadingModelID,
+ ensureWhisperModelReady,
+ modelPath,
+ refreshInstalledWhisperModels,
+ stopRecording,
+ ],
+ )
const completeSend = useCallback(() => {
if (sendSettleTimeoutRef.current) {
@@ -960,10 +1526,13 @@ export default function DictationScreen() {
}
}, [stopRecording])
- const modelLoading = !model.isReady
- const prog = model.downloadProgress > 1 ? model.downloadProgress / 100 : model.downloadProgress
- const load = Math.max(0, Math.min(1, Number.isFinite(prog) ? prog : 0))
- const pct = Math.round(load * 100)
+ const modelDownloading = downloadingModelID !== null
+ const modelLoading = isPreparingWhisperModel || activeWhisperModel == null || modelDownloading || isTranscribingBulk
+ const modelLoadingState = modelDownloading ? "downloading" : modelLoading ? "loading" : "ready"
+ const pct = Math.round(Math.max(0, Math.min(1, downloadProgress)) * 100)
+ const loadingModelLabel = downloadingModelID
+ ? WHISPER_MODEL_LABELS[downloadingModelID]
+ : WHISPER_MODEL_LABELS[defaultWhisperModel]
const hasTranscript = transcribedText.trim().length > 0
const shouldShowSend = hasCompletedSession && hasTranscript
const activeServer = servers.find((s) => s.id === activeServerId) ?? null
@@ -995,12 +1564,12 @@ export default function DictationScreen() {
}, [isRecording, recordingProgress])
useEffect(() => {
- const isGenerating = isRecording || model.isGenerating
+ const isGenerating = isRecording
waveformVisibility.value = withTiming(isGenerating ? 1 : 0, {
duration: isGenerating ? 180 : 240,
easing: Easing.inOut(Easing.quad),
})
- }, [isRecording, model.isGenerating, waveformVisibility])
+ }, [isRecording, waveformVisibility])
useEffect(() => {
serverMenuProgress.value = withTiming(isDropdownOpen ? 1 : 0, {
@@ -1736,14 +2305,16 @@ export default function DictationScreen() {
<Pressable
- onPress={() => {
- void handleDeleteModel()
- }}
+ onPress={handleOpenWhisperSettings}
style={({ pressed }) => [styles.clearButton, pressed && styles.clearButtonPressed]}
hitSlop={8}
- disabled={modelLoading || modelReset}
>
- DL
+
) : null}
+ {whisperError ? (
+
+ {whisperError}
+
+ ) : null}
+
- {modelLoading ? (
+ {isTranscribingBulk ? (
+
+
+
+ ) : modelLoadingState !== "ready" ? (
<>
-
+
- {`Downloading model ${pct}%`}
+
+ {modelLoadingState === "downloading"
+ ? `Downloading ${loadingModelLabel} ${pct}%`
+ : `Loading ${loadingModelLabel}`}
+
>
) : (
@@ -1831,6 +2422,162 @@ export default function DictationScreen() {
+ onRequestClose={() => setWhisperSettingsOpen(false)}
+ >
+
+
+
+ Whisper models
+ Default: {WHISPER_MODEL_LABELS[defaultWhisperModel]}
+
+ onPress={() => setWhisperSettingsOpen(false)}>
+ Done
+
+
+
+
+ Transcription
+
+ setTranscriptionMode("bulk")}
+ disabled={isRecording || isTranscribingBulk}
+ style={({ pressed }) => [
+ styles.settingsModeButton,
+ transcriptionMode === "bulk" && styles.settingsModeButtonActive,
+ (isRecording || isTranscribingBulk) && styles.settingsInlinePressableDisabled,
+ pressed && styles.clearButtonPressed,
+ ]}
+ >
+
+ On Release
+
+
+
+ setTranscriptionMode("realtime")}
+ disabled={isRecording || isTranscribingBulk}
+ style={({ pressed }) => [
+ styles.settingsModeButton,
+ transcriptionMode === "realtime" && styles.settingsModeButtonActive,
+ (isRecording || isTranscribingBulk) && styles.settingsInlinePressableDisabled,
+ pressed && styles.clearButtonPressed,
+ ]}
+ >
+
+ Realtime
+
+
+
+
+
+
+ {WHISPER_MODELS.map((modelID) => {
+ const installed = installedWhisperModels.includes(modelID)
+ const isDefault = defaultWhisperModel === modelID
+ const isDownloading = downloadingModelID === modelID
+ const actionDisabled = (downloadingModelID !== null && !isDownloading) || isTranscribingBulk
+ const rowLabel = isDefault ? `${modelID} · default` : modelID
+ const actionIcon = isDownloading ? "…" : installed ? "✓" : "↓"
+ const downloadPct = Math.round(Math.max(0, Math.min(1, downloadProgress)) * 100)
+ const actionLabel = isDownloading
+ ? "Downloading"
+ : installed
+ ? isDefault
+ ? "Selected"
+ : "Select"
+ : "Download"
+ const sizeLabel = formatWhisperModelSize(WHISPER_MODEL_SIZES[modelID])
+
+ return (
+
+ {
+ if (installed) {
+ void handleSelectWhisperModel(modelID)
+ }
+ }}
+ onLongPress={() => {
+ if (!installed || isDownloading) return
+ Alert.alert("Delete model?", `Remove ${modelID} from this device?`, [
+ { text: "Cancel", style: "cancel" },
+ {
+ text: "Delete",
+ style: "destructive",
+ onPress: () => {
+ void handleDeleteWhisperModel(modelID)
+ },
+ },
+ ])
+ }}
+ delayLongPress={350}
+ disabled={!installed || actionDisabled || isPreparingWhisperModel}
+ style={({ pressed }) => [
+ styles.settingsInlineLabelPressable,
+ (!installed || actionDisabled || isPreparingWhisperModel) &&
+ styles.settingsInlinePressableDisabled,
+ pressed && styles.clearButtonPressed,
+ ]}
+ >
+ {rowLabel}
+
+
+ {sizeLabel}
+
+ {
+ if (isDownloading) return
+ if (installed) {
+ void handleSelectWhisperModel(modelID)
+ return
+ }
+ void handleDownloadWhisperModel(modelID)
+ }}
+ disabled={actionDisabled || (installed && isPreparingWhisperModel)}
+ accessibilityLabel={actionLabel}
+ style={({ pressed }) => [
+ styles.settingsInlineIconButton,
+ (actionDisabled || (installed && isPreparingWhisperModel)) &&
+ styles.settingsInlinePressableDisabled,
+ pressed && styles.clearButtonPressed,
+ ]}
+ >
+ {isDownloading ? (
+
+ {downloadPct}
+
+ ) : (
+
+ {actionIcon}
+
+ )}
+
+
+ )
+ })}
+
+
+
+
+ transcribeRealtime(options?: TranscribeRealtimeOptions): Promise<{
+ stop: () => Promise<void>
+ subscribe: (callback: (event: TranscribeRealtimeEvent) => void) => void
+ }>
+ transcribeData(
+ data: ArrayBuffer,
+ options?: TranscribeOptions,
+ ): {
+ stop: () => Promise<void>
+ promise: Promise<TranscribeResult>
+ }
+ release(): Promise<void>
+ }
+
+ export type ContextOptions = {
+ filePath: string | number
+ useGpu?: boolean
+ useCoreMLIos?: boolean
+ useFlashAttn?: boolean
+ }
+
+ export function initWhisper(options: ContextOptions): Promise<WhisperContext>
+ export function releaseAllWhisper(): Promise<void>
+}
+
+declare module "whisper.rn/realtime-transcription/index" {
+ import type { TranscribeOptions, TranscribeResult, WhisperContext } from "whisper.rn"
+
+ export type RealtimeTranscribeEvent = {
+ type: "start" | "transcribe" | "end" | "error"
+ sliceIndex: number
+ data?: TranscribeResult
+ isCapturing: boolean
+ processTime: number
+ recordingTime: number
+ }
+
+ export type RealtimeOptions = {
+ audioSliceSec?: number
+ audioMinSec?: number
+ maxSlicesInMemory?: number
+ transcribeOptions?: TranscribeOptions
+ logger?: (message: string) => void
+ }
+
+ export type RealtimeTranscriberCallbacks = {
+ onTranscribe?: (event: RealtimeTranscribeEvent) => void
+ onError?: (error: string) => void
+ onStatusChange?: (isActive: boolean) => void
+ }
+
+ export type RealtimeTranscriberDependencies = {
+ whisperContext: WhisperContext
+ audioStream: unknown
+ vadContext?: unknown
+ fs?: unknown
+ }
+
+ export class RealtimeTranscriber {
+ constructor(
+ dependencies: RealtimeTranscriberDependencies,
+ options?: RealtimeOptions,
+ callbacks?: RealtimeTranscriberCallbacks,
+ )
+ start(): Promise<void>
+ stop(): Promise<void>
+ release(): Promise<void>
+ updateCallbacks(callbacks: Partial<RealtimeTranscriberCallbacks>): void
+ }
+}
+
+declare module "whisper.rn/realtime-transcription" {
+ export * from "whisper.rn/realtime-transcription/index"
+}
+
+declare module "whisper.rn/src/realtime-transcription" {
+ export * from "whisper.rn/realtime-transcription/index"
+}
+
+declare module "whisper.rn/realtime-transcription/adapters/AudioPcmStreamAdapter" {
+ export class AudioPcmStreamAdapter {
+ initialize(config: Record<string, unknown>): Promise<void>
+ start(): Promise<void>
+ stop(): Promise<void>
+ isRecording(): boolean
+ onData(callback: (data: unknown) => void): void
+ onError(callback: (error: string) => void): void
+ onStatusChange(callback: (isRecording: boolean) => void): void
+ release(): Promise<void>
+ }
+}
+
+declare module "whisper.rn/src/realtime-transcription/adapters/AudioPcmStreamAdapter" {
+ export * from "whisper.rn/realtime-transcription/adapters/AudioPcmStreamAdapter"
+}
diff --git a/packages/opencode/src/server/push-relay.ts b/packages/opencode/src/server/push-relay.ts
index d0b7698129..d4c5eecb3a 100644
--- a/packages/opencode/src/server/push-relay.ts
+++ b/packages/opencode/src/server/push-relay.ts
@@ -139,8 +139,8 @@ async function notify(input: { type: Type; sessionID: string }): Promise
const session = await Session.get(sessionID)
out.title = session.title
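+ // Prefer the newest assistant text for the notification body, remembering the newest user text as a fallback.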
+ let latestUser: string | undefined
for await (const msg of MessageV2.stream(sessionID)) {
- if (msg.info.role !== "user") continue
const body = msg.parts
.map((part) => {
if (part.type !== "text") return ""
@@ -151,8 +151,19 @@ async function notify(input: { type: Type; sessionID: string }): Promise
.join(" ")
const next = words(body)
if (!next) continue
- out.body = next
- break
+
+ if (msg.info.role === "assistant") {
+ out.body = next
+ break
+ }
+
+ if (!latestUser && msg.info.role === "user") {
+ latestUser = next
+ }
+ }
+
+ if (!out.body) {
+ out.body = latestUser
}
} catch (error) {
log.info("notification metadata unavailable", {