diff --git a/bun.lock b/bun.lock
index df16de854a..79ff4ee161 100644
--- a/bun.lock
+++ b/bun.lock
@@ -317,6 +317,7 @@
       "name": "mobile-voice",
       "version": "1.0.0",
       "dependencies": {
+        "@fugood/react-native-audio-pcm-stream": "1.1.4",
         "@react-navigation/bottom-tabs": "^7.15.5",
         "@react-navigation/elements": "^2.9.10",
         "@react-navigation/native": "^7.1.33",
@@ -345,14 +346,13 @@
         "react-dom": "19.2.0",
         "react-native": "0.83.4",
         "react-native-audio-api": "^0.11.7",
-        "react-native-executorch": "^0.8.0",
-        "react-native-executorch-expo-resource-fetcher": "^0.8.0",
         "react-native-gesture-handler": "~2.30.0",
         "react-native-reanimated": "4.2.1",
         "react-native-safe-area-context": "~5.6.2",
         "react-native-screens": "~4.23.0",
         "react-native-web": "~0.21.0",
         "react-native-worklets": "0.7.2",
+        "whisper.rn": "0.5.5",
       },
       "devDependencies": {
         "@types/react": "~19.2.2",
@@ -1378,6 +1378,8 @@
     "@fontsource/inter": ["@fontsource/inter@5.2.8", "", {}, "sha512-P6r5WnJoKiNVV+zvW2xM13gNdFhAEpQ9dQJHt3naLvfg+LkF2ldgSLiF4T41lf1SQCM9QmkqPTn4TH568IRagg=="],

+    "@fugood/react-native-audio-pcm-stream": ["@fugood/react-native-audio-pcm-stream@1.1.4", "", {}, "sha512-M6H6ay4ea0vpioII9T/C9qXFPeGpxGN24nl0REP2/wtsorZXg3zzHjZbf3UUUwjf6lEEHMlGCJfXUsxwC/vV8w=="],
+
     "@graphql-typed-document-node/core": ["@graphql-typed-document-node/core@3.2.0", "", { "peerDependencies": { "graphql": "^0.8.0 || ^0.9.0 || ^0.10.0 || ^0.11.0 || ^0.12.0 || ^0.13.0 || ^14.0.0 || ^15.0.0 || ^16.0.0 || ^17.0.0" } }, "sha512-mB9oAsNCm9aM3/SOv4YtBMqZbYj10R7dkq8byBqxGY/ncFwhf2oQzMV+LCRlWoDSEBJ3COiR1yeDvMtsoOsuFQ=="],

     "@happy-dom/global-registrator": ["@happy-dom/global-registrator@20.0.11", "", { "dependencies": { "@types/node": "^20.0.0", "happy-dom": "^20.0.11" } }, "sha512-GqNqiShBT/lzkHTMC/slKBrvN0DsD4Di8ssBk4aDaVgEn+2WMzE6DXxq701ndSXj7/0cJ8mNT71pM7Bnrr6JRw=="],

@@ -1396,8 +1398,6 @@
     "@hono/zod-validator": ["@hono/zod-validator@0.4.2", "", { "peerDependencies": { "hono": ">=3.9.0", "zod": "^3.19.1" } }, "sha512-1rrlBg+EpDPhzOV4hT9pxr5+xDVmKuz6YJl+la7VCwK6ass5ldyKm5fD+umJdV2zhHD6jROoCCv8NbTwyfhT0g=="],

-    "@huggingface/jinja": ["@huggingface/jinja@0.5.6", "", {}, "sha512-MyMWyLnjqo+KRJYSH7oWNbsOn5onuIvfXYPcc0WOGxU0eHUV7oAYUoQTl2BMdu7ml+ea/bu11UM+EshbeHwtIA=="],
-
     "@ibm/plex": ["@ibm/plex@6.4.1", "", { "dependencies": { "@ibm/telemetry-js": "^1.5.1" } }, "sha512-fnsipQywHt3zWvsnlyYKMikcVI7E2fEwpiPnIHFqlbByXVfQfANAAeJk1IV4mNnxhppUIDlhU0TzwYwL++Rn2g=="],

     "@ibm/telemetry-js": ["@ibm/telemetry-js@1.11.0", "", { "bin": { "ibmtelemetry": "dist/collect.js" } }, "sha512-RO/9j+URJnSfseWg9ZkEX9p+a3Ousd33DBU7rOafoZB08RqdzxFVYJ2/iM50dkBuD0o7WX7GYt1sLbNgCoE+pA=="],

@@ -3918,10 +3918,6 @@
     "jsonfile": ["jsonfile@4.0.0", "", { "optionalDependencies": { "graceful-fs": "^4.1.6" } }, "sha512-m6F1R3z8jjlf2imQHS2Qez5sjKWQzbuuhuJ/FKYFRZvPE3PuHcSMVZzfsLhGVOkfd20obL5SWEBew5ShlquNxg=="],

-    "jsonrepair": ["jsonrepair@3.13.3", "", { "bin": { "jsonrepair": "bin/cli.js" } }, "sha512-BTznj0owIt2CBAH/LTo7+1I5pMvl1e1033LRl/HUowlZmJOIhzC0zbX5bxMngLkfT4WnzPP26QnW5wMr2g9tsQ=="],
-
-    "jsonschema": ["jsonschema@1.5.0", "", {}, "sha512-K+A9hhqbn0f3pJX17Q/7H6yQfD/5OXgdrR5UE12gMXCiN9D5Xq2o5mddV2QEcX/bjla99ASsAAQUyMCCRWAEhw=="],
-
     "jsonwebtoken": ["jsonwebtoken@9.0.3", "", { "dependencies": { "jws": "^4.0.1", "lodash.includes": "^4.3.0", "lodash.isboolean": "^3.0.3", "lodash.isinteger": "^4.0.4", "lodash.isnumber": "^3.0.3", "lodash.isplainobject": "^4.0.6", "lodash.isstring": "^4.0.1", "lodash.once": "^4.0.0", "ms": "^2.1.1", "semver": "^7.5.4" } }, "sha512-MT/xP0CrubFRNLNKvxJ2BYfy53Zkm++5bX9dtuPbqAeQpTVe0MQTFhao8+Cp//EmJp244xt6Drw/GVEGCUj40g=="],

     "jwa": ["jwa@2.0.1", "", { "dependencies": { "buffer-equal-constant-time": "^1.0.1", "ecdsa-sig-formatter": "1.0.11", "safe-buffer": "^5.0.1" } }, "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg=="],

@@ -4536,7 +4532,7 @@
     "plist": ["plist@3.1.0", "", { "dependencies": { "@xmldom/xmldom": "^0.8.8", "base64-js": "^1.5.1", "xmlbuilder": "^15.1.1" } }, "sha512-uysumyrvkUX0rX/dEVqt8gC3sTBzd4zoWfLeS29nb53imdaXVvLINYXTI2GNqzaMuvacNx4uJQ8+b3zXR0pkgQ=="],

-    "pngjs": ["pngjs@7.0.0", "", {}, "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow=="],
+    "pngjs": ["pngjs@5.0.0", "", {}, "sha512-40QW5YalBNfQo5yRYmiw7Yz6TKKVr3h6970B2YE+3fQpsWcrbj1PzJgxeJ19DRQjhMbKPIuMY8rFaXc8moolVw=="],

     "poe-oauth": ["poe-oauth@0.0.3", "", {}, "sha512-KgxDylcuq/mov8URSplrBGjrIjkQwjN/Ml8BhqaGsAvHzYN3yhuROdv1sDRfwqncg7TT8XzJvMeJAWmv/4NDLw=="],

@@ -4650,10 +4646,6 @@
     "react-native-audio-api": ["react-native-audio-api@0.11.7", "", { "dependencies": { "semver": "^7.7.3" }, "peerDependencies": { "react": "*", "react-native": "*" }, "bin": { "setup-rn-audio-api-web": "scripts/setup-rn-audio-api-web.js" } }, "sha512-2oIoP77Tn2nlouRVfEC3bAsuSyKU6xhGNkSnVXTLLQQZslEDoYX2cN9pVRZoWOqhFrLT8q4IZI9HaFgYL13L1A=="],

-    "react-native-executorch": ["react-native-executorch@0.8.0", "", { "dependencies": { "@huggingface/jinja": "^0.5.0", "jsonrepair": "^3.12.0", "jsonschema": "^1.5.0", "pngjs": "^7.0.0", "zod": "^4.3.6" }, "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-9zRiJiCSTOYbES4htuk+yqkhgec/i4L1E63ZYgJ1AHkDbvHyoYLH3KkKjjzxDw7NYJCCOx+6vj9l9JrodoCbzg=="],
-
-    "react-native-executorch-expo-resource-fetcher": ["react-native-executorch-expo-resource-fetcher@0.8.0", "", { "peerDependencies": { "expo": ">=54.0.0", "expo-asset": ">=12.0.0", "expo-file-system": ">=19.0.0", "react-native": "*", "react-native-executorch": "*" } }, "sha512-vdAne2FBL0nCQ2c2yHTSt8Uttm0Klmo/K7tirSVlKxgVtli4cmsfl+UpR5giaNtlRZ3ImMAMXNW34j0fItmRfQ=="],
-
     "react-native-gesture-handler": ["react-native-gesture-handler@2.30.1", "", { "dependencies": { "@egjs/hammerjs": "^2.0.17", "hoist-non-react-statics": "^3.3.0", "invariant": "^2.2.4" }, "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-xIUBDo5ktmJs++0fZlavQNvDEE4PsihWhSeJsJtoz4Q6p0MiTM9TgrTgfEgzRR36qGPytFoeq+ShLrVwGdpUdA=="],

     "react-native-is-edge-to-edge": ["react-native-is-edge-to-edge@1.3.1", "", { "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-NIXU/iT5+ORyCc7p0z2nnlkouYKX425vuU1OEm6bMMtWWR9yvb+Xg5AZmImTKoF9abxCPqrKC3rOZsKzUYgYZA=="],

@@ -5446,6 +5438,8 @@
     "which-typed-array": ["which-typed-array@1.1.20", "", { "dependencies": { "available-typed-arrays": "^1.0.7", "call-bind": "^1.0.8", "call-bound": "^1.0.4", "for-each": "^0.3.5", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-tostringtag": "^1.0.2" } }, "sha512-LYfpUkmqwl0h9A2HL09Mms427Q1RZWuOHsukfVcKRq9q95iQxdw0ix1JQrqbcDR9PH1QDwf5Qo8OZb5lksZ8Xg=="],

+    "whisper.rn": ["whisper.rn@0.5.5", "", { "dependencies": { "safe-buffer": "^5.2.1" }, "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-awFE+ImMtRdGhA+hjm3GEwnSvyEVP1sdhMb+MyCa5bVdoOCpaxrwVwXDo9U46Qwkhwml3PCFaauTsGmRkTyhdw=="],
+
     "why-is-node-running": ["why-is-node-running@3.2.2", "", { "bin": { "why-is-node-running": "cli.js" } }, "sha512-NKUzAelcoCXhXL4dJzKIwXeR8iEVqsA0Lq6Vnd0UXvgaKbzVo4ZTHROF2Jidrv+SgxOQ03fMinnNhzZATxOD3A=="],
["widest-line@5.0.0", "", { "dependencies": { "string-width": "^7.0.0" } }, "sha512-c9bZp7b5YtRj2wOe6dlj32MK+Bx/M/d+9VB2SHM1OtsUHR0aV0tdP6DWh/iMt0kWi1t5g1Iudu6hQRNd1A4PVA=="], @@ -5846,6 +5840,8 @@ "@jimp/core/mime": ["mime@3.0.0", "", { "bin": { "mime": "cli.js" } }, "sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A=="], + "@jimp/js-png/pngjs": ["pngjs@7.0.0", "", {}, "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow=="], + "@jimp/plugin-blit/zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="], "@jimp/plugin-circle/zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="], @@ -6556,8 +6552,6 @@ "proper-lockfile/signal-exit": ["signal-exit@3.0.7", "", {}, "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="], - "qrcode/pngjs": ["pngjs@5.0.0", "", {}, "sha512-40QW5YalBNfQo5yRYmiw7Yz6TKKVr3h6970B2YE+3fQpsWcrbj1PzJgxeJ19DRQjhMbKPIuMY8rFaXc8moolVw=="], - "qrcode/yargs": ["yargs@15.4.1", "", { "dependencies": { "cliui": "^6.0.0", "decamelize": "^1.2.0", "find-up": "^4.1.0", "get-caller-file": "^2.0.1", "require-directory": "^2.1.1", "require-main-filename": "^2.0.0", "set-blocking": "^2.0.0", "string-width": "^4.2.0", "which-module": "^2.0.0", "y18n": "^4.0.0", "yargs-parser": "^18.1.2" } }, "sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A=="], "raw-body/iconv-lite": ["iconv-lite@0.4.24", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3" } }, "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA=="], @@ -6582,8 +6576,6 @@ "react-native/yargs": ["yargs@17.7.2", "", { "dependencies": { "cliui": "^8.0.1", "escalade": "^3.1.1", "get-caller-file": "^2.0.5", "require-directory": "^2.1.1", "string-width": "^4.2.3", "y18n": "^5.0.5", "yargs-parser": "^21.1.1" } }, "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w=="], - "react-native-executorch/zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], - "react-native-reanimated/react-native-is-edge-to-edge": ["react-native-is-edge-to-edge@1.2.1", "", { "peerDependencies": { "react": "*", "react-native": "*" } }, "sha512-FLbPWl/MyYQWz+KwqOZsSyj2JmLKglHatd3xLZWskXOpRaio4LfEDEz8E/A6uD8QoTHW6Aobw1jbEwK7KMgR7Q=="], "react-native-reanimated/semver": ["semver@7.7.3", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q=="], diff --git a/packages/mobile-voice/app.json b/packages/mobile-voice/app.json index 43a9be652a..66782dd6d8 100644 --- a/packages/mobile-voice/app.json +++ b/packages/mobile-voice/app.json @@ -1,6 +1,6 @@ { "expo": { - "name": "mobile-voice", + "name": "Control", "slug": "mobile-voice", "version": "1.0.0", "orientation": "portrait", @@ -10,6 +10,9 @@ "ios": { "icon": "./assets/images/icon.png", "bundleIdentifier": "com.anomalyco.mobilevoice", + "entitlements": { + "com.apple.developer.kernel.extended-virtual-addressing": true + }, "infoPlist": { "NSMicrophoneUsageDescription": "This app needs microphone access for live speech-to-text dictation.", "NSAppTransportSecurity": { diff --git a/packages/mobile-voice/package.json b/packages/mobile-voice/package.json index 
index 42e70e3b64..d4c68fa7be 100644
--- a/packages/mobile-voice/package.json
+++ b/packages/mobile-voice/package.json
@@ -13,6 +13,7 @@
     "lint": "expo lint"
   },
   "dependencies": {
+    "@fugood/react-native-audio-pcm-stream": "1.1.4",
    "@react-navigation/bottom-tabs": "^7.15.5",
    "@react-navigation/elements": "^2.9.10",
    "@react-navigation/native": "^7.1.33",
@@ -41,14 +42,13 @@
    "react-dom": "19.2.0",
    "react-native": "0.83.4",
    "react-native-audio-api": "^0.11.7",
-    "react-native-executorch": "^0.8.0",
-    "react-native-executorch-expo-resource-fetcher": "^0.8.0",
    "react-native-gesture-handler": "~2.30.0",
    "react-native-reanimated": "4.2.1",
    "react-native-safe-area-context": "~5.6.2",
    "react-native-screens": "~4.23.0",
    "react-native-web": "~0.21.0",
-    "react-native-worklets": "0.7.2"
+    "react-native-worklets": "0.7.2",
+    "whisper.rn": "0.5.5"
   },
   "devDependencies": {
     "@types/react": "~19.2.2",
diff --git a/packages/mobile-voice/src/app/_layout.tsx b/packages/mobile-voice/src/app/_layout.tsx
index a2fa275dc0..67571e0b94 100644
--- a/packages/mobile-voice/src/app/_layout.tsx
+++ b/packages/mobile-voice/src/app/_layout.tsx
@@ -1,25 +1,20 @@
-import React from 'react';
-import { Slot } from 'expo-router';
-import { LogBox } from 'react-native';
-import { initExecutorch } from 'react-native-executorch';
-import { ExpoResourceFetcher } from 'react-native-executorch-expo-resource-fetcher';
+import React from "react"
+import { Slot } from "expo-router"
+import { LogBox } from "react-native"
 import {
   configureNotificationBehavior,
   registerBackgroundNotificationTask,
-} from '@/notifications/monitoring-notifications';
-
-// Initialize the ExecuTorch resource fetcher before any model hooks run
-initExecutorch({ resourceFetcher: ExpoResourceFetcher });
+} from "@/notifications/monitoring-notifications"

 // Suppress known non-actionable warnings from third-party libs.
 LogBox.ignoreLogs([
-  'RecordingNotificationManager is not implemented on iOS',
-  '[React Native ExecuTorch] No content-length header',
-]);
+  "RecordingNotificationManager is not implemented on iOS",
+  "`transcribeRealtime` is deprecated, use `RealtimeTranscriber` instead",
+])

-configureNotificationBehavior();
-registerBackgroundNotificationTask().catch(() => {});
+configureNotificationBehavior()
+registerBackgroundNotificationTask().catch(() => {})

 export default function RootLayout() {
-  return <Slot />;
+  return <Slot />
 }
diff --git a/packages/mobile-voice/src/app/index.tsx b/packages/mobile-voice/src/app/index.tsx
index 88c50ceeaf..b7083ce88a 100644
--- a/packages/mobile-voice/src/app/index.tsx
+++ b/packages/mobile-voice/src/app/index.tsx
@@ -7,6 +7,7 @@ import {
   ScrollView,
   Modal,
   Alert,
+  ActivityIndicator,
   LayoutChangeEvent,
   AppState,
   AppStateStatus,
@@ -25,11 +26,13 @@ import Animated, {
 } from "react-native-reanimated"
 import { SafeAreaView } from "react-native-safe-area-context"
 import { StatusBar } from "expo-status-bar"
+import { SymbolView } from "expo-symbols"
 import * as Haptics from "expo-haptics"
 import { useAudioPlayer } from "expo-audio"
-import { useSpeechToText, WHISPER_BASE_EN } from "react-native-executorch"
-import { ExpoResourceFetcher } from "react-native-executorch-expo-resource-fetcher"
-import { AudioManager, AudioRecorder } from "react-native-audio-api"
+import { initWhisper, releaseAllWhisper, type WhisperContext } from "whisper.rn"
+import { RealtimeTranscriber, type RealtimeTranscribeEvent } from "whisper.rn/src/realtime-transcription"
+import { AudioPcmStreamAdapter } from "whisper.rn/src/realtime-transcription/adapters/AudioPcmStreamAdapter"
+import { AudioManager } from "react-native-audio-api"
 import * as Notifications from "expo-notifications"
 import * as FileSystem from "expo-file-system/legacy"
 import Constants from "expo-constants"
@@ -49,8 +52,6 @@ import {
   onPushTokenChange,
 } from "@/notifications/monitoring-notifications"

-const SAMPLE_RATE = 16000
-const AUDIO_BUFFER_SECONDS = 0.02
 const CONTROL_HEIGHT = 86
 const SEND_SETTLE_MS = 240
 const WAVEFORM_ROWS = 5
@@ -61,6 +62,187 @@ const DROPDOWN_VISIBLE_ROWS = 6
 const TAP_THRESHOLD_MS = 300
 const DEFAULT_RELAY_URL = "https://apn.dev.opencode.ai"
 const SERVER_STATE_FILE = `${FileSystem.documentDirectory}mobile-voice-servers.json`
+const WHISPER_SETTINGS_FILE = `${FileSystem.documentDirectory}mobile-voice-whisper-settings.json`
+const WHISPER_MODELS_DIR = `${FileSystem.documentDirectory}whisper-models`
+const WHISPER_REPO = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main"
+const WHISPER_MODELS = [
+  "ggml-tiny.en-q5_1.bin",
+  "ggml-tiny.en-q8_0.bin",
+  "ggml-tiny.en.bin",
+  "ggml-tiny-q5_1.bin",
+  "ggml-tiny-q8_0.bin",
+  "ggml-tiny.bin",
+  "ggml-base.en-q5_1.bin",
+  "ggml-base.en-q8_0.bin",
+  "ggml-base.en.bin",
+  "ggml-base-q5_1.bin",
+  "ggml-base-q8_0.bin",
+  "ggml-base.bin",
+  "ggml-small.en-q5_1.bin",
+  "ggml-small.en-q8_0.bin",
+  "ggml-small.en.bin",
+  "ggml-small-q5_1.bin",
+  "ggml-small-q8_0.bin",
+  "ggml-small.bin",
+  "ggml-medium.en-q5_0.bin",
+  "ggml-medium.en-q8_0.bin",
+  "ggml-medium.en.bin",
+  "ggml-medium-q5_0.bin",
+  "ggml-medium-q8_0.bin",
+  "ggml-medium.bin",
+  "ggml-large-v1.bin",
+  "ggml-large-v2-q5_0.bin",
+  "ggml-large-v2-q8_0.bin",
+  "ggml-large-v2.bin",
+  "ggml-large-v3-q5_0.bin",
+  "ggml-large-v3-turbo-q5_0.bin",
+  "ggml-large-v3-turbo-q8_0.bin",
+  "ggml-large-v3-turbo.bin",
+  "ggml-large-v3.bin",
+] as const
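+// Model filenames mirror the ggml builds published under WHISPER_REPO on
+// Hugging Face; each name also doubles as the on-disk cache key under
+// WHISPER_MODELS_DIR (see modelPath below), so renaming an entry would orphan
+// any previously downloaded copy.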
TranscriptionMode = "bulk" | "realtime" +const DEFAULT_WHISPER_MODEL: WhisperModelID = "ggml-medium.bin" +const DEFAULT_TRANSCRIPTION_MODE: TranscriptionMode = "bulk" + +const WHISPER_MODEL_LABELS: Record = { + "ggml-tiny.en-q5_1.bin": "tiny.en q5_1", + "ggml-tiny.en-q8_0.bin": "tiny.en q8_0", + "ggml-tiny.en.bin": "tiny.en", + "ggml-tiny-q5_1.bin": "tiny q5_1", + "ggml-tiny-q8_0.bin": "tiny q8_0", + "ggml-tiny.bin": "tiny", + "ggml-base.en-q5_1.bin": "base.en q5_1", + "ggml-base.en-q8_0.bin": "base.en q8_0", + "ggml-base.en.bin": "base.en", + "ggml-base-q5_1.bin": "base q5_1", + "ggml-base-q8_0.bin": "base q8_0", + "ggml-base.bin": "base", + "ggml-small.en-q5_1.bin": "small.en q5_1", + "ggml-small.en-q8_0.bin": "small.en q8_0", + "ggml-small.en.bin": "small.en", + "ggml-small-q5_1.bin": "small q5_1", + "ggml-small-q8_0.bin": "small q8_0", + "ggml-small.bin": "small", + "ggml-medium.en-q5_0.bin": "medium.en q5_0", + "ggml-medium.en-q8_0.bin": "medium.en q8_0", + "ggml-medium.en.bin": "medium.en", + "ggml-medium-q5_0.bin": "medium q5_0", + "ggml-medium-q8_0.bin": "medium q8_0", + "ggml-medium.bin": "medium", + "ggml-large-v1.bin": "large-v1", + "ggml-large-v2-q5_0.bin": "large-v2 q5_0", + "ggml-large-v2-q8_0.bin": "large-v2 q8_0", + "ggml-large-v2.bin": "large-v2", + "ggml-large-v3-q5_0.bin": "large-v3 q5_0", + "ggml-large-v3-turbo-q5_0.bin": "large-v3 turbo q5_0", + "ggml-large-v3-turbo-q8_0.bin": "large-v3 turbo q8_0", + "ggml-large-v3-turbo.bin": "large-v3 turbo", + "ggml-large-v3.bin": "large-v3", +} + +const WHISPER_MODEL_SIZES: Record = { + "ggml-tiny.en-q5_1.bin": 32166155, + "ggml-tiny.en-q8_0.bin": 43550795, + "ggml-tiny.en.bin": 77704715, + "ggml-tiny-q5_1.bin": 32152673, + "ggml-tiny-q8_0.bin": 43537433, + "ggml-tiny.bin": 77691713, + "ggml-base.en-q5_1.bin": 59721011, + "ggml-base.en-q8_0.bin": 81781811, + "ggml-base.en.bin": 147964211, + "ggml-base-q5_1.bin": 59707625, + "ggml-base-q8_0.bin": 81768585, + "ggml-base.bin": 147951465, + "ggml-small.en-q5_1.bin": 190098681, + "ggml-small.en-q8_0.bin": 264477561, + "ggml-small.en.bin": 487614201, + "ggml-small-q5_1.bin": 190085487, + "ggml-small-q8_0.bin": 264464607, + "ggml-small.bin": 487601967, + "ggml-medium.en-q5_0.bin": 539225533, + "ggml-medium.en-q8_0.bin": 823382461, + "ggml-medium.en.bin": 1533774781, + "ggml-medium-q5_0.bin": 539212467, + "ggml-medium-q8_0.bin": 823369779, + "ggml-medium.bin": 1533763059, + "ggml-large-v1.bin": 3094623691, + "ggml-large-v2-q5_0.bin": 1080732091, + "ggml-large-v2-q8_0.bin": 1656129691, + "ggml-large-v2.bin": 3094623691, + "ggml-large-v3-q5_0.bin": 1081140203, + "ggml-large-v3-turbo-q5_0.bin": 574041195, + "ggml-large-v3-turbo-q8_0.bin": 874188075, + "ggml-large-v3-turbo.bin": 1624555275, + "ggml-large-v3.bin": 3095033483, +} + +function isWhisperModelID(value: unknown): value is WhisperModelID { + return typeof value === "string" && (WHISPER_MODELS as readonly string[]).includes(value) +} + +function isEnglishOnlyWhisperModel(modelID: WhisperModelID): boolean { + return modelID.includes(".en") +} + +function isTranscriptionMode(value: unknown): value is TranscriptionMode { + return value === "bulk" || value === "realtime" +} + +function formatWhisperModelSize(bytes: number): string { + const mib = bytes / (1024 * 1024) + if (mib >= 1024) { + return `${(mib / 1024).toFixed(1)} GB` + } + + return `${Math.round(mib)} MB` +} + +function cleanTranscriptText(text: string): string { + return text.replace(/[ \t]+$/gm, "").trimEnd() +} + +function cleanSessionText(text: string): string { + return 
+
+function isWhisperModelID(value: unknown): value is WhisperModelID {
+  return typeof value === "string" && (WHISPER_MODELS as readonly string[]).includes(value)
+}
+
+function isEnglishOnlyWhisperModel(modelID: WhisperModelID): boolean {
+  return modelID.includes(".en")
+}
+
+function isTranscriptionMode(value: unknown): value is TranscriptionMode {
+  return value === "bulk" || value === "realtime"
+}
+
+function formatWhisperModelSize(bytes: number): string {
+  const mib = bytes / (1024 * 1024)
+  if (mib >= 1024) {
+    return `${(mib / 1024).toFixed(1)} GB`
+  }
+
+  return `${Math.round(mib)} MB`
+}
+
+function cleanTranscriptText(text: string): string {
+  return text.replace(/[ \t]+$/gm, "").trimEnd()
+}
+
+function cleanSessionText(text: string): string {
+  return cleanTranscriptText(text).trimStart()
+}
+
+function normalizeTranscriptSessions(text: string): string {
+  const cleaned = cleanTranscriptText(text)
+  if (!cleaned) {
+    return ""
+  }
+
+  return cleaned
+    .split(/\n\n+/)
+    .map((session) => cleanSessionText(session))
+    .filter((session) => session.length > 0)
+    .join("\n\n")
+}
+
+function mergeTranscriptChunk(previous: string, chunk: string): string {
+  const cleanPrevious = cleanTranscriptText(previous)
+  const cleanChunk = cleanSessionText(chunk)
+
+  if (!cleanChunk) {
+    return cleanPrevious
+  }
+
+  if (!cleanPrevious) {
+    return cleanChunk
+  }
+
+  const normalizedChunk = cleanChunk
+  if (!normalizedChunk) {
+    return cleanPrevious
+  }
+
+  if (/^[,.;:!?)]/.test(normalizedChunk)) {
+    return `${cleanPrevious}${normalizedChunk}`
+  }
+
+  return `${cleanPrevious} ${normalizedChunk}`
+}
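+// mergeTranscriptChunk joins realtime slices without doubling whitespace and
+// glues leading punctuation straight onto the previous text, e.g.
+//   mergeTranscriptChunk("Hello world", ", again") === "Hello world, again"
+//   mergeTranscriptChunk("Hello world", "next")    === "Hello world next"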

 type ServerItem = {
   id: string
@@ -135,6 +317,11 @@ type SavedState = {
   activeSessionId: string | null
 }

+type WhisperSavedState = {
+  defaultModel: WhisperModelID
+  mode: TranscriptionMode
+}
+
 type Cam = {
   CameraView: (typeof import("expo-camera"))["CameraView"]
   requestCameraPermissionsAsync: (typeof import("expo-camera"))["Camera"]["requestCameraPermissionsAsync"]
 }
@@ -245,12 +432,16 @@ function fromSaved(input: SavedState): {
 export default function DictationScreen() {
   const [camera, setCamera] = useState<Cam | null>(null)
-  const [modelReset, setModelReset] = useState(false)
-  const model = useSpeechToText({
-    model: WHISPER_BASE_EN,
-    preventLoad: modelReset,
-  })
-
+  const [defaultWhisperModel, setDefaultWhisperModel] = useState<WhisperModelID>(DEFAULT_WHISPER_MODEL)
+  const [activeWhisperModel, setActiveWhisperModel] = useState<WhisperModelID | null>(null)
+  const [installedWhisperModels, setInstalledWhisperModels] = useState<WhisperModelID[]>([])
+  const [whisperSettingsOpen, setWhisperSettingsOpen] = useState(false)
+  const [downloadingModelID, setDownloadingModelID] = useState<WhisperModelID | null>(null)
+  const [downloadProgress, setDownloadProgress] = useState(0)
+  const [isPreparingWhisperModel, setIsPreparingWhisperModel] = useState(true)
+  const [transcriptionMode, setTranscriptionMode] = useState<TranscriptionMode>(DEFAULT_TRANSCRIPTION_MODE)
+  const [isTranscribingBulk, setIsTranscribingBulk] = useState(false)
+  const [whisperError, setWhisperError] = useState("")
   const [transcribedText, setTranscribedText] = useState("")
   const [isRecording, setIsRecording] = useState(false)
   const [permissionGranted, setPermissionGranted] = useState(false)
@@ -283,21 +474,23 @@ export default function DictationScreen() {
   const pressInTimeRef = useRef(0)
   const accumulatedRef = useRef("")
   const baseTextRef = useRef("")
-  // Keep a ref to model so audio callbacks always use the latest hook closure
-  const modelRef = useRef(model)
-  modelRef.current = model
-  const prewarmPromiseRef = useRef<Promise<void> | null>(null)
-  const hasPrewarmedRef = useRef(false)
+  const whisperContextRef = useRef<WhisperContext | null>(null)
+  const whisperContextModelRef = useRef<WhisperModelID | null>(null)
+  const whisperTranscriberRef = useRef<RealtimeTranscriber | null>(null)
+  const bulkAudioStreamRef = useRef<AudioPcmStreamAdapter | null>(null)
+  const bulkAudioChunksRef = useRef<Uint8Array[]>([])
+  const bulkTranscriptionJobRef = useRef(0)
+  const downloadProgressRef = useRef(0)
+  const waveformPulseIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null)
   const sendSettleTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null)
   const foregroundMonitorAbortRef = useRef(null)
   const monitorJobRef = useRef(null)
   const previousPushTokenRef = useRef(null)
   const scanLockRef = useRef(false)
   const restoredRef = useRef(false)
+  const whisperRestoredRef = useRef(false)
   const refreshSeqRef = useRef<Record<string, number>>({})
-  const [recorder] = useState(() => new AudioRecorder())
-
   useEffect(() => {
     serversRef.current = servers
   }, [servers])
@@ -338,12 +531,38 @@ export default function DictationScreen() {
     monitorJobRef.current = monitorJob
   }, [monitorJob])

-  const ensureAudioRoute = useCallback(async () => {
-    await AudioManager.setAudioSessionActivity(true)
-    const devices = await AudioManager.getDevicesInfo()
-    if (devices.currentInputs.length === 0 && devices.availableInputs.length > 0) {
-      await AudioManager.setInputDevice(devices.availableInputs[0].id)
+  const modelPath = useCallback((modelID: WhisperModelID) => `${WHISPER_MODELS_DIR}/${modelID}`, [])
+
+  const refreshInstalledWhisperModels = useCallback(async () => {
+    const next: WhisperModelID[] = []
+
+    for (const modelID of WHISPER_MODELS) {
+      try {
+        const info = await FileSystem.getInfoAsync(modelPath(modelID))
+        if (info.exists) {
+          next.push(modelID)
+        }
+      } catch {
+        // Ignore model metadata read errors.
+      }
     }
+
+    setInstalledWhisperModels(next)
+    return next
+  }, [modelPath])
+
+  const stopWaveformPulse = useCallback(() => {
+    if (waveformPulseIntervalRef.current) {
+      clearInterval(waveformPulseIntervalRef.current)
+      waveformPulseIntervalRef.current = null
+    }
+  }, [])
+
+  const clearWaveform = useCallback(() => {
+    const cleared = new Array(waveformLevelsRef.current.length).fill(0)
+    waveformLevelsRef.current = cleared
+    setWaveformLevels(cleared)
+    setWaveformTick(Date.now())
   }, [])

   useEffect(() => {
     return () => {
       if (sendSettleTimeoutRef.current) {
         clearTimeout(sendSettleTimeoutRef.current)
       }
+      stopWaveformPulse()
     }
-  }, [])
-
-  // Warm up the model once after load to reduce first-utterance latency.
-  useEffect(() => {
-    if (!model.isReady || hasPrewarmedRef.current) return
-    hasPrewarmedRef.current = true
-    prewarmPromiseRef.current = (async () => {
-      try {
-        await modelRef.current.transcribe(new Float32Array(SAMPLE_RATE / 2), {
-          verbose: false,
-        })
-      } catch {
-        // Prewarm best-effort only.
-      }
-    })()
-  }, [model.isReady])
+  }, [stopWaveformPulse])

   // Set up audio session and request permissions on mount
   useEffect(() => {
@@ -411,6 +616,215 @@ export default function DictationScreen() {
     })()
   }, [])

+  const loadWhisperContext = useCallback(
+    async (modelID: WhisperModelID) => {
+      if (whisperContextRef.current && whisperContextModelRef.current === modelID) {
+        setActiveWhisperModel(modelID)
+        return whisperContextRef.current
+      }
+
+      setIsPreparingWhisperModel(true)
+      setWhisperError("")
+
+      try {
+        const existing = whisperContextRef.current
+        whisperContextRef.current = null
+        whisperContextModelRef.current = null
+        if (existing) {
+          await existing.release().catch(() => {})
+        }
+
+        const context = await initWhisper({
+          filePath: modelPath(modelID),
+          useGpu: Platform.OS === "ios",
+        })
+
+        whisperContextRef.current = context
+        whisperContextModelRef.current = modelID
+        setActiveWhisperModel(modelID)
+        return context
+      } catch (error) {
error.message : "Failed to load Whisper model" + setWhisperError(message) + throw error + } finally { + setIsPreparingWhisperModel(false) + } + }, + [modelPath], + ) + + const downloadWhisperModel = useCallback( + async (modelID: WhisperModelID) => { + if (downloadingModelID && downloadingModelID !== modelID) { + return false + } + + setDownloadingModelID(modelID) + downloadProgressRef.current = 0 + setDownloadProgress(0) + setWhisperError("") + + try { + await FileSystem.makeDirectoryAsync(WHISPER_MODELS_DIR, { intermediates: true }).catch(() => {}) + + const targetPath = modelPath(modelID) + await FileSystem.deleteAsync(targetPath, { idempotent: true }).catch(() => {}) + + const download = FileSystem.createDownloadResumable( + `${WHISPER_REPO}/${modelID}`, + targetPath, + {}, + (event: FileSystem.DownloadProgressData) => { + const total = event.totalBytesExpectedToWrite + if (!total) return + const rawProgress = Math.max(0, Math.min(1, event.totalBytesWritten / total)) + const progress = Math.max(downloadProgressRef.current, rawProgress) + downloadProgressRef.current = progress + setDownloadProgress(progress) + }, + ) + + const result = await download.downloadAsync() + if (!result?.uri) { + throw new Error("Whisper model download did not complete") + } + + await refreshInstalledWhisperModels() + return true + } catch (error) { + const message = error instanceof Error ? error.message : "Failed to download Whisper model" + setWhisperError(message) + return false + } finally { + setDownloadingModelID((current) => (current === modelID ? null : current)) + } + }, + [downloadingModelID, modelPath, refreshInstalledWhisperModels], + ) + + const ensureWhisperModelReady = useCallback( + async (modelID: WhisperModelID) => { + const info = await FileSystem.getInfoAsync(modelPath(modelID)) + if (!info.exists) { + const downloaded = await downloadWhisperModel(modelID) + if (!downloaded) { + throw new Error(`Unable to download ${modelID}`) + } + } + return loadWhisperContext(modelID) + }, + [downloadWhisperModel, loadWhisperContext, modelPath], + ) + + useEffect(() => { + let mounted = true + + ;(async () => { + await FileSystem.makeDirectoryAsync(WHISPER_MODELS_DIR, { intermediates: true }).catch(() => {}) + + let nextDefaultModel: WhisperModelID = DEFAULT_WHISPER_MODEL + let nextMode: TranscriptionMode = DEFAULT_TRANSCRIPTION_MODE + try { + const data = await FileSystem.readAsStringAsync(WHISPER_SETTINGS_FILE) + if (data) { + const parsed = JSON.parse(data) as Partial + if (isWhisperModelID(parsed.defaultModel)) { + nextDefaultModel = parsed.defaultModel + } + if (isTranscriptionMode(parsed.mode)) { + nextMode = parsed.mode + } + } + } catch { + // Use default settings if state file is missing or invalid. 
+
+  const ensureWhisperModelReady = useCallback(
+    async (modelID: WhisperModelID) => {
+      const info = await FileSystem.getInfoAsync(modelPath(modelID))
+      if (!info.exists) {
+        const downloaded = await downloadWhisperModel(modelID)
+        if (!downloaded) {
+          throw new Error(`Unable to download ${modelID}`)
+        }
+      }
+      return loadWhisperContext(modelID)
+    },
+    [downloadWhisperModel, loadWhisperContext, modelPath],
+  )
+
+  useEffect(() => {
+    let mounted = true
+
+    ;(async () => {
+      await FileSystem.makeDirectoryAsync(WHISPER_MODELS_DIR, { intermediates: true }).catch(() => {})
+
+      let nextDefaultModel: WhisperModelID = DEFAULT_WHISPER_MODEL
+      let nextMode: TranscriptionMode = DEFAULT_TRANSCRIPTION_MODE
+      try {
+        const data = await FileSystem.readAsStringAsync(WHISPER_SETTINGS_FILE)
+        if (data) {
+          const parsed = JSON.parse(data) as Partial<WhisperSavedState>
+          if (isWhisperModelID(parsed.defaultModel)) {
+            nextDefaultModel = parsed.defaultModel
+          }
+          if (isTranscriptionMode(parsed.mode)) {
+            nextMode = parsed.mode
+          }
+        }
+      } catch {
+        // Use default settings if state file is missing or invalid.
+      }
+
+      if (!mounted) return
+
+      whisperRestoredRef.current = true
+      setDefaultWhisperModel(nextDefaultModel)
+      setTranscriptionMode(nextMode)
+
+      await refreshInstalledWhisperModels()
+
+      try {
+        await ensureWhisperModelReady(nextDefaultModel)
+      } catch (error) {
+        console.error("[Whisper] Failed to initialize default model:", error)
+      } finally {
+        if (mounted) {
+          setIsPreparingWhisperModel(false)
+        }
+      }
+    })()
+
+    return () => {
+      mounted = false
+    }
+  }, [ensureWhisperModelReady, refreshInstalledWhisperModels])
+
+  useEffect(() => {
+    if (!whisperRestoredRef.current) return
+    const payload: WhisperSavedState = { defaultModel: defaultWhisperModel, mode: transcriptionMode }
+    FileSystem.writeAsStringAsync(WHISPER_SETTINGS_FILE, JSON.stringify(payload)).catch(() => {})
+  }, [defaultWhisperModel, transcriptionMode])
+
+  useEffect(() => {
+    return () => {
+      const transcriber = whisperTranscriberRef.current
+      whisperTranscriberRef.current = null
+      if (transcriber) {
+        void (async () => {
+          await transcriber.stop().catch(() => {})
+          await transcriber.release().catch(() => {})
+        })()
+      }
+
+      const bulkStream = bulkAudioStreamRef.current
+      bulkAudioStreamRef.current = null
+      if (bulkStream) {
+        void (async () => {
+          await bulkStream.stop().catch(() => {})
+          await bulkStream.release().catch(() => {})
+        })()
+      }
+
+      const context = whisperContextRef.current
+      whisperContextRef.current = null
+      whisperContextModelRef.current = null
+
+      if (context) {
+        context.release().catch(() => {})
+      }
+
+      releaseAllWhisper().catch(() => {})
+    }
+  }, [])
+
+  const startWaveformPulse = useCallback(() => {
+    if (waveformPulseIntervalRef.current) return
+
+    waveformPulseIntervalRef.current = setInterval(() => {
+      if (!isRecordingRef.current) return
+
+      const next = waveformLevelsRef.current.map((value) => {
+        const decay = value * 0.45
+        const lift = Math.random() * 0.95
+        return Math.max(0.08, Math.min(1, decay + lift * 0.55))
+      })
+
+      waveformLevelsRef.current = next
+
+      const now = Date.now()
+      if (now - lastWaveformCommitRef.current > 45) {
+        setWaveformLevels(next)
+        setWaveformTick(now)
+        lastWaveformCommitRef.current = now
+      }
+    }, 70)
+  }, [])
+
   useEffect(() => {
     const sub = AppState.addEventListener("change", (nextState) => {
       setAppState(nextState)
@@ -463,192 +877,296 @@ export default function DictationScreen() {
     return () => notificationSub.remove()
   }, [])

-  const startRecording = useCallback(async () => {
-    const m = modelRef.current
-    if (!m.isReady || isRecordingRef.current || isStartingRef.current) return
-
-    isStartingRef.current = true
-    const sessionId = Date.now()
-    activeSessionRef.current = sessionId
-    accumulatedRef.current = ""
-    baseTextRef.current = transcribedText
-    isRecordingRef.current = true
-    setIsRecording(true)
-    const cancelled = () => !isRecordingRef.current || activeSessionRef.current !== sessionId
-
-    // If prewarm is still running, wait once here to avoid ModelGenerating race.
-    if (prewarmPromiseRef.current) {
-      await prewarmPromiseRef.current
-      prewarmPromiseRef.current = null
-    }
-    if (cancelled()) {
-      isStartingRef.current = false
-      return
-    }
-
-    try {
-      await ensureAudioRoute()
-    } catch (e) {
-      console.warn("[Dictation] Failed to ensure audio route:", e)
-    }
-    if (cancelled()) {
-      isStartingRef.current = false
-      return
-    }
-
-    recorder.onError((err) => {
-      console.error("[Dictation] Recorder error:", err.message)
-      if (activeSessionRef.current !== sessionId) return
-      isRecordingRef.current = false
-      activeSessionRef.current = 0
-      setIsRecording(false)
-      recorder.clearOnAudioReady()
-      recorder.clearOnError()
-      modelRef.current.streamStop()
-    })
-
-    const readyResult = recorder.onAudioReady(
-      {
-        sampleRate: SAMPLE_RATE,
-        bufferLength: AUDIO_BUFFER_SECONDS * SAMPLE_RATE,
-        channelCount: 1,
-      },
-      (chunk) => {
-        if (activeSessionRef.current !== sessionId) return
-        const samples = chunk.buffer.getChannelData(0)
-        if (!samples || samples.length === 0) return
-
-        // Defensive guard against invalid chunk data coming from unstable audio routes.
-        let valid = true
-        for (let i = 0; i < samples.length; i += 32) {
-          if (!Number.isFinite(samples[i])) {
-            valid = false
-            break
-          }
-        }
-        if (!valid) return
-
-        const columns = waveformLevelsRef.current.length
-        const segmentLength = Math.max(1, Math.floor(samples.length / Math.max(columns, 1)))
-        const next = new Array(columns).fill(0)
-
-        for (let b = 0; b < columns; b++) {
-          const start = b * segmentLength
-          const end = Math.min(samples.length, start + segmentLength)
-
-          let sum = 0
-          for (let i = start; i < end; i++) {
-            const s = samples[i]
-            sum += s * s
-          }
-
-          const rms = Math.sqrt(sum / Math.max(end - start, 1))
-          const base = Math.min(1, rms * 10)
-          const previous = waveformLevelsRef.current[b] ?? 0
-          // Fast rise, slower decay for more natural meter behavior
-          next[b] = base > previous ? base : previous * 0.82
-        }
-
-        waveformLevelsRef.current = next
-        const now = Date.now()
-        if (now - lastWaveformCommitRef.current > 45) {
-          setWaveformLevels(next)
-          setWaveformTick(now)
-          lastWaveformCommitRef.current = now
-        }
-
-        // Always use the latest model ref to avoid stale closure.
-        modelRef.current.streamInsert(samples)
-      },
-    )
-
-    if (readyResult.status === "error") {
-      console.error("[Dictation] onAudioReady failed:", readyResult.message)
-      isRecordingRef.current = false
-      activeSessionRef.current = 0
-      setIsRecording(false)
-      recorder.clearOnAudioReady()
-      recorder.clearOnError()
-      isStartingRef.current = false
-      return
-    }
-    if (cancelled()) {
-      recorder.clearOnAudioReady()
-      recorder.clearOnError()
-      modelRef.current.streamStop()
-      isStartingRef.current = false
-      return
-    }
-
-    // Start stream first, then begin feeding chunks from recorder.
-    const streamIter = modelRef.current.stream({ verbose: false })
-    let sawTextInSession = false
-    const streamTask = (async () => {
-      for await (const { committed, nonCommitted } of streamIter) {
-        if (!isRecordingRef.current) break
-
-        if (committed.text) {
-          accumulatedRef.current += committed.text
-        }
-
-        if (committed.text || nonCommitted.text) {
-          sawTextInSession = true
-        }
-
-        const base = baseTextRef.current
-        const separator = base.length > 0 ? "\n\n" : ""
-        // Whisper can emit a leading-space token at the start of each session.
-        const sessionText = (accumulatedRef.current + nonCommitted.text).replace(/^\s+/, "")
-        setTranscribedText(base + separator + sessionText)
-      }
-    })()
-
-    const startResult = recorder.start()
-    if (startResult.status === "error") {
-      console.error("[Dictation] Recorder start failed:", startResult.message)
-      modelRef.current.streamStop()
-      isRecordingRef.current = false
-      activeSessionRef.current = 0
-      setIsRecording(false)
-      recorder.clearOnAudioReady()
-      recorder.clearOnError()
-      isStartingRef.current = false
-      return
-    }
-    isStartingRef.current = false
-
-    try {
-      await streamTask
-      if (sawTextInSession) {
-        setHasCompletedSession(true)
-      }
-    } catch (error) {
-      console.error("[Dictation] Streaming error:", error)
-    }
-  }, [ensureAudioRoute, recorder, transcribedText])
-
-  const stopRecording = useCallback(() => {
-    if (!isRecordingRef.current) return
-
+  const finalizeRecordingState = useCallback(() => {
     isRecordingRef.current = false
     activeSessionRef.current = 0
     isStartingRef.current = false
     setIsRecording(false)
+    stopWaveformPulse()
+    clearWaveform()
+  }, [clearWaveform, stopWaveformPulse])
+
+  const startRecording = useCallback(async () => {
+    if (isRecordingRef.current || isStartingRef.current || downloadingModelID || isTranscribingBulk) return
+
+    isStartingRef.current = true
+    const sessionID = Date.now()
+    activeSessionRef.current = sessionID
+    accumulatedRef.current = ""
+    baseTextRef.current = normalizeTranscriptSessions(transcribedText)
+    if (baseTextRef.current !== transcribedText) {
+      setTranscribedText(baseTextRef.current)
+    }
+    isRecordingRef.current = true
+    setIsRecording(true)
+    setWhisperError("")
+
+    const cancelled = () => !isRecordingRef.current || activeSessionRef.current !== sessionID
+
+    try {
+      const context = await ensureWhisperModelReady(defaultWhisperModel)
+      if (cancelled()) {
+        isStartingRef.current = false
+        return
+      }
+
+      const previousTranscriber = whisperTranscriberRef.current
+      whisperTranscriberRef.current = null
+      if (previousTranscriber) {
+        await previousTranscriber.stop().catch(() => {})
+        await previousTranscriber.release().catch(() => {})
+      }
+
+      const previousBulkStream = bulkAudioStreamRef.current
+      bulkAudioStreamRef.current = null
+      if (previousBulkStream) {
+        await previousBulkStream.stop().catch(() => {})
+        await previousBulkStream.release().catch(() => {})
+      }
+
+      bulkAudioChunksRef.current = []
+      bulkTranscriptionJobRef.current = 0
+
+      startWaveformPulse()
+
+      const englishOnlyModel = isEnglishOnlyWhisperModel(defaultWhisperModel)
+
+      if (transcriptionMode === "bulk") {
+        const audioStream = new AudioPcmStreamAdapter()
+        audioStream.onData((packet: unknown) => {
+          if (activeSessionRef.current !== sessionID) return
+          const data = (packet as { data?: unknown }).data
+          if (!(data instanceof Uint8Array) || data.length === 0) return
+          bulkAudioChunksRef.current.push(new Uint8Array(data))
+        })
+        audioStream.onError((error: string) => {
+          if (activeSessionRef.current !== sessionID) return
+          setWhisperError(error)
+          console.error("[Dictation] Bulk audio stream error:", error)
+        })
+
+        await audioStream.initialize({
+          sampleRate: 16000,
+          channels: 1,
+          bitsPerSample: 16,
+          bufferSize: 16 * 1024,
+          audioSource: 6,
+        })
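+        // 16 kHz mono 16-bit PCM is the input format whisper.cpp expects.
+        // audioSource 6 appears to map to Android's
+        // MediaRecorder.AudioSource.VOICE_RECOGNITION (an assumption about the
+        // adapter's Android backend) and has no effect on iOS.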
+        await audioStream.start()
+
+        bulkAudioStreamRef.current = audioStream
+
+        if (cancelled()) {
+          await audioStream.stop().catch(() => {})
+          await audioStream.release().catch(() => {})
+          if (bulkAudioStreamRef.current === audioStream) {
+            bulkAudioStreamRef.current = null
+          }
+          finalizeRecordingState()
+          return
+        }
+
+        isStartingRef.current = false
+        return
+      }
+
+      const transcriber = new RealtimeTranscriber(
+        {
+          whisperContext: context,
+          audioStream: new AudioPcmStreamAdapter(),
+        },
+        {
+          audioSliceSec: 4,
+          audioMinSec: 0.8,
+          maxSlicesInMemory: 6,
+          transcribeOptions: {
+            language: englishOnlyModel ? "en" : "auto",
+            translate: !englishOnlyModel,
+            maxLen: 1,
+          },
+          logger: () => {},
+        },
+        {
+          onTranscribe: (event: RealtimeTranscribeEvent) => {
+            if (activeSessionRef.current !== sessionID) return
+            if (event.type !== "transcribe") return
+
+            const nextSessionText = mergeTranscriptChunk(accumulatedRef.current, event.data?.result ?? "")
+            accumulatedRef.current = nextSessionText
+
+            const base = normalizeTranscriptSessions(baseTextRef.current)
+            const separator = base.length > 0 && nextSessionText.length > 0 ? "\n\n" : ""
+            setTranscribedText(normalizeTranscriptSessions(base + separator + nextSessionText))
+
+            if (nextSessionText.length > 0) {
+              setHasCompletedSession(true)
+            }
+          },
+          onError: (error: string) => {
+            if (activeSessionRef.current !== sessionID) return
+            console.error("[Dictation] Whisper realtime error:", error)
+            setWhisperError(error)
+          },
+          onStatusChange: (active: boolean) => {
+            if (activeSessionRef.current !== sessionID) return
+            if (!active) {
+              if (whisperTranscriberRef.current === transcriber) {
+                whisperTranscriberRef.current = null
+              }
+              finalizeRecordingState()
+            }
+          },
+        },
+      )
+
+      whisperTranscriberRef.current = transcriber
+      await transcriber.start()
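+      // With 4 s slices and a 0.8 s minimum, committed text should land every
+      // few seconds; onStatusChange(false) is what ultimately tears the
+      // session down once the transcriber drains after stop().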
error.message : "Unable to start transcription" + setWhisperError(message) + + const activeTranscriber = whisperTranscriberRef.current + whisperTranscriberRef.current = null + if (activeTranscriber) { + void (async () => { + await activeTranscriber.stop().catch(() => {}) + await activeTranscriber.release().catch(() => {}) + })() + } + + finalizeRecordingState() + Haptics.notificationAsync(Haptics.NotificationFeedbackType.Error).catch(() => {}) + } + }, [ + defaultWhisperModel, + downloadingModelID, + ensureWhisperModelReady, + finalizeRecordingState, + isTranscribingBulk, + startWaveformPulse, + transcriptionMode, + transcribedText, + ]) + + const stopRecording = useCallback(() => { + if (!isRecordingRef.current && !isStartingRef.current) return + Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Light).catch(() => {}) - recorder.stop() - recorder.clearOnAudioReady() - recorder.clearOnError() - modelRef.current.streamStop() - const cleared = new Array(waveformLevelsRef.current.length).fill(0) - waveformLevelsRef.current = cleared - setWaveformLevels(cleared) - setWaveformTick(Date.now()) - }, [recorder]) + + const baseAtStop = normalizeTranscriptSessions(baseTextRef.current) + const englishOnlyModel = isEnglishOnlyWhisperModel(defaultWhisperModel) + + const transcriber = whisperTranscriberRef.current + whisperTranscriberRef.current = null + if (transcriber) { + void (async () => { + await transcriber.stop().catch((error: unknown) => { + console.warn("[Dictation] Failed to stop realtime transcription:", error) + }) + await transcriber.release().catch(() => {}) + })() + } + + const bulkStream = bulkAudioStreamRef.current + bulkAudioStreamRef.current = null + const bulkChunks = bulkAudioChunksRef.current + bulkAudioChunksRef.current = [] + + finalizeRecordingState() + + if (transcriptionMode !== "bulk") { + return + } + + const runID = Date.now() + bulkTranscriptionJobRef.current = runID + + void (async () => { + if (bulkStream) { + await bulkStream.stop().catch((error: unknown) => { + console.warn("[Dictation] Failed to stop bulk audio stream:", error) + }) + await bulkStream.release().catch(() => {}) + } + + if (bulkChunks.length === 0) { + return + } + + const totalLength = bulkChunks.reduce((sum, chunk) => sum + chunk.length, 0) + if (totalLength === 0) { + return + } + + const merged = new Uint8Array(totalLength) + let offset = 0 + for (const chunk of bulkChunks) { + merged.set(chunk, offset) + offset += chunk.length + } + + const context = whisperContextRef.current + if (!context) { + setWhisperError("Whisper model is not loaded") + return + } + + setIsTranscribingBulk(true) + + try { + const { promise } = context.transcribeData(merged.buffer, { + language: englishOnlyModel ? "en" : "auto", + translate: !englishOnlyModel, + maxLen: 1, + }) + + const result = await promise + if (bulkTranscriptionJobRef.current !== runID) { + return + } + + const sessionText = cleanSessionText(result.result ?? "") + if (!sessionText) { + return + } + + const separator = baseAtStop.length > 0 ? "\n\n" : "" + setTranscribedText(normalizeTranscriptSessions(baseAtStop + separator + sessionText)) + setHasCompletedSession(true) + } catch (error) { + if (bulkTranscriptionJobRef.current !== runID) { + return + } + const message = error instanceof Error ? 
error.message : "Bulk transcription failed" + setWhisperError(message) + console.error("[Dictation] Bulk transcription failed:", error) + } finally { + if (bulkTranscriptionJobRef.current === runID) { + setIsTranscribingBulk(false) + } + } + })() + }, [defaultWhisperModel, finalizeRecordingState, transcriptionMode]) const clearIconRotation = useSharedValue(0) const sendOutProgress = useSharedValue(0) const handleClearTranscript = useCallback(() => { + Haptics.selectionAsync().catch(() => {}) + clearIconRotation.value = withSequence( withTiming(-30, { duration: 90 }), withTiming(30, { duration: 120 }), @@ -662,42 +1180,10 @@ export default function DictationScreen() { baseTextRef.current = "" setTranscribedText("") setHasCompletedSession(false) - const cleared = new Array(waveformLevelsRef.current.length).fill(0) - waveformLevelsRef.current = cleared - setWaveformLevels(cleared) - setWaveformTick(Date.now()) + clearWaveform() sendOutProgress.value = 0 setIsSending(false) - }, [clearIconRotation, sendOutProgress, stopRecording]) - - const handleDeleteModel = useCallback(async () => { - if (modelReset) return - - if (isRecordingRef.current) { - stopRecording() - } - - setModelReset(true) - accumulatedRef.current = "" - baseTextRef.current = "" - setTranscribedText("") - setHasCompletedSession(false) - const cleared = new Array(waveformLevelsRef.current.length).fill(0) - waveformLevelsRef.current = cleared - setWaveformLevels(cleared) - setWaveformTick(Date.now()) - sendOutProgress.value = 0 - setIsSending(false) - Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Medium).catch(() => {}) - - try { - await ExpoResourceFetcher.deleteResources(WHISPER_BASE_EN.modelSource, WHISPER_BASE_EN.tokenizerSource) - } catch (err) { - console.error("Failed to delete model resources:", err) - } - - setModelReset(false) - }, [modelReset, sendOutProgress, stopRecording]) + }, [clearIconRotation, clearWaveform, sendOutProgress, stopRecording]) const resetTranscriptState = useCallback(() => { if (isRecordingRef.current) { @@ -707,11 +1193,91 @@ export default function DictationScreen() { baseTextRef.current = "" setTranscribedText("") setHasCompletedSession(false) - const cleared = new Array(waveformLevelsRef.current.length).fill(0) - waveformLevelsRef.current = cleared - setWaveformLevels(cleared) - setWaveformTick(Date.now()) - }, [stopRecording]) + clearWaveform() + }, [clearWaveform, stopRecording]) + + const handleOpenWhisperSettings = useCallback(() => { + Haptics.selectionAsync().catch(() => {}) + setDropdownMode("none") + setWhisperSettingsOpen(true) + }, []) + + const handleDownloadWhisperModel = useCallback( + async (modelID: WhisperModelID) => { + const ok = await downloadWhisperModel(modelID) + if (ok) { + Haptics.selectionAsync().catch(() => {}) + } + }, + [downloadWhisperModel], + ) + + const handleSelectWhisperModel = useCallback( + async (modelID: WhisperModelID) => { + if (isRecordingRef.current || isStartingRef.current) { + stopRecording() + } + + try { + await ensureWhisperModelReady(modelID) + setDefaultWhisperModel(modelID) + setWhisperError("") + Haptics.selectionAsync().catch(() => {}) + } catch (error) { + const message = error instanceof Error ? 
error.message : "Unable to switch Whisper model" + setWhisperError(message) + } + }, + [ensureWhisperModelReady, stopRecording], + ) + + const handleDeleteWhisperModel = useCallback( + async (modelID: WhisperModelID) => { + if (downloadingModelID === modelID) return + + if (isRecordingRef.current || isStartingRef.current) { + stopRecording() + } + + if (whisperContextModelRef.current === modelID && whisperContextRef.current) { + const activeContext = whisperContextRef.current + whisperContextRef.current = null + whisperContextModelRef.current = null + setActiveWhisperModel(null) + await activeContext.release().catch(() => {}) + } + + await FileSystem.deleteAsync(modelPath(modelID), { idempotent: true }).catch(() => {}) + const nextInstalled = await refreshInstalledWhisperModels() + + if (defaultWhisperModel === modelID) { + const fallbackModel = nextInstalled[0] ?? DEFAULT_WHISPER_MODEL + setDefaultWhisperModel(fallbackModel) + try { + await ensureWhisperModelReady(fallbackModel) + } catch { + // Keep UI responsive if fallback init fails. + } + } else if (activeWhisperModel == null && nextInstalled.includes(defaultWhisperModel)) { + try { + await ensureWhisperModelReady(defaultWhisperModel) + } catch { + // Keep UI responsive if default model init fails. + } + } + + Haptics.selectionAsync().catch(() => {}) + }, + [ + activeWhisperModel, + defaultWhisperModel, + downloadingModelID, + ensureWhisperModelReady, + modelPath, + refreshInstalledWhisperModels, + stopRecording, + ], + ) const completeSend = useCallback(() => { if (sendSettleTimeoutRef.current) { @@ -960,10 +1526,13 @@ export default function DictationScreen() { } }, [stopRecording]) - const modelLoading = !model.isReady - const prog = model.downloadProgress > 1 ? model.downloadProgress / 100 : model.downloadProgress - const load = Math.max(0, Math.min(1, Number.isFinite(prog) ? prog : 0)) - const pct = Math.round(load * 100) + const modelDownloading = downloadingModelID !== null + const modelLoading = isPreparingWhisperModel || activeWhisperModel == null || modelDownloading || isTranscribingBulk + const modelLoadingState = modelDownloading ? "downloading" : modelLoading ? "loading" : "ready" + const pct = Math.round(Math.max(0, Math.min(1, downloadProgress)) * 100) + const loadingModelLabel = downloadingModelID + ? WHISPER_MODEL_LABELS[downloadingModelID] + : WHISPER_MODEL_LABELS[defaultWhisperModel] const hasTranscript = transcribedText.trim().length > 0 const shouldShowSend = hasCompletedSession && hasTranscript const activeServer = servers.find((s) => s.id === activeServerId) ?? null @@ -995,12 +1564,12 @@ export default function DictationScreen() { }, [isRecording, recordingProgress]) useEffect(() => { - const isGenerating = isRecording || model.isGenerating + const isGenerating = isRecording waveformVisibility.value = withTiming(isGenerating ? 1 : 0, { duration: isGenerating ? 180 : 240, easing: Easing.inOut(Easing.quad), }) - }, [isRecording, model.isGenerating, waveformVisibility]) + }, [isRecording, waveformVisibility]) useEffect(() => { serverMenuProgress.value = withTiming(isDropdownOpen ? 1 : 0, { @@ -1736,14 +2305,16 @@ export default function DictationScreen() { { - void handleDeleteModel() - }} + onPress={handleOpenWhisperSettings} style={({ pressed }) => [styles.clearButton, pressed && styles.clearButtonPressed]} hitSlop={8} - disabled={modelLoading || modelReset} > - DL + ) : null} + {whisperError ? ( + + {whisperError} + + ) : null} + - {modelLoading ? ( + {isTranscribingBulk ? 
+
+
+          ) : modelLoadingState !== "ready" ? (
             <>
-
+
-              {`Downloading model ${pct}%`}
+
+
+              {modelLoadingState === "downloading"
+                ? `Downloading ${loadingModelLabel} ${pct}%`
+                : `Loading ${loadingModelLabel}`}
+
           ) : (
@@ -1831,6 +2422,162 @@

+        setWhisperSettingsOpen(false)}
+      >
+
+
+
+          Whisper models
+          Default: {WHISPER_MODEL_LABELS[defaultWhisperModel]}
+
+        setWhisperSettingsOpen(false)}>
+          Done
+
+
+
+
+        Transcription
+
+        setTranscriptionMode("bulk")}
+          disabled={isRecording || isTranscribingBulk}
+          style={({ pressed }) => [
+            styles.settingsModeButton,
+            transcriptionMode === "bulk" && styles.settingsModeButtonActive,
+            (isRecording || isTranscribingBulk) && styles.settingsInlinePressableDisabled,
+            pressed && styles.clearButtonPressed,
+          ]}
+        >
+
+          On Release
+
+
+        setTranscriptionMode("realtime")}
+          disabled={isRecording || isTranscribingBulk}
+          style={({ pressed }) => [
+            styles.settingsModeButton,
+            transcriptionMode === "realtime" && styles.settingsModeButtonActive,
+            (isRecording || isTranscribingBulk) && styles.settingsInlinePressableDisabled,
+            pressed && styles.clearButtonPressed,
+          ]}
+        >
+
+          Realtime
+
+
+
+
+
+        {WHISPER_MODELS.map((modelID) => {
+          const installed = installedWhisperModels.includes(modelID)
+          const isDefault = defaultWhisperModel === modelID
+          const isDownloading = downloadingModelID === modelID
+          const actionDisabled = (downloadingModelID !== null && !isDownloading) || isTranscribingBulk
+          const rowLabel = isDefault ? `${modelID} · default` : modelID
+          const actionIcon = isDownloading ? "…" : installed ? "✓" : "↓"
+          const downloadPct = Math.round(Math.max(0, Math.min(1, downloadProgress)) * 100)
+          const actionLabel = isDownloading
+            ? "Downloading"
+            : installed
+              ? isDefault
+                ? "Selected"
+                : "Select"
+              : "Download"
+          const sizeLabel = formatWhisperModelSize(WHISPER_MODEL_SIZES[modelID])
+
+          return (
+
+            {
+                if (installed) {
+                  void handleSelectWhisperModel(modelID)
+                }
+              }}
+              onLongPress={() => {
+                if (!installed || isDownloading) return
+                Alert.alert("Delete model?", `Remove ${modelID} from this device?`, [
+                  { text: "Cancel", style: "cancel" },
+                  {
+                    text: "Delete",
+                    style: "destructive",
+                    onPress: () => {
+                      void handleDeleteWhisperModel(modelID)
+                    },
+                  },
+                ])
+              }}
+              delayLongPress={350}
+              disabled={!installed || actionDisabled || isPreparingWhisperModel}
+              style={({ pressed }) => [
+                styles.settingsInlineLabelPressable,
+                (!installed || actionDisabled || isPreparingWhisperModel) &&
+                  styles.settingsInlinePressableDisabled,
+                pressed && styles.clearButtonPressed,
+              ]}
+            >
+              {rowLabel}
+
+
+            {sizeLabel}
+
+            {
+                if (isDownloading) return
+                if (installed) {
+                  void handleSelectWhisperModel(modelID)
+                  return
+                }
+                void handleDownloadWhisperModel(modelID)
+              }}
+              disabled={actionDisabled || (installed && isPreparingWhisperModel)}
+              accessibilityLabel={actionLabel}
+              style={({ pressed }) => [
+                styles.settingsInlineIconButton,
+                (actionDisabled || (installed && isPreparingWhisperModel)) &&
+                  styles.settingsInlinePressableDisabled,
+                pressed && styles.clearButtonPressed,
+              ]}
+            >
+              {isDownloading ? (
+
+                  {downloadPct}
+
+              ) : (
+
+                  {actionIcon}
+
+              )}
+
+          )
+        })}
+
+
+
+
+      transcribeRealtime(options?: TranscribeOptions): Promise<{
+        stop: () => Promise<void>
+        subscribe: (callback: (event: TranscribeRealtimeEvent) => void) => void
+      }>
+      transcribeData(
+        data: ArrayBuffer,
+        options?: TranscribeOptions,
+      ): {
+        stop: () => Promise<void>
+        promise: Promise<TranscribeResult>
+      }
+      release(): Promise<void>
+  }
+
+  export type ContextOptions = {
+    filePath: string | number
+    useGpu?: boolean
+    useCoreMLIos?: boolean
+    useFlashAttn?: boolean
+  }
+
+  export function initWhisper(options: ContextOptions): Promise<WhisperContext>
+  export function releaseAllWhisper(): Promise<void>
+}
+
+declare module "whisper.rn/realtime-transcription/index" {
+  import type { TranscribeOptions, TranscribeResult, WhisperContext } from "whisper.rn"
+
+  export type RealtimeTranscribeEvent = {
+    type: "start" | "transcribe" | "end" | "error"
+    sliceIndex: number
+    data?: TranscribeResult
+    isCapturing: boolean
+    processTime: number
+    recordingTime: number
+  }
+
+  export type RealtimeOptions = {
+    audioSliceSec?: number
+    audioMinSec?: number
+    maxSlicesInMemory?: number
+    transcribeOptions?: TranscribeOptions
+    logger?: (message: string) => void
+  }
+
+  export type RealtimeTranscriberCallbacks = {
+    onTranscribe?: (event: RealtimeTranscribeEvent) => void
+    onError?: (error: string) => void
+    onStatusChange?: (isActive: boolean) => void
+  }
+
+  export type RealtimeTranscriberDependencies = {
+    whisperContext: WhisperContext
+    audioStream: unknown
+    vadContext?: unknown
+    fs?: unknown
+  }
+
+  export class RealtimeTranscriber {
+    constructor(
+      dependencies: RealtimeTranscriberDependencies,
+      options?: RealtimeOptions,
+      callbacks?: RealtimeTranscriberCallbacks,
+    )
+    start(): Promise<void>
+    stop(): Promise<void>
+    release(): Promise<void>
+    updateCallbacks(callbacks: Partial<RealtimeTranscriberCallbacks>): void
+  }
+}
+
+declare module "whisper.rn/realtime-transcription" {
+  export * from "whisper.rn/realtime-transcription/index"
+}
+
+declare module "whisper.rn/src/realtime-transcription" {
+  export * from "whisper.rn/realtime-transcription/index"
+}
+
+declare module "whisper.rn/realtime-transcription/adapters/AudioPcmStreamAdapter" {
+  export class AudioPcmStreamAdapter {
+    initialize(config: Record<string, unknown>): Promise<void>
+    start(): Promise<void>
+    stop(): Promise<void>
+    isRecording(): boolean
+    onData(callback: (data: unknown) => void): void
+    onError(callback: (error: string) => void): void
+    onStatusChange(callback: (isRecording: boolean) => void): void
+    release(): Promise<void>
+  }
+}
+
+declare module "whisper.rn/src/realtime-transcription/adapters/AudioPcmStreamAdapter" {
+  export * from "whisper.rn/realtime-transcription/adapters/AudioPcmStreamAdapter"
+}
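+// These local module shims cover the deep "whisper.rn/src/..." import paths
+// used in index.tsx, which ship without published typings of their own; the
+// shapes are hand-written against whisper.rn 0.5.x and may lag upstream.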
diff --git a/packages/opencode/src/server/push-relay.ts b/packages/opencode/src/server/push-relay.ts
index d0b7698129..d4c5eecb3a 100644
--- a/packages/opencode/src/server/push-relay.ts
+++ b/packages/opencode/src/server/push-relay.ts
@@ -139,8 +139,8 @@ async function notify(input: { type: Type; sessionID: string }): Promise
     const session = await Session.get(sessionID)
     out.title = session.title

+    let latestUser: string | undefined
     for await (const msg of MessageV2.stream(sessionID)) {
-      if (msg.info.role !== "user") continue
       const body = msg.parts
         .map((part) => {
           if (part.type !== "text") return ""
@@ -151,8 +151,19 @@ async function notify(input: { type: Type; sessionID: string }): Promise
         .join(" ")
       const next = words(body)
       if (!next) continue
-      out.body = next
-      break
+
+      if (msg.info.role === "assistant") {
+        out.body = next
+        break
+      }
+
+      if (!latestUser && msg.info.role === "user") {
+        latestUser = next
+      }
+    }
+
+    if (!out.body) {
+      out.body = latestUser
     }
   } catch (error) {
     log.info("notification metadata unavailable", {