diff --git a/AGENTS.md b/AGENTS.md index 6946d441..1a79f6f7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -15,7 +15,7 @@ All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in th - **Framework**: SwiftUI (macOS native) with AppKit bridging for menu bar panel and cursor overlay - **Pattern**: MVVM with `@StateObject` / `@Published` state management - **AI Chat**: Claude (Sonnet 4.6 default, Opus 4.6 optional) via Cloudflare Worker proxy with SSE streaming -- **Speech-to-Text**: AssemblyAI real-time streaming (`u3-rt-pro` model) via websocket, with OpenAI and Apple Speech as fallbacks +- **Speech-to-Text**: AssemblyAI real-time streaming via websocket, using `u3-rt-pro` for English and `whisper-rt` for Chinese, with OpenAI and Apple Speech as fallbacks - **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy - **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support - **Voice Input**: Push-to-talk via `AVAudioEngine` + pluggable transcription-provider layer. System-wide keyboard shortcut via listen-only CGEvent tap. @@ -34,7 +34,7 @@ The app never calls external APIs directly. All requests go through a Cloudflare | `POST /transcribe-token` | `streaming.assemblyai.com/v3/token` | Fetches a short-lived (480s) AssemblyAI websocket token | Worker secrets: `ANTHROPIC_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY` -Worker vars: `ELEVENLABS_VOICE_ID` +Worker vars: `ELEVENLABS_VOICE_ID`, `ELEVENLABS_CHINESE_VOICE_ID` (optional) ### Key Architecture Decisions @@ -61,7 +61,7 @@ Worker vars: `ELEVENLABS_VOICE_ID` | `CompanionScreenCaptureUtility.swift` | ~132 | Multi-monitor screenshot capture using ScreenCaptureKit. Returns labeled image data for each connected display. | | `BuddyDictationManager.swift` | ~866 | Push-to-talk voice pipeline. Handles microphone capture via `AVAudioEngine`, provider-aware permission checks, keyboard/button dictation sessions, transcript finalization, shortcut parsing, contextual keyterms, and live audio-level reporting for waveform feedback. | | `BuddyTranscriptionProvider.swift` | ~100 | Protocol surface and provider factory for voice transcription backends. Resolves provider based on `VoiceTranscriptionProvider` in Info.plist — AssemblyAI, OpenAI, or Apple Speech. | -| `AssemblyAIStreamingTranscriptionProvider.swift` | ~478 | Streaming transcription provider. Fetches temp tokens from the Cloudflare Worker, opens an AssemblyAI v3 websocket, streams PCM16 audio, tracks turn-based transcripts, and delivers finalized text on key-up. Shares a single URLSession across all sessions. | +| `AssemblyAIStreamingTranscriptionProvider.swift` | ~541 | Streaming transcription provider. Fetches temp tokens from the Cloudflare Worker, opens an AssemblyAI v3 websocket, streams PCM16 audio, switches between `u3-rt-pro` for English and `whisper-rt` for Chinese, tracks turn-based transcripts, and delivers finalized text on key-up. Shares a single URLSession across all sessions. | | `OpenAIAudioTranscriptionProvider.swift` | ~317 | Upload-based transcription provider. Buffers push-to-talk audio locally, uploads as WAV on release, returns finalized transcript. | | `AppleSpeechTranscriptionProvider.swift` | ~147 | Local fallback transcription provider backed by Apple's Speech framework. | | `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. | diff --git a/README.md b/README.md index d7dbf74b..8f0fbcff 100644 --- a/README.md +++ b/README.md @@ -56,13 +56,16 @@ npx wrangler secret put ASSEMBLYAI_API_KEY npx wrangler secret put ELEVENLABS_API_KEY ``` -For the ElevenLabs voice ID, open `wrangler.toml` and set it there (it's not sensitive): +For the ElevenLabs voice IDs, open `wrangler.toml` and set them there (they're not sensitive): ```toml [vars] ELEVENLABS_VOICE_ID = "your-voice-id-here" +ELEVENLABS_CHINESE_VOICE_ID = "optional-chinese-voice-id" ``` +`ELEVENLABS_VOICE_ID` stays the default voice. `ELEVENLABS_CHINESE_VOICE_ID` is optional and only used when the app is set to Chinese voice mode. If you leave it blank, the default voice is reused. + Deploy it: ```bash @@ -87,6 +90,7 @@ ANTHROPIC_API_KEY=sk-ant-... ASSEMBLYAI_API_KEY=... ELEVENLABS_API_KEY=... ELEVENLABS_VOICE_ID=... +ELEVENLABS_CHINESE_VOICE_ID=... ``` Then update the proxy URLs in the Swift code to point to `http://localhost:8787` instead of the deployed Worker URL while developing. Grep for `clicky-proxy` to find them all. @@ -127,7 +131,7 @@ The app will appear in your menu bar (not the dock). Click the icon to open the If you want the full technical breakdown, read `CLAUDE.md`. But here's the short version: -**Menu bar app** (no dock icon) with two `NSPanel` windows — one for the control panel dropdown, one for the full-screen transparent cursor overlay. Push-to-talk streams audio over a websocket to AssemblyAI, sends the transcript + screenshot to Claude via streaming SSE, and plays the response through ElevenLabs TTS. Claude can embed `[POINT:x,y:label:screenN]` tags in its responses to make the cursor fly to specific UI elements across multiple monitors. All three APIs are proxied through a Cloudflare Worker. +**Menu bar app** (no dock icon) with two `NSPanel` windows — one for the control panel dropdown, one for the full-screen transparent cursor overlay. Push-to-talk streams audio over a websocket to AssemblyAI, sends the transcript + screenshot to Claude via streaming SSE, and plays the response through ElevenLabs TTS. English uses AssemblyAI `u3-rt-pro`, while Chinese switches to `whisper-rt` so Chinese speech can be transcribed reliably. Claude can embed `[POINT:x,y:label:screenN]` tags in its responses to make the cursor fly to specific UI elements across multiple monitors. All three APIs are proxied through a Cloudflare Worker. ## Project structure diff --git a/install-clicky.sh b/install-clicky.sh new file mode 100644 index 00000000..02e24b5d --- /dev/null +++ b/install-clicky.sh @@ -0,0 +1,11 @@ +#!/bin/bash +APP_PATH="$HOME/Library/Developer/Xcode/DerivedData/leanring-buddy-dvwepfgqqgvpjhbjcbybcytzaawt/Index.noindex/Build/Products/Debug/Clicky.app" + +if [ -d "$APP_PATH" ]; then + rm -rf "/Applications/Clicky.app" 2>/dev/null + cp -R "$APP_PATH" /Applications/ + echo "✅ Clicky installed to /Applications!" + echo "Now open Clicky from Applications folder and grant permissions." +else + echo "❌ Clicky.app not found. Please build in Xcode first (⌘R)" +fi diff --git a/leanring-buddy.xcodeproj/project.pbxproj b/leanring-buddy.xcodeproj/project.pbxproj index 75e57261..4c4e65ee 100644 --- a/leanring-buddy.xcodeproj/project.pbxproj +++ b/leanring-buddy.xcodeproj/project.pbxproj @@ -34,9 +34,22 @@ 28F22CD62F56440300A0FC59 /* leanring-buddyUITests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = "leanring-buddyUITests.xctest"; sourceTree = BUILT_PRODUCTS_DIR; }; /* End PBXFileReference section */ +/* Begin PBXFileSystemSynchronizedBuildFileExceptionSet section */ + AA11CC112F7000010039DA55 /* Exceptions for "leanring-buddy" folder in "leanring-buddy" target */ = { + isa = PBXFileSystemSynchronizedBuildFileExceptionSet; + membershipExceptions = ( + Info.plist, + ); + target = 28F22CBE2F56440300A0FC59 /* leanring-buddy */; + }; +/* End PBXFileSystemSynchronizedBuildFileExceptionSet section */ + /* Begin PBXFileSystemSynchronizedRootGroup section */ 28F22CC12F56440300A0FC59 /* leanring-buddy */ = { isa = PBXFileSystemSynchronizedRootGroup; + exceptions = ( + AA11CC112F7000010039DA55 /* Exceptions for "leanring-buddy" folder in "leanring-buddy" target */, + ); path = "leanring-buddy"; sourceTree = ""; }; @@ -411,7 +424,7 @@ CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 2UDAY4J48G; + DEVELOPMENT_TEAM = ZZY6K862N2; ENABLE_APP_SANDBOX = NO; ENABLE_HARDENED_RUNTIME = YES; ENABLE_OUTGOING_NETWORK_CONNECTIONS = YES; @@ -449,7 +462,7 @@ CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 2UDAY4J48G; + DEVELOPMENT_TEAM = ZZY6K862N2; ENABLE_APP_SANDBOX = NO; ENABLE_HARDENED_RUNTIME = YES; ENABLE_OUTGOING_NETWORK_CONNECTIONS = YES; @@ -484,7 +497,7 @@ BUNDLE_LOADER = "$(TEST_HOST)"; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6D7X9GGZAW; + DEVELOPMENT_TEAM = ZZY6K862N2; GENERATE_INFOPLIST_FILE = YES; MACOSX_DEPLOYMENT_TARGET = 14.2; MARKETING_VERSION = 1.0; @@ -505,7 +518,7 @@ BUNDLE_LOADER = "$(TEST_HOST)"; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6D7X9GGZAW; + DEVELOPMENT_TEAM = ZZY6K862N2; GENERATE_INFOPLIST_FILE = YES; MACOSX_DEPLOYMENT_TARGET = 14.2; MARKETING_VERSION = 1.0; @@ -525,7 +538,7 @@ buildSettings = { CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6D7X9GGZAW; + DEVELOPMENT_TEAM = ZZY6K862N2; GENERATE_INFOPLIST_FILE = YES; MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = "com.yourcompany.leanring-buddyUITests"; @@ -544,7 +557,7 @@ buildSettings = { CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6D7X9GGZAW; + DEVELOPMENT_TEAM = ZZY6K862N2; GENERATE_INFOPLIST_FILE = YES; MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = "com.yourcompany.leanring-buddyUITests"; diff --git a/leanring-buddy.xcodeproj/xcuserdata/mac.xcuserdatad/xcschemes/xcschememanagement.plist b/leanring-buddy.xcodeproj/xcuserdata/mac.xcuserdatad/xcschemes/xcschememanagement.plist new file mode 100644 index 00000000..6c13490a --- /dev/null +++ b/leanring-buddy.xcodeproj/xcuserdata/mac.xcuserdatad/xcschemes/xcschememanagement.plist @@ -0,0 +1,14 @@ + + + + + SchemeUserState + + leanring-buddy.xcscheme_^#shared#^_ + + orderHint + 0 + + + + diff --git a/leanring-buddy/AppleSpeechTranscriptionProvider.swift b/leanring-buddy/AppleSpeechTranscriptionProvider.swift index 600594fa..20e41ee2 100644 --- a/leanring-buddy/AppleSpeechTranscriptionProvider.swift +++ b/leanring-buddy/AppleSpeechTranscriptionProvider.swift @@ -25,11 +25,12 @@ final class AppleSpeechTranscriptionProvider: BuddyTranscriptionProvider { func startStreamingSession( keyterms: [String], + languageCode: String?, onTranscriptUpdate: @escaping (String) -> Void, onFinalTranscriptReady: @escaping (String) -> Void, onError: @escaping (Error) -> Void ) async throws -> any BuddyStreamingTranscriptionSession { - guard let speechRecognizer = Self.makeBestAvailableSpeechRecognizer() else { + guard let speechRecognizer = Self.makeBestAvailableSpeechRecognizer(languageCode: languageCode) else { throw AppleSpeechTranscriptionProviderError(message: "dictation is not available on this mac.") } @@ -41,14 +42,23 @@ final class AppleSpeechTranscriptionProvider: BuddyTranscriptionProvider { ) } - private static func makeBestAvailableSpeechRecognizer() -> SFSpeechRecognizer? { - let preferredLocales = [ - Locale.autoupdatingCurrent, - Locale(identifier: "en-US") - ] + private static func makeBestAvailableSpeechRecognizer(languageCode: String?) -> SFSpeechRecognizer? { + var preferredLocales: [Locale] = [] + + if let languageCode { + preferredLocales.append(Locale(identifier: languageCode)) + } + + preferredLocales.append(Locale.autoupdatingCurrent) + preferredLocales.append(Locale(identifier: "en-US")) + + if languageCode != "zh-CN" { + preferredLocales.append(Locale(identifier: "zh-CN")) + } for preferredLocale in preferredLocales { - if let speechRecognizer = SFSpeechRecognizer(locale: preferredLocale) { + if let speechRecognizer = SFSpeechRecognizer(locale: preferredLocale), + speechRecognizer.isAvailable { return speechRecognizer } } diff --git a/leanring-buddy/AssemblyAIStreamingTranscriptionProvider.swift b/leanring-buddy/AssemblyAIStreamingTranscriptionProvider.swift index d21286b6..3dfbb6c2 100644 --- a/leanring-buddy/AssemblyAIStreamingTranscriptionProvider.swift +++ b/leanring-buddy/AssemblyAIStreamingTranscriptionProvider.swift @@ -19,7 +19,7 @@ struct AssemblyAIStreamingTranscriptionProviderError: LocalizedError { final class AssemblyAIStreamingTranscriptionProvider: BuddyTranscriptionProvider { /// URL for the Cloudflare Worker endpoint that returns a short-lived /// AssemblyAI streaming token. The real API key never leaves the server. - private static let tokenProxyURL = "https://your-worker-name.your-subdomain.workers.dev/transcribe-token" + private static let tokenProxyURL = "https://clicky-proxy.clicky-mark.workers.dev/transcribe-token" let displayName = "AssemblyAI" let requiresSpeechRecognitionPermission = false @@ -35,11 +35,11 @@ final class AssemblyAIStreamingTranscriptionProvider: BuddyTranscriptionProvider func startStreamingSession( keyterms: [String], + languageCode: String?, onTranscriptUpdate: @escaping (String) -> Void, onFinalTranscriptReady: @escaping (String) -> Void, onError: @escaping (Error) -> Void ) async throws -> any BuddyStreamingTranscriptionSession { - // Fetch a fresh temporary token from the proxy before each session let temporaryToken = try await fetchTemporaryToken() print("🎙️ AssemblyAI: fetched temporary token (\(temporaryToken.prefix(20))...)") @@ -48,6 +48,7 @@ final class AssemblyAIStreamingTranscriptionProvider: BuddyTranscriptionProvider temporaryToken: temporaryToken, urlSession: sharedWebSocketURLSession, keyterms: keyterms, + languageCode: languageCode, onTranscriptUpdate: onTranscriptUpdate, onFinalTranscriptReady: onFinalTranscriptReady, onError: onError @@ -85,6 +86,53 @@ final class AssemblyAIStreamingTranscriptionProvider: BuddyTranscriptionProvider } private final class AssemblyAIStreamingTranscriptionSession: NSObject, BuddyStreamingTranscriptionSession { + private enum StreamingSpeechModelConfiguration { + case universalRealtimePro + case whisperRealtime + + init(languageCode: String?) { + let normalizedLanguageCode = languageCode? + .trimmingCharacters(in: .whitespacesAndNewlines) + .lowercased() + + if let normalizedLanguageCode, + !normalizedLanguageCode.isEmpty, + normalizedLanguageCode != "en" { + self = .whisperRealtime + return + } + + self = .universalRealtimePro + } + + var modelIdentifier: String { + switch self { + case .universalRealtimePro: + return "u3-rt-pro" + case .whisperRealtime: + return "whisper-rt" + } + } + + var supportsExplicitLanguageCode: Bool { + switch self { + case .universalRealtimePro: + return true + case .whisperRealtime: + return false + } + } + + var shouldEnableLanguageDetection: Bool { + switch self { + case .universalRealtimePro: + return false + case .whisperRealtime: + return true + } + } + } + private struct MessageEnvelope: Decodable { let type: String } @@ -117,6 +165,7 @@ private final class AssemblyAIStreamingTranscriptionSession: NSObject, BuddyStre private let apiKey: String? private let temporaryToken: String? private let keyterms: [String] + private let languageCode: String? private let onTranscriptUpdate: (String) -> Void private let onFinalTranscriptReady: (String) -> Void private let onError: (Error) -> Void @@ -142,6 +191,7 @@ private final class AssemblyAIStreamingTranscriptionSession: NSObject, BuddyStre temporaryToken: String?, urlSession: URLSession, keyterms: [String], + languageCode: String?, onTranscriptUpdate: @escaping (String) -> Void, onFinalTranscriptReady: @escaping (String) -> Void, onError: @escaping (Error) -> Void @@ -150,6 +200,7 @@ private final class AssemblyAIStreamingTranscriptionSession: NSObject, BuddyStre self.temporaryToken = temporaryToken self.urlSession = urlSession self.keyterms = keyterms + self.languageCode = languageCode self.onTranscriptUpdate = onTranscriptUpdate self.onFinalTranscriptReady = onFinalTranscriptReady self.onError = onError @@ -158,7 +209,8 @@ private final class AssemblyAIStreamingTranscriptionSession: NSObject, BuddyStre func open() async throws { let websocketURL = try Self.makeWebsocketURL( temporaryToken: temporaryToken, - keyterms: keyterms + keyterms: keyterms, + languageCode: languageCode ) var websocketRequest = URLRequest(url: websocketURL) @@ -436,7 +488,8 @@ private final class AssemblyAIStreamingTranscriptionSession: NSObject, BuddyStre private static func makeWebsocketURL( temporaryToken: String?, - keyterms: [String] + keyterms: [String], + languageCode: String? ) throws -> URL { guard var websocketURLComponents = URLComponents(string: websocketBaseURLString) else { throw AssemblyAIStreamingTranscriptionProviderError( @@ -444,13 +497,29 @@ private final class AssemblyAIStreamingTranscriptionSession: NSObject, BuddyStre ) } + let streamingSpeechModelConfiguration = StreamingSpeechModelConfiguration( + languageCode: languageCode + ) + var queryItems = [ URLQueryItem(name: "sample_rate", value: "16000"), URLQueryItem(name: "encoding", value: "pcm_s16le"), URLQueryItem(name: "format_turns", value: "true"), - URLQueryItem(name: "speech_model", value: "u3-rt-pro") + URLQueryItem( + name: "speech_model", + value: streamingSpeechModelConfiguration.modelIdentifier + ) ] + if streamingSpeechModelConfiguration.shouldEnableLanguageDetection { + queryItems.append(URLQueryItem(name: "language_detection", value: "true")) + } + + if streamingSpeechModelConfiguration.supportsExplicitLanguageCode, + let languageCode { + queryItems.append(URLQueryItem(name: "language_code", value: languageCode)) + } + let normalizedKeyterms = keyterms .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) } .filter { !$0.isEmpty } diff --git a/leanring-buddy/BuddyDictationManager.swift b/leanring-buddy/BuddyDictationManager.swift index 5bca2677..eedbfbba 100644 --- a/leanring-buddy/BuddyDictationManager.swift +++ b/leanring-buddy/BuddyDictationManager.swift @@ -265,6 +265,7 @@ final class BuddyDictationManager: NSObject, ObservableObject { private let transcriptionProvider: any BuddyTranscriptionProvider private let audioEngine = AVAudioEngine() private var activeTranscriptionSession: (any BuddyStreamingTranscriptionSession)? + var languageCode: String? private var activeStartSource: BuddyDictationStartSource? private var draftCallbacks: BuddyDictationDraftCallbacks? private var draftTextBeforeCurrentDictation = "" @@ -519,6 +520,7 @@ final class BuddyDictationManager: NSObject, ObservableObject { let activeTranscriptionSession = try await transcriptionProvider.startStreamingSession( keyterms: buildTranscriptionKeyterms(), + languageCode: languageCode, onTranscriptUpdate: { [weak self] transcriptText in Task { @MainActor in self?.latestRecognizedText = transcriptText diff --git a/leanring-buddy/BuddyTranscriptionProvider.swift b/leanring-buddy/BuddyTranscriptionProvider.swift index 0a75715d..8c057c24 100644 --- a/leanring-buddy/BuddyTranscriptionProvider.swift +++ b/leanring-buddy/BuddyTranscriptionProvider.swift @@ -23,6 +23,7 @@ protocol BuddyTranscriptionProvider { func startStreamingSession( keyterms: [String], + languageCode: String?, onTranscriptUpdate: @escaping (String) -> Void, onFinalTranscriptReady: @escaping (String) -> Void, onError: @escaping (Error) -> Void diff --git a/leanring-buddy/ClaudeAPI.swift b/leanring-buddy/ClaudeAPI.swift index 0c7070b5..125da23b 100644 --- a/leanring-buddy/ClaudeAPI.swift +++ b/leanring-buddy/ClaudeAPI.swift @@ -203,7 +203,7 @@ class ClaudeAPI { accumulatedResponseText += textChunk // Send the accumulated text so far to the UI for progressive rendering let currentAccumulatedText = accumulatedResponseText - await onTextChunk(currentAccumulatedText) + onTextChunk(currentAccumulatedText) } } diff --git a/leanring-buddy/CompanionManager.swift b/leanring-buddy/CompanionManager.swift index 0234cf19..039c6764 100644 --- a/leanring-buddy/CompanionManager.swift +++ b/leanring-buddy/CompanionManager.swift @@ -7,6 +7,7 @@ // exposes observable voice state for the panel UI. // +@preconcurrency import AVFAudio import AVFoundation import Combine import Foundation @@ -23,6 +24,8 @@ enum CompanionVoiceState { @MainActor final class CompanionManager: ObservableObject { + private static let screenContentPermissionUserDefaultsKey = "hasScreenContentPermission" + @Published private(set) var voiceState: CompanionVoiceState = .idle @Published private(set) var lastTranscript: String? @Published private(set) var currentAudioPowerLevel: CGFloat = 0 @@ -61,6 +64,10 @@ final class CompanionManager: ObservableObject { private var onboardingMusicPlayer: AVAudioPlayer? private var onboardingMusicFadeTimer: Timer? + private var onboardingMusicFadeStepsRemaining = 0 + private var onboardingMusicVolumeDecrement: Float = 0 + private var onboardingPromptStreamTask: Task? + private let fallbackSpeechSynthesizer = AVSpeechSynthesizer() let buddyDictationManager = BuddyDictationManager() let globalPushToTalkShortcutMonitor = GlobalPushToTalkShortcutMonitor() @@ -70,7 +77,7 @@ final class CompanionManager: ObservableObject { /// Base URL for the Cloudflare Worker proxy. All API requests route /// through this so keys never ship in the app binary. - private static let workerBaseURL = "https://your-worker-name.your-subdomain.workers.dev" + private static let workerBaseURL = "https://clicky-proxy.clicky-mark.workers.dev" private lazy var claudeAPI: ClaudeAPI = { return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel) @@ -93,6 +100,7 @@ final class CompanionManager: ObservableObject { private var audioPowerCancellable: AnyCancellable? private var accessibilityCheckTimer: Timer? private var pendingKeyboardShortcutStartTask: Task? + private var screenContentPermissionValidationTask: Task? /// Scheduled hide for transient cursor mode — cancelled if the user /// speaks again before the delay elapses. private var transientHideTask: Task? @@ -116,6 +124,73 @@ final class CompanionManager: ObservableObject { claudeAPI.model = model } + /// Supported voice languages for transcription, AI responses, and TTS. + enum VoiceLanguage: String, CaseIterable, Identifiable { + case english = "en" + case chinese = "zh" + + var id: String { rawValue } + + var displayName: String { + switch self { + case .english: return "English" + case .chinese: return "中文" + } + } + + var locale: Locale { + switch self { + case .english: return Locale(identifier: "en-US") + case .chinese: return Locale(identifier: "zh-CN") + } + } + + var transcriptionLanguageCode: String? { + switch self { + case .english: return nil + case .chinese: return "zh" + } + } + + var textToSpeechLanguageCode: String? { + switch self { + case .english: return nil + case .chinese: return "zh" + } + } + + var responseSystemPromptInstructions: String { + switch self { + case .english: + return """ + - reply in english unless the user clearly asks for another language. + - all lowercase, casual, warm. no emojis. + """ + case .chinese: + return """ + - reply in simplified chinese unless the user clearly asks for another language. + - write the way spoken mandarin sounds in conversation: natural, warm, and compact. + - chinese does not need lowercase styling. keep punctuation natural for speech. + - if a technical term is much easier to recognize in english, you can mention the english term once alongside the chinese explanation. + """ + } + } + } + + /// The voice language used for transcription and TTS. Persisted to UserDefaults. + @Published var selectedVoiceLanguage: VoiceLanguage = { + if let savedLanguage = UserDefaults.standard.string(forKey: "selectedVoiceLanguage"), + let language = VoiceLanguage(rawValue: savedLanguage) { + return language + } + return .english + }() + + func setSelectedVoiceLanguage(_ language: VoiceLanguage) { + selectedVoiceLanguage = language + UserDefaults.standard.set(language.rawValue, forKey: "selectedVoiceLanguage") + } + /// User preference for whether the Clicky cursor should be shown. /// When toggled off, the overlay is hidden and push-to-talk is disabled. /// Persisted to UserDefaults so the choice survives app restarts. @@ -233,6 +308,8 @@ final class CompanionManager: ObservableObject { private func stopOnboardingMusic() { onboardingMusicFadeTimer?.invalidate() onboardingMusicFadeTimer = nil + onboardingMusicFadeStepsRemaining = 0 + onboardingMusicVolumeDecrement = 0 onboardingMusicPlayer?.stop() onboardingMusicPlayer = nil } @@ -252,7 +329,9 @@ final class CompanionManager: ObservableObject { // After 1m 30s, fade the music out over 3s onboardingMusicFadeTimer = Timer.scheduledTimer(withTimeInterval: 90.0, repeats: false) { [weak self] _ in - self?.fadeOutOnboardingMusic() + Task { @MainActor [weak self] in + self?.fadeOutOnboardingMusic() + } } } catch { print("⚠️ Clicky: Failed to play onboarding music: \(error)") @@ -265,22 +344,38 @@ final class CompanionManager: ObservableObject { let fadeSteps = 30 let fadeDuration: Double = 3.0 let stepInterval = fadeDuration / Double(fadeSteps) - let volumeDecrement = player.volume / Float(fadeSteps) - var stepsRemaining = fadeSteps + onboardingMusicFadeStepsRemaining = fadeSteps + onboardingMusicVolumeDecrement = player.volume / Float(fadeSteps) onboardingMusicFadeTimer = Timer.scheduledTimer(withTimeInterval: stepInterval, repeats: true) { [weak self] timer in - stepsRemaining -= 1 - player.volume -= volumeDecrement - - if stepsRemaining <= 0 { - timer.invalidate() - player.stop() - self?.onboardingMusicPlayer = nil - self?.onboardingMusicFadeTimer = nil + Task { @MainActor [weak self] in + self?.handleOnboardingMusicFadeTimerTick(timer) } } } + private func handleOnboardingMusicFadeTimerTick(_ timer: Timer) { + guard let onboardingMusicPlayer else { + timer.invalidate() + onboardingMusicFadeTimer = nil + onboardingMusicFadeStepsRemaining = 0 + onboardingMusicVolumeDecrement = 0 + return + } + + onboardingMusicFadeStepsRemaining -= 1 + onboardingMusicPlayer.volume -= onboardingMusicVolumeDecrement + + if onboardingMusicFadeStepsRemaining <= 0 { + timer.invalidate() + onboardingMusicPlayer.stop() + self.onboardingMusicPlayer = nil + onboardingMusicFadeTimer = nil + onboardingMusicFadeStepsRemaining = 0 + onboardingMusicVolumeDecrement = 0 + } + } + func clearDetectedElementLocation() { detectedElementScreenLocation = nil detectedElementDisplayFrame = nil @@ -292,6 +387,8 @@ final class CompanionManager: ObservableObject { buddyDictationManager.cancelCurrentDictation() overlayWindowManager.hideOverlay() transientHideTask?.cancel() + screenContentPermissionValidationTask?.cancel() + onboardingPromptStreamTask?.cancel() currentResponseTask?.cancel() currentResponseTask = nil @@ -317,7 +414,8 @@ final class CompanionManager: ObservableObject { globalPushToTalkShortcutMonitor.stop() } - hasScreenRecordingPermission = WindowPositionManager.hasScreenRecordingPermission() + hasScreenRecordingPermission = WindowPositionManager + .shouldTreatScreenRecordingPermissionAsGrantedForSessionLaunch() let micAuthStatus = AVCaptureDevice.authorizationStatus(for: .audio) hasMicrophonePermission = micAuthStatus == .authorized @@ -339,10 +437,13 @@ final class CompanionManager: ObservableObject { if !previouslyHadMicrophone && hasMicrophonePermission { ClickyAnalytics.trackPermissionGranted(permission: "microphone") } - // Screen content permission is persisted — once the user has approved the - // SCShareableContent picker, we don't need to re-check it. - if !hasScreenContentPermission { - hasScreenContentPermission = UserDefaults.standard.bool(forKey: "hasScreenContentPermission") + if !hasScreenRecordingPermission { + hasScreenContentPermission = false + screenContentPermissionValidationTask?.cancel() + screenContentPermissionValidationTask = nil + } else if hasScreenContentPermission + || UserDefaults.standard.bool(forKey: Self.screenContentPermissionUserDefaultsKey) { + validateScreenContentPermissionAgainstSystemIfNeeded() } if !previouslyHadAll && allPermissionsGranted { @@ -378,7 +479,7 @@ final class CompanionManager: ObservableObject { isRequestingScreenContent = false guard didCapture else { return } hasScreenContentPermission = true - UserDefaults.standard.set(true, forKey: "hasScreenContentPermission") + UserDefaults.standard.set(true, forKey: Self.screenContentPermissionUserDefaultsKey) ClickyAnalytics.trackPermissionGranted(permission: "screen_content") // If onboarding was already completed, show the cursor overlay now @@ -390,7 +491,10 @@ final class CompanionManager: ObservableObject { } } catch { print("⚠️ Screen content permission request failed: \(error)") - await MainActor.run { isRequestingScreenContent = false } + await MainActor.run { + isRequestingScreenContent = false + invalidateScreenContentPermissionGrant() + } } } } @@ -512,6 +616,7 @@ final class CompanionManager: ObservableObject { pendingKeyboardShortcutStartTask?.cancel() pendingKeyboardShortcutStartTask = Task { + buddyDictationManager.languageCode = selectedVoiceLanguage.transcriptionLanguageCode await buddyDictationManager.startPushToTalkFromKeyboardShortcut( currentDraftText: "", updateDraftText: { _ in @@ -541,12 +646,15 @@ final class CompanionManager: ObservableObject { // MARK: - Companion Prompt - private static let companionVoiceResponseSystemPrompt = """ + private static func companionVoiceResponseSystemPrompt( + for voiceLanguage: VoiceLanguage + ) -> String { + """ you're clicky, a friendly always-on companion that lives in the user's menu bar. the user just spoke to you via push-to-talk and you can see their screen(s). your reply will be spoken aloud via text-to-speech, so write the way you'd actually talk. this is an ongoing conversation — you remember everything they've said before. rules: - default to one or two sentences. be direct and dense. BUT if the user asks you to explain more, go deeper, or elaborate, then go all out — give a thorough, detailed explanation with no length limit. - - all lowercase, casual, warm. no emojis. + \(voiceLanguage.responseSystemPromptInstructions) - write for the ear, not the eye. short sentences. no lists, bullet points, markdown, or formatting — just natural speech. - don't use abbreviations or symbols that sound weird read aloud. write "for example" not "e.g.", spell out small numbers. - if the user's question relates to what's on their screen, reference specific things you see. @@ -575,6 +683,7 @@ final class CompanionManager: ObservableObject { - user asks how to commit in xcode: "see that source control menu up top? click that and hit commit, or you can use command option c as a shortcut. [POINT:285,11:source control]" - element is on screen 2 (not where cursor is): "that's over on your other monitor — see the terminal window? [POINT:400,300:terminal:screen2]" """ + } // MARK: - AI Response Pipeline @@ -612,7 +721,7 @@ final class CompanionManager: ObservableObject { let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming( images: labeledImages, - systemPrompt: Self.companionVoiceResponseSystemPrompt, + systemPrompt: Self.companionVoiceResponseSystemPrompt(for: selectedVoiceLanguage), conversationHistory: historyForAPI, userPrompt: transcript, onTextChunk: { _ in @@ -701,6 +810,7 @@ final class CompanionManager: ObservableObject { // until the audio actually starts playing, then switch to responding. if !spokenText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { do { + elevenLabsTTSClient.languageCode = selectedVoiceLanguage.textToSpeechLanguageCode try await elevenLabsTTSClient.speakText(spokenText) // speakText returns after player.play() — audio is now playing voiceState = .responding @@ -713,6 +823,12 @@ final class CompanionManager: ObservableObject { } catch is CancellationError { // User spoke again — response was interrupted } catch { + if Self.isScreenContentPermissionDenied(error) { + invalidateScreenContentPermissionGrant() + print("⚠️ Screen content permission is no longer valid: \(error.localizedDescription)") + voiceState = .idle + return + } ClickyAnalytics.trackResponseError(error: error.localizedDescription) print("⚠️ Companion response error: \(error)") speakCreditsErrorFallback() @@ -725,6 +841,73 @@ final class CompanionManager: ObservableObject { } } + private func validateScreenContentPermissionAgainstSystemIfNeeded() { + guard screenContentPermissionValidationTask == nil else { return } + + screenContentPermissionValidationTask = Task { [weak self] in + let isGranted = await Self.canCaptureScreenContentRightNow() + + await MainActor.run { + guard let self else { return } + self.screenContentPermissionValidationTask = nil + + if isGranted { + self.hasScreenContentPermission = true + UserDefaults.standard.set( + true, + forKey: Self.screenContentPermissionUserDefaultsKey + ) + } else { + self.invalidateScreenContentPermissionGrant() + } + } + } + } + + private func invalidateScreenContentPermissionGrant() { + hasScreenContentPermission = false + UserDefaults.standard.removeObject(forKey: Self.screenContentPermissionUserDefaultsKey) + } + + private static func canCaptureScreenContentRightNow() async -> Bool { + do { + let content = try await SCShareableContent.excludingDesktopWindows( + false, + onScreenWindowsOnly: true + ) + + guard let display = content.displays.first else { + return false + } + + let filter = SCContentFilter(display: display, excludingWindows: []) + let configuration = SCStreamConfiguration() + configuration.width = 64 + configuration.height = 64 + + let image = try await SCScreenshotManager.captureImage( + contentFilter: filter, + configuration: configuration + ) + + return image.width > 0 && image.height > 0 + } catch { + print("⚠️ Screen content validation failed: \(error)") + return false + } + } + + private static func isScreenContentPermissionDenied(_ error: Error) -> Bool { + let nsError = error as NSError + + if nsError.domain == "com.apple.ScreenCaptureKit.SCStreamErrorDomain", + nsError.code == -3801 { + return true + } + + return nsError.localizedDescription.contains("TCC") + } + /// If the cursor is in transient mode (user toggled "Show Clicky" off), /// waits for TTS playback and any pointing animation to finish, then /// fades out the overlay after a 1-second pause. Cancelled automatically @@ -755,13 +938,14 @@ final class CompanionManager: ObservableObject { } } - /// Speaks a hardcoded error message using macOS system TTS when API - /// credits run out. Uses NSSpeechSynthesizer so it works even when - /// ElevenLabs is down. + /// Speaks a hardcoded error message using the system speech synthesizer + /// when API credits run out, so the user still hears a fallback even if + /// ElevenLabs never returns audio. private func speakCreditsErrorFallback() { let utterance = "I'm all out of credits. Please DM Farza and tell him to bring me back to life." - let synthesizer = NSSpeechSynthesizer() - synthesizer.startSpeaking(utterance) + let speechUtterance = AVSpeechUtterance(string: utterance) + fallbackSpeechSynthesizer.stopSpeaking(at: .immediate) + fallbackSpeechSynthesizer.speak(speechUtterance) voiceState = .responding } @@ -856,7 +1040,9 @@ final class CompanionManager: ObservableObject { queue: .main ) { [weak self] in ClickyAnalytics.trackOnboardingDemoTriggered() - self?.performOnboardingDemoInteraction() + Task { @MainActor [weak self] in + self?.performOnboardingDemoInteraction() + } } // Fade out and clean up when the video finishes @@ -865,16 +1051,14 @@ final class CompanionManager: ObservableObject { object: player.currentItem, queue: .main ) { [weak self] _ in - guard let self else { return } ClickyAnalytics.trackOnboardingVideoCompleted() - self.onboardingVideoOpacity = 0.0 - // Wait for the 2s fade-out animation to complete before tearing down - DispatchQueue.main.asyncAfter(deadline: .now() + 2.0) { + Task { @MainActor [weak self] in + guard let self else { return } + self.onboardingVideoOpacity = 0.0 + try? await Task.sleep(nanoseconds: 2_000_000_000) self.tearDownOnboardingVideo() - // After the video disappears, stream in the prompt to try talking - DispatchQueue.main.asyncAfter(deadline: .now() + 0.3) { - self.startOnboardingPromptStream() - } + try? await Task.sleep(nanoseconds: 300_000_000) + self.startOnboardingPromptStream() } } } @@ -895,6 +1079,7 @@ final class CompanionManager: ObservableObject { private func startOnboardingPromptStream() { let message = "press control + option and introduce yourself" + onboardingPromptStreamTask?.cancel() onboardingPromptText = "" showOnboardingPrompt = true onboardingPromptOpacity = 0.0 @@ -903,26 +1088,26 @@ final class CompanionManager: ObservableObject { onboardingPromptOpacity = 1.0 } - var currentIndex = 0 - Timer.scheduledTimer(withTimeInterval: 0.03, repeats: true) { timer in - guard currentIndex < message.count else { - timer.invalidate() - // Auto-dismiss after 10 seconds - DispatchQueue.main.asyncAfter(deadline: .now() + 10.0) { - guard self.showOnboardingPrompt else { return } - withAnimation(.easeOut(duration: 0.3)) { - self.onboardingPromptOpacity = 0.0 - } - DispatchQueue.main.asyncAfter(deadline: .now() + 0.35) { - self.showOnboardingPrompt = false - self.onboardingPromptText = "" - } - } - return + onboardingPromptStreamTask = Task { @MainActor [weak self] in + guard let self else { return } + + for character in message { + guard !Task.isCancelled else { return } + self.onboardingPromptText.append(character) + try? await Task.sleep(nanoseconds: 30_000_000) } - let index = message.index(message.startIndex, offsetBy: currentIndex) - self.onboardingPromptText.append(message[index]) - currentIndex += 1 + + try? await Task.sleep(nanoseconds: 10_000_000_000) + guard !Task.isCancelled, self.showOnboardingPrompt else { return } + + withAnimation(.easeOut(duration: 0.3)) { + self.onboardingPromptOpacity = 0.0 + } + + try? await Task.sleep(nanoseconds: 350_000_000) + guard !Task.isCancelled else { return } + self.showOnboardingPrompt = false + self.onboardingPromptText = "" } } diff --git a/leanring-buddy/CompanionPanelView.swift b/leanring-buddy/CompanionPanelView.swift index 76789b4c..11d16a8c 100644 --- a/leanring-buddy/CompanionPanelView.swift +++ b/leanring-buddy/CompanionPanelView.swift @@ -31,6 +31,10 @@ struct CompanionPanelView: View { modelPickerRow .padding(.horizontal, 16) + + voiceLanguagePickerRow + .padding(.horizontal, 16) + .padding(.top, 8) } if !companionManager.allPermissionsGranted { @@ -256,9 +260,7 @@ struct CompanionPanelView: View { screenRecordingPermissionRow - if companionManager.hasScreenRecordingPermission { - screenContentPermissionRow - } + screenContentPermissionRow } } @@ -392,6 +394,7 @@ struct CompanionPanelView: View { private var screenContentPermissionRow: some View { let isGranted = companionManager.hasScreenContentPermission + let canRequestScreenContentPermission = companionManager.hasScreenRecordingPermission return HStack { HStack(spacing: 8) { Image(systemName: "eye") @@ -399,9 +402,21 @@ struct CompanionPanelView: View { .foregroundColor(isGranted ? DS.Colors.textTertiary : DS.Colors.warning) .frame(width: 16) - Text("Screen Content") - .font(.system(size: 13, weight: .medium)) - .foregroundColor(DS.Colors.textSecondary) + VStack(alignment: .leading, spacing: 1) { + Text("Screen Content") + .font(.system(size: 13, weight: .medium)) + .foregroundColor(DS.Colors.textSecondary) + + if !isGranted { + Text( + canRequestScreenContentPermission + ? "Approve the ScreenCaptureKit picker after tapping Grant" + : "Grant Screen Recording first" + ) + .font(.system(size: 10)) + .foregroundColor(DS.Colors.textTertiary) + } + } } Spacer() @@ -431,6 +446,8 @@ struct CompanionPanelView: View { } .buttonStyle(.plain) .pointerCursor() + .disabled(!canRequestScreenContentPermission) + .opacity(canRequestScreenContentPermission ? 1 : 0.45) } } .padding(.vertical, 6) @@ -641,6 +658,55 @@ struct CompanionPanelView: View { .pointerCursor() } + // MARK: - Voice Language Picker + + private var voiceLanguagePickerRow: some View { + HStack { + Text("Voice") + .font(.system(size: 13, weight: .medium)) + .foregroundColor(DS.Colors.textSecondary) + + Spacer() + + HStack(spacing: 0) { + ForEach(CompanionManager.VoiceLanguage.allCases) { language in + voiceLanguageOptionButton( + label: language.displayName, + language: language + ) + } + } + .background( + RoundedRectangle(cornerRadius: 6, style: .continuous) + .fill(Color.white.opacity(0.06)) + ) + .overlay( + RoundedRectangle(cornerRadius: 6, style: .continuous) + .stroke(DS.Colors.borderSubtle, lineWidth: 0.5) + ) + } + .padding(.vertical, 4) + } + + private func voiceLanguageOptionButton(label: String, language: CompanionManager.VoiceLanguage) -> some View { + let isSelected = companionManager.selectedVoiceLanguage == language + return Button(action: { + companionManager.setSelectedVoiceLanguage(language) + }) { + Text(label) + .font(.system(size: 11, weight: .medium)) + .foregroundColor(isSelected ? DS.Colors.textPrimary : DS.Colors.textTertiary) + .padding(.horizontal, 10) + .padding(.vertical, 5) + .background( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .fill(isSelected ? Color.white.opacity(0.1) : Color.clear) + ) + } + .buttonStyle(.plain) + .pointerCursor() + } + // MARK: - DM Farza Button private var dmFarzaButton: some View { diff --git a/leanring-buddy/CompanionResponseOverlay.swift b/leanring-buddy/CompanionResponseOverlay.swift index a11c6240..7755160e 100644 --- a/leanring-buddy/CompanionResponseOverlay.swift +++ b/leanring-buddy/CompanionResponseOverlay.swift @@ -177,7 +177,7 @@ final class CompanionResponseOverlayManager { context.duration = 0.4 overlayPanel.animator().alphaValue = 0 }, completionHandler: { [weak self] in - Task { @MainActor in + Task { @MainActor [weak self] in self?.hideOverlay() } }) diff --git a/leanring-buddy/ElevenLabsTTSClient.swift b/leanring-buddy/ElevenLabsTTSClient.swift index 35545c9d..91b6a564 100644 --- a/leanring-buddy/ElevenLabsTTSClient.swift +++ b/leanring-buddy/ElevenLabsTTSClient.swift @@ -15,10 +15,10 @@ final class ElevenLabsTTSClient { private let proxyURL: URL private let session: URLSession - /// The audio player for the current TTS playback. Kept alive so the - /// audio finishes playing even if the caller doesn't hold a reference. private var audioPlayer: AVAudioPlayer? + var languageCode: String? + init(proxyURL: String) { self.proxyURL = URL(string: proxyURL)! @@ -28,15 +28,13 @@ final class ElevenLabsTTSClient { self.session = URLSession(configuration: configuration) } - /// Sends `text` to ElevenLabs TTS and plays the resulting audio. - /// Throws on network or decoding errors. Cancellation-safe. func speakText(_ text: String) async throws { var request = URLRequest(url: proxyURL) request.httpMethod = "POST" request.setValue("application/json", forHTTPHeaderField: "Content-Type") request.setValue("audio/mpeg", forHTTPHeaderField: "Accept") - let body: [String: Any] = [ + var body: [String: Any] = [ "text": text, "model_id": "eleven_flash_v2_5", "voice_settings": [ @@ -45,6 +43,10 @@ final class ElevenLabsTTSClient { ] ] + if let languageCode { + body["language_code"] = languageCode + } + request.httpBody = try JSONSerialization.data(withJSONObject: body) let (data, response) = try await session.data(for: request) diff --git a/leanring-buddy/MenuBarPanelManager.swift b/leanring-buddy/MenuBarPanelManager.swift index e5eb98de..4a9b3ad9 100644 --- a/leanring-buddy/MenuBarPanelManager.swift +++ b/leanring-buddy/MenuBarPanelManager.swift @@ -45,7 +45,9 @@ final class MenuBarPanelManager: NSObject { object: nil, queue: .main ) { [weak self] _ in - self?.hidePanel() + Task { @MainActor [weak self] in + self?.hidePanel() + } } } diff --git a/leanring-buddy/OpenAIAudioTranscriptionProvider.swift b/leanring-buddy/OpenAIAudioTranscriptionProvider.swift index 75092092..3a4dcce3 100644 --- a/leanring-buddy/OpenAIAudioTranscriptionProvider.swift +++ b/leanring-buddy/OpenAIAudioTranscriptionProvider.swift @@ -35,6 +35,7 @@ final class OpenAIAudioTranscriptionProvider: BuddyTranscriptionProvider { func startStreamingSession( keyterms: [String], + languageCode: String?, onTranscriptUpdate: @escaping (String) -> Void, onFinalTranscriptReady: @escaping (String) -> Void, onError: @escaping (Error) -> Void @@ -49,6 +50,7 @@ final class OpenAIAudioTranscriptionProvider: BuddyTranscriptionProvider { apiKey: apiKey, modelName: modelName, keyterms: keyterms, + languageCode: languageCode, onTranscriptUpdate: onTranscriptUpdate, onFinalTranscriptReady: onFinalTranscriptReady, onError: onError @@ -69,6 +71,7 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription private let apiKey: String private let modelName: String private let keyterms: [String] + private let languageCode: String? private let onTranscriptUpdate: (String) -> Void private let onFinalTranscriptReady: (String) -> Void private let onError: (Error) -> Void @@ -89,6 +92,7 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription apiKey: String, modelName: String, keyterms: [String], + languageCode: String?, onTranscriptUpdate: @escaping (String) -> Void, onFinalTranscriptReady: @escaping (String) -> Void, onError: @escaping (Error) -> Void @@ -96,6 +100,7 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription self.apiKey = apiKey self.modelName = modelName self.keyterms = keyterms + self.languageCode = languageCode self.onTranscriptUpdate = onTranscriptUpdate self.onFinalTranscriptReady = onFinalTranscriptReady self.onError = onError @@ -232,11 +237,14 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription value: modelName, usingBoundary: boundary ) + + let languageValue = languageCode ?? "en" requestBodyData.appendMultipartFormField( named: "language", - value: "en", + value: languageValue, usingBoundary: boundary ) + requestBodyData.appendMultipartFormField( named: "response_format", value: "json", diff --git a/leanring-buddy/OverlayWindow.swift b/leanring-buddy/OverlayWindow.swift index 884ebcbf..2a8dbffb 100644 --- a/leanring-buddy/OverlayWindow.swift +++ b/leanring-buddy/OverlayWindow.swift @@ -87,7 +87,7 @@ struct NavigationBubbleSizePreferenceKey: PreferenceKey { /// The buddy's behavioral mode. Controls whether it follows the cursor, /// is flying toward a detected UI element, or is pointing at an element. -enum BuddyNavigationMode { +enum BuddyNavigationMode: Sendable { /// Default — buddy follows the mouse cursor with spring animation case followingCursor /// Buddy is animating toward a detected UI element location @@ -261,7 +261,7 @@ struct BlueCursorView: View { // Navigation pointer bubble — shown when buddy arrives at a detected element. // Pops in with a scale-bounce (0.5x → 1.0x spring) and a bright initial // glow that settles, creating a "materializing" effect. - if buddyNavigationMode == .pointingAtTarget && !navigationBubbleText.isEmpty { + if isPointingAtTargetMode && !navigationBubbleText.isEmpty { Text(navigationBubbleText) .font(.system(size: 11, weight: .medium)) .foregroundColor(.white) @@ -311,14 +311,14 @@ struct BlueCursorView: View { .opacity(buddyIsVisibleOnThisScreen && (companionManager.voiceState == .idle || companionManager.voiceState == .responding) ? cursorOpacity : 0) .position(cursorPosition) .animation( - buddyNavigationMode == .followingCursor + isFollowingCursorMode ? .spring(response: 0.2, dampingFraction: 0.6, blendDuration: 0) : nil, value: cursorPosition ) .animation(.easeIn(duration: 0.25), value: companionManager.voiceState) .animation( - buddyNavigationMode == .navigatingToTarget ? nil : .easeInOut(duration: 0.3), + isNavigatingToTargetMode ? nil : .easeInOut(duration: 0.3), value: triangleRotationDegrees ) @@ -368,7 +368,7 @@ struct BlueCursorView: View { navigationAnimationTimer?.invalidate() companionManager.tearDownOnboardingVideo() } - .onChange(of: companionManager.detectedElementScreenLocation) { newLocation in + .onChange(of: companionManager.detectedElementScreenLocation) { _, newLocation in // When a UI element location is detected, navigate the buddy to // that position so it points at the element. guard let screenLocation = newLocation, @@ -406,6 +406,27 @@ struct BlueCursorView: View { } } + private var isFollowingCursorMode: Bool { + if case .followingCursor = buddyNavigationMode { + return true + } + return false + } + + private var isNavigatingToTargetMode: Bool { + if case .navigatingToTarget = buddyNavigationMode { + return true + } + return false + } + + private var isPointingAtTargetMode: Bool { + if case .pointingAtTarget = buddyNavigationMode { + return true + } + return false + } + // MARK: - Cursor Tracking private func startTrackingCursor() { @@ -417,7 +438,7 @@ struct BlueCursorView: View { // mouse movement — it completes its full animation and return flight. // Only during the RETURN flight do we allow cursor movement to cancel // (so the buddy snaps to following if the user moves while it's flying back). - if self.buddyNavigationMode == .navigatingToTarget && self.isReturningToCursor { + if self.isNavigatingToTargetMode && self.isReturningToCursor { let currentMouseInSwiftUI = self.convertScreenPointToSwiftUICoordinates(mouseLocation) let distanceFromNavigationStart = hypot( currentMouseInSwiftUI.x - self.cursorPositionWhenNavigationStarted.x, @@ -430,7 +451,7 @@ struct BlueCursorView: View { } // During forward navigation or pointing, just skip cursor tracking - if self.buddyNavigationMode != .followingCursor { + if !self.isFollowingCursorMode { return } @@ -483,7 +504,7 @@ struct BlueCursorView: View { isReturningToCursor = false animateBezierFlightArc(to: clampedTarget) { - guard self.buddyNavigationMode == .navigatingToTarget else { return } + guard self.isNavigatingToTargetMode else { return } self.startPointingAtElement() } } @@ -590,10 +611,10 @@ struct BlueCursorView: View { streamNavigationBubbleCharacter(phrase: pointerPhrase, characterIndex: 0) { // All characters streamed — hold for 3 seconds, then fly back DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { - guard self.buddyNavigationMode == .pointingAtTarget else { return } + guard self.isPointingAtTargetMode else { return } self.navigationBubbleOpacity = 0.0 DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) { - guard self.buddyNavigationMode == .pointingAtTarget else { return } + guard self.isPointingAtTargetMode else { return } self.startFlyingBackToCursor() } } @@ -607,7 +628,7 @@ struct BlueCursorView: View { characterIndex: Int, onComplete: @escaping () -> Void ) { - guard buddyNavigationMode == .pointingAtTarget else { return } + guard isPointingAtTargetMode else { return } guard characterIndex < phrase.count else { onComplete() return diff --git a/worker/package-lock.json b/worker/package-lock.json index c2383cc1..4d043455 100644 --- a/worker/package-lock.json +++ b/worker/package-lock.json @@ -643,9 +643,6 @@ "arm" ], "dev": true, - "libc": [ - "glibc" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -663,9 +660,6 @@ "arm64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -683,9 +677,6 @@ "s390x" ], "dev": true, - "libc": [ - "glibc" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -703,9 +694,6 @@ "x64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -723,9 +711,6 @@ "arm64" ], "dev": true, - "libc": [ - "musl" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -743,9 +728,6 @@ "x64" ], "dev": true, - "libc": [ - "musl" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -763,9 +745,6 @@ "arm" ], "dev": true, - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -789,9 +768,6 @@ "arm64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -815,9 +791,6 @@ "s390x" ], "dev": true, - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -841,9 +814,6 @@ "x64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -867,9 +837,6 @@ "arm64" ], "dev": true, - "libc": [ - "musl" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -893,9 +860,6 @@ "x64" ], "dev": true, - "libc": [ - "musl" - ], "license": "Apache-2.0", "optional": true, "os": [ diff --git a/worker/src/index.ts b/worker/src/index.ts index 2e3e9345..bbec5f45 100644 --- a/worker/src/index.ts +++ b/worker/src/index.ts @@ -13,9 +13,15 @@ interface Env { ANTHROPIC_API_KEY: string; ELEVENLABS_API_KEY: string; ELEVENLABS_VOICE_ID: string; + ELEVENLABS_CHINESE_VOICE_ID?: string; ASSEMBLYAI_API_KEY: string; } +interface TextToSpeechRequestBody { + language_code?: string; + [key: string]: unknown; +} + export default { async fetch(request: Request, env: Env): Promise { const url = new URL(request.url); @@ -107,8 +113,21 @@ async function handleTranscribeToken(env: Env): Promise { } async function handleTTS(request: Request, env: Env): Promise { - const body = await request.text(); - const voiceId = env.ELEVENLABS_VOICE_ID; + let requestBody: TextToSpeechRequestBody; + + try { + requestBody = await request.json() as TextToSpeechRequestBody; + } catch { + return new Response( + JSON.stringify({ error: "Invalid JSON body." }), + { status: 400, headers: { "content-type": "application/json" } } + ); + } + + const voiceId = resolveVoiceIdForLanguage( + requestBody.language_code, + env + ); const response = await fetch( `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, @@ -119,7 +138,7 @@ async function handleTTS(request: Request, env: Env): Promise { "content-type": "application/json", accept: "audio/mpeg", }, - body, + body: JSON.stringify(requestBody), } ); @@ -139,3 +158,19 @@ async function handleTTS(request: Request, env: Env): Promise { }, }); } + +function resolveVoiceIdForLanguage( + languageCode: string | undefined, + env: Env +): string { + const normalizedLanguageCode = languageCode?.trim().toLowerCase(); + + if ( + normalizedLanguageCode?.startsWith("zh") && + env.ELEVENLABS_CHINESE_VOICE_ID?.trim() + ) { + return env.ELEVENLABS_CHINESE_VOICE_ID.trim(); + } + + return env.ELEVENLABS_VOICE_ID; +} diff --git a/worker/wrangler.toml b/worker/wrangler.toml index b4bdbf38..056ab903 100644 --- a/worker/wrangler.toml +++ b/worker/wrangler.toml @@ -4,3 +4,4 @@ compatibility_date = "2024-01-01" [vars] ELEVENLABS_VOICE_ID = "kPzsL2i3teMYv0FxEYQ6" +ELEVENLABS_CHINESE_VOICE_ID = ""