From acb833f720e5281d2d2d2e9679258cd306f87c03 Mon Sep 17 00:00:00 2001 From: Al Jami Islam Anik Date: Thu, 2 Apr 2026 16:34:52 +0600 Subject: [PATCH 1/2] feat: integrate Ollama and Chatterbox TTS providers, enhance README and CLI options - Added support for Ollama as a local LLM and image provider, including interactive model selection. - Introduced Chatterbox TTS for text-to-speech functionality, with setup instructions and requirements. - Updated README to reflect new prerequisites and usage instructions for local development. - Enhanced CLI options to include new parameters for Ollama and Chatterbox configurations. - Improved cost estimation logic to account for free local providers. - Added validation for local provider availability and setup processes. This update significantly expands the capabilities of the pipeline for local development and usage. --- .gitignore | 8 +- README.md | 98 ++++++++++++++++++- scripts/chatterbox_tts.py | 89 ++++++++++++++++++ src/agents/creative-director.ts | 15 ++- src/cli/args.test.ts | 6 ++ src/cli/args.ts | 26 +++++- src/cli/chatterbox-setup.ts | 146 +++++++++++++++++++++++++++++ src/cli/cost-estimator.ts | 40 ++++++-- src/cli/ollama-setup.ts | 152 ++++++++++++++++++++++++++++++ src/cli/validate-env.ts | 72 +++++++++++--- src/index.ts | 161 +++++++++++++++++++++++++++++++- src/pipeline/orchestrator.ts | 107 +++++++++++++++------ src/providers/factory.ts | 27 +++++- src/providers/image/ollama.ts | 64 +++++++++++++ src/providers/llm/ollama.ts | 105 +++++++++++++++++++++ src/providers/tts/chatterbox.ts | 127 +++++++++++++++++++++++++ src/schema/providers.ts | 6 +- 17 files changed, 1178 insertions(+), 71 deletions(-) create mode 100644 scripts/chatterbox_tts.py create mode 100644 src/cli/chatterbox-setup.ts create mode 100644 src/cli/ollama-setup.ts create mode 100644 src/providers/image/ollama.ts create mode 100644 src/providers/llm/ollama.ts create mode 100644 src/providers/tts/chatterbox.ts diff --git a/.gitignore b/.gitignore index 45e9841..d901635 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,10 @@ dist/ *.tsbuildinfo .env staging_docs/ -output/ \ No newline at end of file +output/ + +# macOS +.DS_Store + +# Python version managers +.python-version \ No newline at end of file diff --git a/README.md b/README.md index 5ead90b..84eefbb 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ docker compose run worker npx tsx src/index.ts --yes "5 stoic lessons that chang ### Local development -**Prerequisites:** Node.js 22+, pnpm, ffprobe (for stock video duration detection) +**Prerequisites:** Node.js 22+, pnpm, ffprobe (for stock video duration detection), Python 3.11 or 3.12 (only required for `--tts-provider chatterbox` — Python 3.13+ is not supported due to PyTorch/OpenMP issues) ```bash git clone https://github.com/tsensei/OpenReels.git @@ -99,18 +99,110 @@ pnpm start "your topic" --archetype anime_illustration --provider openai **Optional:** `PEXELS_API_KEY` ([Pexels](https://www.pexels.com/api/)), `PIXABAY_API_KEY` ([Pixabay](https://pixabay.com/api/docs/)) for stock footage (free registration) +### Zero-API-key local mode (Ollama + Chatterbox) + +Run the full pipeline with **no API keys** using local open-source models. + +#### One-time setup + +```bash +# 1. Install and start Ollama (macOS) +brew install ollama +ollama serve # keep running in a separate terminal + +# 2. 
Pull your preferred LLM model (one-time — pick one) +ollama pull llama3.1:8b # ~5 GB, fast and reliable +ollama pull gemma3:9b # ~6 GB, good quality +ollama pull qwen2.5:7b # ~5 GB, multilingual + +# 3. Pull an image generation model — macOS only (one-time — pick one) +ollama pull x/flux2-klein:4b # ~6 GB, fastest +ollama pull x/flux2-klein:9b # ~12 GB, higher quality +ollama pull x/z-image-turbo:fp8 # ~13 GB, photorealistic +``` + +> **Interactive model selection:** When you run with `--provider ollama`, OpenReels will show you all pulled models and let you choose interactively. No need to memorise model names. + +> **Chatterbox is auto-installed:** OpenReels automatically creates an isolated Python venv at `~/.openreels/chatterbox-venv` and installs `chatterbox-tts` on first use. You only need Python 3.11 or 3.12 on your system (`brew install python@3.12` on macOS). Python 3.13+ is not supported. + +> **First run note:** Chatterbox Turbo downloads ~1.5 GB of model weights on first use. This is automatic and cached locally (`~/.cache/huggingface/`). Expect 2–5 minutes on a typical connection. + +> **GPU recommended:** Chatterbox is significantly faster on Apple Silicon (MPS) or a CUDA GPU. CPU generation works but is slow (10–30× slower than real-time). + +#### Running with no API keys + +```bash +# Interactive — OpenReels will prompt you to choose a model and describe your topic +pnpm start "your topic" \ + --provider ollama \ + --tts-provider chatterbox \ + --image-provider ollama + +# Non-interactive — supply context via --brief and pin specific models +pnpm start "your topic" \ + --provider ollama \ + --tts-provider chatterbox \ + --image-provider ollama \ + --ollama-model llama3.1:8b \ + --ollama-image-model x/flux2-klein:4b \ + --brief "Solar panels cost $10k upfront but save $50k over 20 years. Mood: informative." +``` + +> **Linux/Windows users:** Ollama image generation is currently macOS-only. Use `--image-provider gemini` (free tier available) or `--image-provider openai` instead, and provide the relevant API key. + +#### Mix and match — combine free and paid providers + +Each provider is independent. You can freely mix local and cloud options to get the best trade-off between cost, speed, and quality. 
+
+```bash
+# Best quality script + free TTS + free images (macOS for now)
+# Requires: ANTHROPIC_API_KEY
+pnpm start "your topic" \
+  --provider anthropic \
+  --tts-provider chatterbox \
+  --image-provider ollama
+
+# Free script + paid TTS for higher quality voice + free images (macOS for now)
+# Requires: ELEVENLABS_API_KEY
+pnpm start "your topic" \
+  --provider ollama \
+  --tts-provider elevenlabs \
+  --image-provider ollama
+
+# Free everything on Linux/Windows (Ollama image gen is macOS-only, use Gemini instead)
+# Requires: GOOGLE_API_KEY
+pnpm start "your topic" \
+  --provider ollama \
+  --tts-provider chatterbox \
+  --image-provider gemini
+
+# Free script + free TTS + best image quality (OpenAI DALL-E)
+# Requires: OPENAI_API_KEY
+pnpm start "your topic" \
+  --provider ollama \
+  --tts-provider chatterbox \
+  --image-provider openai
+```
+
 ### CLI flags
 
 | Flag | Description | Default |
 |------|-------------|---------|
 | `--archetype <name>` | Override visual archetype | LLM chooses |
-| `--provider <name>` | LLM provider (`anthropic` or `openai`) | `anthropic` |
-| `--tts-provider <name>` | TTS provider (`elevenlabs` or `inworld`) | `elevenlabs` |
+| `--provider <name>` | LLM provider (`anthropic`, `openai`, `ollama`) | `anthropic` |
+| `--tts-provider <name>` | TTS provider (`elevenlabs`, `inworld`, `chatterbox`) | `elevenlabs` |
+| `-i, --image-provider <name>` | Image provider (`gemini`, `openai`, `ollama`) | `gemini` |
 | `--platform <name>` | Target platform (`youtube`, `tiktok`, `instagram`) | `youtube` |
 | `--dry-run` | Output DirectorScore JSON without generating assets | off |
 | `--preview` | Open Remotion Studio after rendering | off |
 | `-o, --output <dir>` | Output directory | `./output` |
 | `-y, --yes` | Auto-confirm cost estimation (for Docker/CI) | off |
+| `--brief <text>` | Topic context for Ollama mode (skips interactive prompt) | — |
+| `--ollama-model <name>` | Ollama LLM model name | interactive selection |
+| `--ollama-image-model <name>` | Ollama image generation model name | interactive selection |
+| `--ollama-host <url>` | Ollama API host URL | `http://localhost:11434` |
+| `--chatterbox-device <device>` | PyTorch device for Chatterbox (`cpu`, `cuda`, `mps`) | auto-detected |
+| `--chatterbox-audio-prompt <path>` | Reference WAV for Chatterbox voice cloning (5–10s) | — |
 
 ## Archetypes
 
diff --git a/scripts/chatterbox_tts.py b/scripts/chatterbox_tts.py
new file mode 100644
index 0000000..1887623
--- /dev/null
+++ b/scripts/chatterbox_tts.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Chatterbox Turbo TTS bridge for OpenReels.
+
+Generates speech from text using ResembleAI's Chatterbox Turbo model and writes:
+  - A WAV audio file
+  - A JSON file containing approximate word-level timestamps
+
+Usage:
+    python scripts/chatterbox_tts.py \
+        --text "Your script here" \
+        --out /tmp/output.wav \
+        --timestamps /tmp/timestamps.json \
+        [--device cpu|cuda|mps] \
+        [--audio-prompt /path/to/reference.wav]
+
+First run will download ~1.5 GB of model weights automatically.
+Subsequent runs use the cached weights (usually ~/.cache/huggingface/).
+ +Requirements: + pip install chatterbox-tts +""" + +import argparse +import json +import sys + + +def main() -> None: + parser = argparse.ArgumentParser(description="Chatterbox Turbo TTS bridge") + parser.add_argument("--text", required=True, help="Text to synthesize") + parser.add_argument("--out", required=True, help="Output WAV file path") + parser.add_argument("--timestamps", required=True, help="Output JSON timestamps file path") + parser.add_argument( + "--device", + default="cpu", + choices=["cpu", "cuda", "mps"], + help="PyTorch device (default: cpu; use mps on Apple Silicon, cuda on NVIDIA GPU)", + ) + parser.add_argument( + "--audio-prompt", + default=None, + help="Optional path to a reference WAV file for zero-shot voice cloning (5–10s recommended)", + ) + args = parser.parse_args() + + try: + import torchaudio as ta + from chatterbox.tts_turbo import ChatterboxTurboTTS + except ImportError as e: + print( + f"ERROR: {e}\n" + "Chatterbox Turbo is not installed.\n" + "Install it with: pip install chatterbox-tts\n" + "Python 3.11 is strongly recommended.", + file=sys.stderr, + ) + sys.exit(1) + + print("Loading Chatterbox Turbo model (first run downloads ~1.5 GB)...", file=sys.stderr) + model = ChatterboxTurboTTS.from_pretrained(device=args.device) + + generate_kwargs: dict = {"audio_prompt_path": args.audio_prompt} if args.audio_prompt else {} + wav = model.generate(args.text, **generate_kwargs) + + ta.save(args.out, wav, model.sr) + print(f"Audio saved to: {args.out}", file=sys.stderr) + + # Chatterbox Turbo does not expose word-level timestamps natively. + # We approximate by distributing words evenly across the total audio duration. + # Caption timing will be approximate but functional. + duration_sec: float = wav.shape[-1] / model.sr + words = args.text.split() + if not words: + timestamps = [] + else: + step = duration_sec / len(words) + timestamps = [ + {"word": word, "start": round(i * step, 4), "end": round((i + 1) * step, 4)} + for i, word in enumerate(words) + ] + + with open(args.timestamps, "w", encoding="utf-8") as f: + json.dump(timestamps, f) + print(f"Timestamps saved to: {args.timestamps}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/src/agents/creative-director.ts b/src/agents/creative-director.ts index 3171433..93b1ad8 100644 --- a/src/agents/creative-director.ts +++ b/src/agents/creative-director.ts @@ -34,7 +34,11 @@ export async function generateDirectorScore( llm: LLMProvider, topic: string, researchContext: ResearchResult, - options?: { archetype?: string }, + options?: { + archetype?: string; + /** Restrict which visual types the director may use. Defaults to all four. */ + allowedVisualTypes?: VisualType[]; + }, ): Promise { let systemPrompt = buildDefaultPrompt(); @@ -57,6 +61,13 @@ export async function generateDirectorScore( ? `Use the "${options.archetype}" archetype.` : `Choose from: ${archetypes.join(", ")}`; + const allVisualTypes = VisualType.options; + const allowed: VisualType[] = options?.allowedVisualTypes ?? [...allVisualTypes]; + const visualTypeConstraint = allowed.length === allVisualTypes.length + ? `Use all 4 visual types (ai_image, stock_image, stock_video, text_card).` + : `IMPORTANT: You may ONLY use these visual types: ${allowed.join(", ")}. 
` + + `Do NOT use ${allVisualTypes.filter((t) => !allowed.includes(t)).join(" or ")} — those providers are not available.\n` + const userMessage = `Topic: ${topic} Research context: @@ -69,7 +80,7 @@ Mood: ${researchContext.mood} ${archetypeInstruction} -Create a DirectorScore with 4-7 scenes. Use all 4 visual types (ai_image, stock_image, stock_video, text_card). +Create a DirectorScore with 4-7 scenes. ${visualTypeConstraint} CRITICAL RULE: Never use the same visual_type more than 2 times in a row. Every scene MUST have a script_line (the voiceover text). The first scene should be a strong hook. diff --git a/src/cli/args.test.ts b/src/cli/args.test.ts index c6fb510..4ede5fb 100644 --- a/src/cli/args.test.ts +++ b/src/cli/args.test.ts @@ -15,6 +15,9 @@ describe("CLIOptions type", () => { preview: false, output: "./output", yes: true, + ollamaModel: "llama3.2", + ollamaImageModel: "x/flux2-klein", + ollamaHost: "http://localhost:11434", }; expect(opts.yes).toBe(true); }); @@ -30,6 +33,9 @@ describe("CLIOptions type", () => { preview: false, output: "./output", yes: false, + ollamaModel: "llama3.2", + ollamaImageModel: "x/flux2-klein", + ollamaHost: "http://localhost:11434", }; expect(opts.yes).toBe(false); }); diff --git a/src/cli/args.ts b/src/cli/args.ts index 6823260..843d599 100644 --- a/src/cli/args.ts +++ b/src/cli/args.ts @@ -16,6 +16,14 @@ export interface CLIOptions { preview: boolean; output: string; yes: boolean; + brief?: string; + /** Explicitly provided via --ollama-model. When undefined, model is selected interactively. */ + ollamaModel?: string; + /** Explicitly provided via --ollama-image-model. When undefined, model is selected interactively. */ + ollamaImageModel?: string; + ollamaHost: string; + chatterboxDevice?: string; + chatterboxAudioPrompt?: string; } export function parseArgs(): CLIOptions { @@ -28,17 +36,17 @@ export function parseArgs(): CLIOptions { .argument("", "The topic for your video") .addOption( new Option("-p, --provider ", "LLM provider") - .choices(["anthropic", "openai"]) + .choices(["anthropic", "openai", "ollama"]) .default("anthropic"), ) .addOption( new Option("-i, --image-provider ", "Image generation provider") - .choices(["gemini", "openai"]) + .choices(["gemini", "openai", "ollama"]) .default("gemini"), ) .addOption( new Option("--tts-provider ", "TTS provider") - .choices(["elevenlabs", "inworld"]) + .choices(["elevenlabs", "inworld", "chatterbox"]) .default("elevenlabs"), ) .option("-a, --archetype ", "Visual archetype override") @@ -47,6 +55,12 @@ export function parseArgs(): CLIOptions { .option("--preview", "Open Remotion Studio preview after rendering", false) .option("-o, --output ", "Output directory", "./output") .option("-y, --yes", "Auto-confirm cost estimation prompt (non-interactive mode)", false) + .option("--brief ", "Topic context for Ollama mode (skips interactive prompt)") + .option("--ollama-model ", "Ollama LLM model name (default: interactive selection)") + .option("--ollama-image-model ", "Ollama image generation model name (default: interactive selection)") + .option("--ollama-host ", "Ollama API host", "http://localhost:11434") + .option("--chatterbox-device ", "PyTorch device for Chatterbox TTS (cpu, cuda, mps)") + .option("--chatterbox-audio-prompt ", "Path to reference WAV for Chatterbox voice cloning") .parse(); const topic = program.args[0] ?? 
""; @@ -67,5 +81,11 @@ export function parseArgs(): CLIOptions { preview: opts["preview"] as boolean, output: opts["output"] as string, yes: opts["yes"] as boolean, + brief: opts["brief"] as string | undefined, + ollamaModel: opts["ollamaModel"] as string | undefined, + ollamaImageModel: opts["ollamaImageModel"] as string | undefined, + ollamaHost: opts["ollamaHost"] as string, + chatterboxDevice: opts["chatterboxDevice"] as string | undefined, + chatterboxAudioPrompt: opts["chatterboxAudioPrompt"] as string | undefined, }; } diff --git a/src/cli/chatterbox-setup.ts b/src/cli/chatterbox-setup.ts new file mode 100644 index 0000000..ab48967 --- /dev/null +++ b/src/cli/chatterbox-setup.ts @@ -0,0 +1,146 @@ +/** + * Chatterbox Turbo local TTS setup: Python venv creation and package installation. + */ + +import { spawnSync } from "node:child_process"; +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; + +const VENV_DIR = path.join(os.homedir(), ".openreels", "chatterbox-venv"); + +/** + * Ensures chatterbox-tts is available in a managed venv at ~/.openreels/chatterbox-venv. + * + * Strategy (in order): + * 1. If `uv` is available, use `uv venv --python 3.12` + `uv pip install` — handles + * uv-managed Pythons correctly and is 10-100x faster than pip. + * 2. Otherwise fall back to finding a system python3.12/3.11 and using `python -m venv`. + * + * On subsequent runs the venv is detected in ~0ms and skips all setup. + * + * @returns Path to the Python binary inside the venv. + */ +export async function ensureChatterboxVenv(): Promise { + const venvPython = process.platform === "win32" + ? path.join(VENV_DIR, "Scripts", "python.exe") + : path.join(VENV_DIR, "bin", "python"); + + // Fast path: venv already exists and all required packages are importable + if (fs.existsSync(venvPython)) { + const check = spawnSync( + venvPython, + ["-c", "import chatterbox; import torchaudio; import pkg_resources"], + { encoding: "utf-8" }, + ); + if (check.status === 0) return venvPython; + console.info(`\nChatterbox venv found but missing dependencies — reinstalling...\n`); + } else { + console.info( + `\n─────────────────────────────────────────────────────────────\n` + + ` Chatterbox Turbo: one-time setup\n` + + ` Creating isolated Python environment at:\n` + + ` ${VENV_DIR}\n` + + ` Installing chatterbox-tts... 
this takes a few minutes.\n` + + ` Model weights (~1.5 GB) are downloaded on first generation.\n` + + `─────────────────────────────────────────────────────────────\n`, + ); + fs.mkdirSync(path.dirname(VENV_DIR), { recursive: true }); + } + + const uvBin = findUv(); + + if (uvBin) { + // uv path: works with uv-managed Pythons, no ensurepip issues + const venvResult = spawnSync(uvBin, ["venv", "--python", "3.12", VENV_DIR], { stdio: "inherit" }); + if (venvResult.status !== 0) { + // 3.12 not available to uv, try 3.11 + const fallback = spawnSync(uvBin, ["venv", "--python", "3.11", VENV_DIR], { stdio: "inherit" }); + if (fallback.status !== 0) { + console.error( + `\n✗ uv could not create a venv with Python 3.11 or 3.12.\n\n` + + ` Install Python 3.12 via uv: uv python install 3.12\n` + + ` Or via Homebrew: brew install python@3.12\n`, + ); + process.exit(1); + } + } + const installResult = spawnSync( + uvBin, + ["pip", "install", "--python", venvPython, "chatterbox-tts", "setuptools<70"], + { stdio: "inherit" }, + ); + if (installResult.status !== 0) { + console.error(`\n✗ Failed to install chatterbox-tts via uv.\n`); + process.exit(1); + } + } else { + // Standard path: find a system python3.12 or python3.11 and use python -m venv + const systemPython = findCompatiblePython(); + const venvResult = spawnSync(systemPython, ["-m", "venv", VENV_DIR], { stdio: "inherit" }); + if (venvResult.status !== 0) { + console.error(`\n✗ Failed to create venv: ${systemPython} -m venv ${VENV_DIR}\n`); + process.exit(1); + } + const pipBin = process.platform === "win32" + ? path.join(VENV_DIR, "Scripts", "pip.exe") + : path.join(VENV_DIR, "bin", "pip"); + const installResult = spawnSync(pipBin, ["install", "chatterbox-tts", "setuptools<70"], { + stdio: "inherit", + }); + if (installResult.status !== 0) { + console.error(`\n✗ pip install chatterbox-tts failed.\n`); + process.exit(1); + } + } + + console.info(`\n✓ Chatterbox Turbo ready.\n`); + return venvPython; +} + +/** Returns the path to `uv` if available on PATH or common install locations, otherwise null. */ +function findUv(): string | null { + for (const bin of ["uv", "/opt/homebrew/bin/uv", "/usr/local/bin/uv"]) { + const probe = spawnSync(bin, ["--version"], { encoding: "utf-8" }); + if (probe.status === 0) return bin; + } + return null; +} + +/** + * Finds a system-managed python3.12 or python3.11 (non-uv). + * Used as fallback when uv is not available. + * Checks common absolute paths in addition to PATH so it doesn't miss Homebrew installs. + */ +function findCompatiblePython(): string { + const home = os.homedir(); + const candidates = [ + "python3.12", "python3.11", + "/opt/homebrew/bin/python3.12", "/opt/homebrew/bin/python3.11", + "/usr/local/bin/python3.12", "/usr/local/bin/python3.11", + path.join(home, ".pyenv", "shims", "python3.12"), + path.join(home, ".pyenv", "shims", "python3.11"), + ]; + + for (const bin of candidates) { + const probe = spawnSync(bin, ["--version"], { encoding: "utf-8" }); + if (probe.status !== 0) continue; + const version = (probe.stdout ?? probe.stderr ?? "").trim(); + const match = version.match(/Python 3\.(\d+)/); + if (!match) continue; + const minor = parseInt(match[1] ?? 
"0", 10); + if (minor === 11 || minor === 12) return bin; + } + + console.error( + `\n✗ Python 3.11 or 3.12 is required for Chatterbox Turbo.\n` + + ` Python 3.13+ has known PyTorch/OpenMP issues on macOS.\n\n` + + ` The fastest option is to install via uv (already installed):\n` + + ` uv python install 3.12\n\n` + + ` Or install Python 3.12 directly:\n` + + ` macOS: brew install python@3.12\n` + + ` Linux: sudo apt install python3.12\n` + + ` Windows: https://python.org (download 3.12)\n`, + ); + process.exit(1); +} diff --git a/src/cli/cost-estimator.ts b/src/cli/cost-estimator.ts index 08b7d9e..25d7552 100644 --- a/src/cli/cost-estimator.ts +++ b/src/cli/cost-estimator.ts @@ -84,9 +84,19 @@ export function estimateCost( callCost(TOKEN_ESTIMATES.creativeDirector) + callCost(TOKEN_ESTIMATES.critic) + aiImages * callCost(TOKEN_ESTIMATES.imagePrompter); - const ttsPerChar = ttsProvider === "inworld" ? PRICING.inworldPerChar : PRICING.elevenLabsPerChar; + const ttsPerChar = + ttsProvider === "inworld" + ? PRICING.inworldPerChar + : ttsProvider === "chatterbox" + ? 0 + : PRICING.elevenLabsPerChar; const ttsCost = ttsCharacters * ttsPerChar; - const perImage = imageProvider === "openai" ? PRICING.openaiPerImage : PRICING.geminiPerImage; + const perImage = + imageProvider === "openai" + ? PRICING.openaiPerImage + : imageProvider === "ollama" + ? 0 + : PRICING.geminiPerImage; const imageCost = aiImages * perImage; const totalCost = llmCost + ttsCost + imageCost; @@ -97,7 +107,12 @@ export function formatCostEstimate( breakdown: CostBreakdown, imageProvider: ImageProviderKey = "gemini", ): string { - const perImage = imageProvider === "openai" ? PRICING.openaiPerImage : PRICING.geminiPerImage; + const perImage = + imageProvider === "openai" + ? PRICING.openaiPerImage + : imageProvider === "ollama" + ? 0 + : PRICING.geminiPerImage; return [ `Estimated cost: $${breakdown.totalCost.toFixed(3)}`, ` LLM: $${breakdown.llmCost.toFixed(4)} (${breakdown.details.llmCalls} calls)`, @@ -117,14 +132,27 @@ export function computeActualLLMCost( imageProvider: ImageProviderKey = "gemini", ttsProvider: TTSProviderKey = "elevenlabs", ): ActualCostBreakdown { - const p = PRICING[provider]; + // Ollama and Chatterbox are free local providers — cost is $0 + const p = provider === "ollama" + ? { perInputToken: 0, perOutputToken: 0 } + : PRICING[provider]; const totalInputTokens = usages.reduce((sum, u) => sum + u.inputTokens, 0); const totalOutputTokens = usages.reduce((sum, u) => sum + u.outputTokens, 0); const llmCost = totalInputTokens * p.perInputToken + totalOutputTokens * p.perOutputToken; - const ttsPerChar = ttsProvider === "inworld" ? PRICING.inworldPerChar : PRICING.elevenLabsPerChar; + const ttsPerChar = + ttsProvider === "inworld" + ? PRICING.inworldPerChar + : ttsProvider === "chatterbox" + ? 0 + : PRICING.elevenLabsPerChar; const ttsCost = nonLlm.ttsCharacters * ttsPerChar; - const perImage = imageProvider === "openai" ? PRICING.openaiPerImage : PRICING.geminiPerImage; + const perImage = + imageProvider === "openai" + ? PRICING.openaiPerImage + : imageProvider === "ollama" + ? 0 + : PRICING.geminiPerImage; const imageCost = nonLlm.aiImages * perImage; const totalCost = llmCost + ttsCost + imageCost; diff --git a/src/cli/ollama-setup.ts b/src/cli/ollama-setup.ts new file mode 100644 index 0000000..736c2d0 --- /dev/null +++ b/src/cli/ollama-setup.ts @@ -0,0 +1,152 @@ +/** + * Ollama local provider setup: reachability check and interactive model selection. 
+ */ + +import * as readline from "node:readline"; + +/** + * Curated LLM models known to reliably produce structured JSON output. + * Only 7B+ parameter models are included — smaller models consistently fail + * at the constrained JSON generation this pipeline requires. + * Each entry is the exact tag Ollama expects (name:params). + */ +export const KNOWN_LLM_MODELS = [ + "llama3.1:8b", + "llama3.2:latest", // 3b — borderline but usable + "llama3.3:70b", + "mistral:7b", + "mixtral:8x7b", + "gemma3:9b", + "gemma3:27b", + "qwen2.5:7b", + "qwen2.5:14b", + "phi4:14b", + "deepseek-r1:7b", + "deepseek-r1:14b", +]; + +/** + * The only models that support image generation in Ollama (macOS, experimental). + * https://ollama.com/blog/image-generation + * Full tags are required — Ollama returns 404 for bare names like "x/flux2-klein". + */ +export const KNOWN_IMAGE_MODELS = [ + "x/flux2-klein:4b", + "x/flux2-klein:9b", + "x/z-image-turbo:latest", +]; + +interface OllamaTagsResponse { + models?: Array<{ name: string }>; +} + +/** Checks Ollama is reachable and returns the list of locally pulled model names. */ +export async function checkOllamaReachable(host: string): Promise { + const url = `${host.replace(/\/$/, "")}/api/tags`; + let data: OllamaTagsResponse; + try { + const res = await fetch(url, { signal: AbortSignal.timeout(5000) }); + if (!res.ok) throw new Error(`HTTP ${res.status}`); + data = (await res.json()) as OllamaTagsResponse; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `\n✗ Ollama is not reachable at ${host}\n` + + ` Error: ${msg}\n\n` + + ` → Start Ollama: ollama serve\n` + + ` → Install Ollama: https://ollama.com\n` + + ` → Or override host with: --ollama-host \n`, + ); + process.exit(1); + } + + return (data.models ?? []).map((m) => m.name); +} + +/** + * Shows a numbered list of model choices and lets the user pick one. + * + * For each known model, if the user already has it pulled locally we show the + * exact pulled name (e.g. "gemma3:27b") so Ollama receives the right tag. + * Unpulled known models are shown with their recommended tag from KNOWN_*_MODELS. + * + * For image models the list is locked to KNOWN_IMAGE_MODELS only — other locally + * pulled models are never shown because they cannot generate images. + * + * @param lockToKnown When true, only show knownModels (no "other pulled" bucket). + */ +export async function selectOllamaModel( + pulledModels: string[], + knownModels: string[], + label: string, + lockToKnown = false, +): Promise { + // Build a lookup: base name → full pulled tag (e.g. "gemma3" → "gemma3:27b") + // When the user has multiple tags of the same base, prefer non-"latest" tags (more specific). + const pulledByBase = new Map(); + for (const fullTag of pulledModels) { + const base = fullTag.replace(/:.*$/, ""); + const existing = pulledByBase.get(base); + if (!existing || existing === `${base}:latest`) { + pulledByBase.set(base, fullTag); + } + } + + const resolveDisplayName = (knownEntry: string): string => { + const base = knownEntry.replace(/:.*$/, ""); + return pulledByBase.get(base) ?? knownEntry; + }; + + const isPulled = (knownEntry: string): boolean => + pulledByBase.has(knownEntry.replace(/:.*$/, "")); + + const pulledKnown = knownModels.filter(isPulled); + const unpulledKnown = knownModels.filter((m) => !isPulled(m)); + + // LLM selector surfaces other pulled models too; image selector is locked to known list. + const otherPulled = lockToKnown + ? 
[] + : pulledModels.filter((fullTag) => { + const base = fullTag.replace(/:.*$/, ""); + return !knownModels.some((k) => k.replace(/:.*$/, "") === base); + }); + + const options: Array<{ display: string; pulled: boolean }> = [ + ...pulledKnown.map((m) => ({ display: resolveDisplayName(m), pulled: true })), + ...otherPulled.map((m) => ({ display: m, pulled: true })), + ...unpulledKnown.map((m) => ({ display: m, pulled: false })), + ]; + + console.info(`\n─────────────────────────────────────────────────────────────`); + console.info(` Select an Ollama model for ${label}:`); + console.info(` (✓ = already pulled locally; others require: ollama pull )\n`); + + options.forEach(({ display, pulled }, i) => { + console.info(` [${i + 1}] ${pulled ? "✓" : " "} ${display}`); + }); + console.info(` [${options.length + 1}] Enter a custom model name`); + console.info(`─────────────────────────────────────────────────────────────\n`); + + const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); + const ask = (q: string): Promise => + new Promise((resolve) => rl.question(q, (a) => resolve(a.trim()))); + + let chosen = ""; + while (!chosen) { + const raw = await ask(`Your choice (1–${options.length + 1}): `); + const n = parseInt(raw, 10); + + if (n >= 1 && n <= options.length) { + chosen = options[n - 1]?.display ?? ""; + } else if (n === options.length + 1 || (!Number.isInteger(n) && raw.length > 0)) { + const manual = Number.isInteger(n) ? await ask(`Model name (e.g. llama3.1:8b): `) : raw; + if (manual.length > 0) chosen = manual; + } else { + console.info(` Please enter a number between 1 and ${options.length + 1}.`); + } + } + + rl.close(); + console.info(`\n Using model: ${chosen}\n`); + return chosen; +} diff --git a/src/cli/validate-env.ts b/src/cli/validate-env.ts index e351ea4..15ffa6b 100644 --- a/src/cli/validate-env.ts +++ b/src/cli/validate-env.ts @@ -1,4 +1,6 @@ import type { ImageProviderKey, LLMProviderKey, TTSProviderKey } from "../schema/providers.js"; +import { checkOllamaReachable, selectOllamaModel, KNOWN_LLM_MODELS, KNOWN_IMAGE_MODELS } from "./ollama-setup.js"; +import { ensureChatterboxVenv } from "./chatterbox-setup.js"; interface EnvRequirement { key: string; @@ -7,11 +9,25 @@ interface EnvRequirement { required: boolean; } -export function validateEnv(opts: { +export interface ValidateEnvResult { + /** Resolved Ollama LLM model name (may differ from CLI flag if user selected interactively). */ + ollamaModel?: string; + /** Resolved Ollama image model name (may differ from CLI flag if user selected interactively). */ + ollamaImageModel?: string; + /** Path to the venv Python binary for Chatterbox. Undefined when chatterbox is not selected. */ + chatterboxPythonBin?: string; +} + +export async function validateEnv(opts: { provider: LLMProviderKey; ttsProvider: TTSProviderKey; imageProvider: ImageProviderKey; -}): void { + ollamaHost?: string; + /** Explicitly passed --ollama-model. If set, skip interactive selection. */ + ollamaModel?: string; + /** Explicitly passed --ollama-image-model. If set, skip interactive selection. */ + ollamaImageModel?: string; +}): Promise { const requirements: EnvRequirement[] = [ { key: "ANTHROPIC_API_KEY", @@ -47,8 +63,8 @@ export function validateEnv(opts: { const missing = requirements.filter((r) => r.required && !process.env[r.key]); - // Stock keys are optional — the pipeline degrades gracefully (black frames) — but - // warn upfront so users aren't surprised by missing visuals on stock_image/stock_video scenes. 
+ // Stock keys are optional — the pipeline degrades gracefully — but warn upfront + // so users aren't surprised by missing visuals on stock_image/stock_video scenes. const hasStockKey = process.env["PEXELS_API_KEY"] || process.env["PIXABAY_API_KEY"]; if (!hasStockKey) { console.warn( @@ -58,17 +74,43 @@ export function validateEnv(opts: { ); } - if (missing.length === 0) return; + if (missing.length > 0) { + console.error("\nMissing required API keys:\n"); + console.error(" Key Status Get it at"); + console.error(" " + "-".repeat(70)); + for (const r of missing) { + const key = r.key.padEnd(24); + console.error(` ${key}MISSING ${r.signupUrl}`); + } + console.error( + "\nSet these in your .env file (or pass with `docker run --env-file .env` when using Docker).\n", + ); + process.exit(1); + } + + // --- Local provider setup (no API keys needed, but tools must be available) --- + let ollamaModel: string | undefined; + let ollamaImageModel: string | undefined; - console.error("\nMissing required API keys:\n"); - console.error(" Key Status Get it at"); - console.error(" " + "-".repeat(70)); - for (const r of missing) { - const key = r.key.padEnd(24); - console.error(` ${key}MISSING ${r.signupUrl}`); + if (opts.provider === "ollama" || opts.imageProvider === "ollama") { + const host = opts.ollamaHost ?? "http://localhost:11434"; + const pulledModels = await checkOllamaReachable(host); + + if (opts.provider === "ollama") { + ollamaModel = opts.ollamaModel + ?? await selectOllamaModel(pulledModels, KNOWN_LLM_MODELS, "LLM"); + } + + if (opts.imageProvider === "ollama") { + ollamaImageModel = opts.ollamaImageModel + ?? await selectOllamaModel(pulledModels, KNOWN_IMAGE_MODELS, "image generation", true); + } } - console.error( - "\nSet these in your .env file (or pass with `docker run --env-file .env` when using Docker).\n", - ); - process.exit(1); + + let chatterboxPythonBin: string | undefined; + if (opts.ttsProvider === "chatterbox") { + chatterboxPythonBin = await ensureChatterboxVenv(); + } + + return { ollamaModel, ollamaImageModel, chatterboxPythonBin }; } diff --git a/src/index.ts b/src/index.ts index 6292f31..92e74e6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,31 +1,55 @@ #!/usr/bin/env node +import * as readline from "node:readline"; import { parseArgs } from "./cli/args.js"; import { validateEnv } from "./cli/validate-env.js"; import { createCliCallbacks, runPipeline } from "./pipeline/orchestrator.js"; import { createProviders } from "./providers/factory.js"; +import type { LLMProvider } from "./schema/providers.js"; +import type { ResearchResult } from "./agents/research.js"; async function main(): Promise { const opts = parseArgs(); - // Validate required API keys before constructing providers - validateEnv({ + // Validate required API keys (and local tool availability) before constructing providers. + // When --ollama-model / --ollama-image-model are omitted, validateEnv presents an interactive + // model selection prompt and returns the chosen values in envResult. + const envResult = await validateEnv({ provider: opts.provider, ttsProvider: opts.ttsProvider, imageProvider: opts.imageProvider, + ollamaHost: opts.ollamaHost, + ollamaModel: opts.ollamaModel, + ollamaImageModel: opts.ollamaImageModel, }); - // Initialize providers via factory + // Use model names resolved by validateEnv (interactive selection or explicit flag) + const resolvedOllamaModel = envResult.ollamaModel ?? opts.ollamaModel; + const resolvedOllamaImageModel = envResult.ollamaImageModel ?? 
opts.ollamaImageModel; + + // Initialize all providers via factory const { llm, tts, imageGen, stock } = createProviders({ llm: opts.provider, tts: opts.ttsProvider, image: opts.imageProvider, + ollamaModel: resolvedOllamaModel, + ollamaImageModel: resolvedOllamaImageModel, + ollamaHost: opts.ollamaHost, + chatterboxDevice: opts.chatterboxDevice, + chatterboxAudioPrompt: opts.chatterboxAudioPrompt, + chatterboxPythonBin: envResult.chatterboxPythonBin, }); - // Create CLI callbacks for terminal progress display + // Collect topic brief for Ollama mode (replaces web-search research) + let topicBrief: ResearchResult | undefined; + if (opts.provider === "ollama") { + topicBrief = await collectTopicBrief(opts.topic, llm, opts.brief); + } + + // Build CLI callbacks (wraps ProgressDisplay + cost/log printing) const { callbacks, progress } = createCliCallbacks(opts.yes); - // Run pipeline with CLI callbacks + // Run pipeline const result = await runPipeline( { topic: opts.topic, @@ -41,6 +65,7 @@ async function main(): Promise { preview: opts.preview, outputDir: opts.output, yes: opts.yes, + topicBrief, }, callbacks, ); @@ -54,6 +79,132 @@ async function main(): Promise { } } +/** + * Collects topic context for Ollama mode. + * + * Flow: + * 1. --brief flag → use it directly, no prompts + * 2. Non-TTY (Docker/pipe) → continue with topic only + * 3. Interactive TTY → + * a. Explain why context helps + * b. Offer: [1] AI-guided questions (Ollama generates 3 topic-specific questions) + * [2] Write it yourself (freeform) + * [3] Skip (topic name only) + */ +async function collectTopicBrief( + topic: string, + llm: LLMProvider, + brief?: string, +): Promise { + if (brief) { + console.info(`\n Using provided --brief for research context.\n`); + return { summary: brief, key_facts: [], mood: "neutral", sources: [] }; + } + + if (!process.stdin.isTTY) { + return { summary: `Topic: ${topic}`, key_facts: [], mood: "neutral", sources: [] }; + } + + const divider = `─────────────────────────────────────────────────────────────`; + + console.info( + `\n${divider}\n` + + ` Topic context — optional but recommended\n` + + `${divider}\n\n` + + ` Since you're using Ollama (no web search), providing a little context\n` + + ` helps us write a more accurate and interesting script for you.\n\n` + + ` How would you like to provide context?\n\n` + + ` [1] Guided — We ask you 3 questions about "${topic}"\n` + + ` [2] Freeform — You write a few lines yourself\n` + + ` [3] Skip — Continue with topic name only\n`, + ); + + const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); + const ask = (q: string): Promise => + new Promise((resolve) => rl.question(q, (a) => resolve(a.trim()))); + + let choice = ""; + while (!["1", "2", "3"].includes(choice)) { + choice = await ask(` Your choice (1/2/3): `); + if (!["1", "2", "3"].includes(choice)) { + console.info(` Please enter 1, 2, or 3.`); + } + } + + // ── Option 3: skip ────────────────────────────────────────────────────────── + if (choice === "3") { + rl.close(); + console.info(`\n Continuing with topic name only.\n`); + return { summary: `Topic: ${topic}`, key_facts: [], mood: "neutral", sources: [] }; + } + + // ── Option 2: freeform ────────────────────────────────────────────────────── + if (choice === "2") { + console.info( + `\n Write a few lines about your topic — key facts, context, desired mood.\n` + ); + const text = await ask(` > `); + rl.close(); + console.info(""); + // summary carries the full user text; key_facts left empty to avoid 
duplication. + return { + summary: text || `Topic: ${topic}`, + key_facts: [], + mood: "neutral", + sources: [], + }; + } + + // ── Option 1: AI-guided questions ─────────────────────────────────────────── + console.info(`\n Generating questions for "${topic}"...\n`); + + let questions: string[] = []; + try { + const { z } = await import("zod"); + const QuestionsSchema = z.object({ + questions: z.array(z.string()).length(3), + }); + const result = await llm.generate({ + systemPrompt: + `You are a research assistant helping prepare a short-form video script.\n` + + `Generate exactly 3 specific, open-ended questions that will help gather useful\n` + + `context about the given topic. Questions should target: key facts/events,\n` + + `emotional angle or human interest, and surprising or lesser-known details.\n` + + `Keep each question under 15 words.`, + userMessage: `Topic: "${topic}"`, + schema: QuestionsSchema, + }); + questions = result.data.questions; + } catch { + questions = [ + `What are the most important facts about "${topic}"?`, + `What is the emotional angle or human story here?`, + `What would surprise most people about this topic?`, + ]; + } + + console.info(` Answer each question (or press Enter to skip):\n`); + + const answers: string[] = []; + for (let i = 0; i < questions.length; i++) { + console.info(` ${i + 1}. ${questions[i]}`); + const answer = await ask(` > `); + answers.push(answer); + console.info(""); + } + + rl.close(); + + // key_facts = each non-empty answer as a standalone atomic fact. + // summary = prose joining all answers so the director has full narrative context. + const key_facts = answers.filter(Boolean); + const summary = key_facts.length > 0 + ? key_facts.join(". ") + : `Topic: ${topic}`; + + return { summary, key_facts, mood: "neutral", sources: [] }; +} + main().catch((err) => { console.error("\nPipeline failed:", err instanceof Error ? err.message : String(err)); process.exit(1); diff --git a/src/pipeline/orchestrator.ts b/src/pipeline/orchestrator.ts index b3b14ce..e3a56b5 100644 --- a/src/pipeline/orchestrator.ts +++ b/src/pipeline/orchestrator.ts @@ -20,7 +20,7 @@ import { getArchetype } from "../config/archetype-registry.js"; import { getPlatformConfig } from "../config/platforms.js"; import { getTotalDurationInFrames, mapScoreToProps } from "../remotion/lib/score-to-props.js"; import type { ArchetypeConfig } from "../schema/archetype.js"; -import type { DirectorScore } from "../schema/director-score.js"; +import type { DirectorScore, VisualType } from "../schema/director-score.js"; import type { ImageProvider, ImageProviderKey, @@ -31,6 +31,7 @@ import type { TTSProviderKey, WordTimestamp, } from "../schema/providers.js"; +import type { ResearchResult } from "../agents/research.js"; // Stage names matching the pipeline execution order export const STAGE_NAMES = [ @@ -121,6 +122,8 @@ export interface PipelineOptions { preview: boolean; outputDir: string; yes: boolean; + /** Pre-supplied research result (Ollama local mode). When set, the research agent LLM call is skipped. 
*/ + topicBrief?: ResearchResult; } export interface PipelineResult { @@ -173,26 +176,36 @@ export async function runPipeline( try { // Stage 1: Research cb.onStageStart?.("research"); - let researchResult; + let researchResult: ResearchResult; const researchStart = Date.now(); - try { - const researchOutput = await research(opts.llm, opts.topic); - researchResult = researchOutput.data; - llmUsages.push(researchOutput.usage); + + if (opts.topicBrief) { + // Ollama local mode: user-supplied brief, no LLM web-search call needed + researchResult = opts.topicBrief; const dur = (Date.now() - researchStart) / 1000; - cb.onStageComplete?.("research", `${researchResult.key_facts.length} facts`, dur); + cb.onStageComplete?.("research", `provided brief (${researchResult.key_facts.length} facts)`, dur); cb.onProgress?.("research", { type: "results", summary: researchResult.summary, key_facts: researchResult.key_facts, mood: researchResult.mood }); - log.stages.push({ name: "research", duration: dur, status: "done" }); - } catch (err) { - const dur = (Date.now() - researchStart) / 1000; - cb.onStageSkip?.("research", "web search failed"); - log.stages.push({ name: "research", duration: dur, status: "skipped", error: String(err) }); - researchResult = { - summary: `Topic: ${opts.topic}`, - key_facts: [], - mood: "informative", - sources: [], - }; + log.stages.push({ name: "research", duration: dur, status: "brief" }); + } else { + try { + const researchOutput = await research(opts.llm, opts.topic); + researchResult = researchOutput.data; + llmUsages.push(researchOutput.usage); + const dur = (Date.now() - researchStart) / 1000; + cb.onStageComplete?.("research", `${researchResult.key_facts.length} facts`, dur); + cb.onProgress?.("research", { type: "results", summary: researchResult.summary, key_facts: researchResult.key_facts, mood: researchResult.mood }); + log.stages.push({ name: "research", duration: dur, status: "done" }); + } catch (err) { + const dur = (Date.now() - researchStart) / 1000; + cb.onStageSkip?.("research", "web search failed"); + log.stages.push({ name: "research", duration: dur, status: "skipped", error: String(err) }); + researchResult = { + summary: `Topic: ${opts.topic}`, + key_facts: [], + mood: "informative", + sources: [], + }; + } } // Check cancellation between stages @@ -203,8 +216,19 @@ export async function runPipeline( // Stage 2: Creative Director cb.onStageStart?.("director"); const cdStart = Date.now(); + + // Only allow visual types whose providers are actually configured. + // Stock types require a Pexels or Pixabay API key — without one they silently produce blank frames. + const stockAvailable = !!(process.env["PEXELS_API_KEY"] || process.env["PIXABAY_API_KEY"]); + const allowedVisualTypes: VisualType[] = [ + "ai_image", + "text_card", + ...(stockAvailable ? (["stock_image", "stock_video"] as const) : []), + ]; + const cdOutput = await generateDirectorScore(opts.llm, opts.topic, researchResult, { archetype: opts.archetype, + allowedVisualTypes, }); const directorScore = cdOutput.data; llmUsages.push(cdOutput.usage); @@ -271,24 +295,47 @@ export async function runPipeline( return { outputDir: runDir, videoPath: null, thumbnailPath: null, scorePath, logPath }; } - // Stage 4: Visual Assets (parallel) + // Stage 4: Visual Assets + // Ollama image generation runs sequentially to avoid overwhelming a single-threaded local model. + // Cloud providers run in parallel for speed. 
cb.onStageStart?.("visuals"); const visualStart = Date.now(); const totalScenes = directorScore.scenes.length; - const sceneResults = await Promise.all( - directorScore.scenes.map(async (scene, i) => { + const sceneResults: Array<{ path: string | null; usage: LLMUsage | null; durationSeconds: number | null }> = []; + + if (opts.imageProvider === "ollama") { + for (let i = 0; i < directorScore.scenes.length; i++) { + const scene = directorScore.scenes[i]!; + process.stderr.write(`\n[visuals] Scene ${i + 1}/${totalScenes}: generating image (${scene.visual_type})...\n`); try { - return await resolveVisualAsset(scene, i, totalScenes, assetsDir, opts, archetypeConfig); + const result = await resolveVisualAsset(scene, i, totalScenes, assetsDir, opts, archetypeConfig); + sceneResults.push(result); + process.stderr.write(`[visuals] Scene ${i + 1}/${totalScenes}: ✓ done\n`); } catch (err) { - cb.onProgress?.("visuals", { type: "asset_failed", scene: i, error: String(err) }); - return { - path: null as string | null, - usage: null as LLMUsage | null, - durationSeconds: null, - }; + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`[visuals] Scene ${i + 1}/${totalScenes}: ✗ failed — ${msg}\n`); + cb.onProgress?.("visuals", { type: "asset_failed", scene: i, error: msg }); + sceneResults.push({ path: null, usage: null, durationSeconds: null }); } - }), - ); + } + } else { + const parallelResults = await Promise.all( + directorScore.scenes.map(async (scene, i) => { + try { + return await resolveVisualAsset(scene, i, totalScenes, assetsDir, opts, archetypeConfig); + } catch (err) { + cb.onProgress?.("visuals", { type: "asset_failed", scene: i, error: String(err) }); + return { + path: null as string | null, + usage: null as LLMUsage | null, + durationSeconds: null, + }; + } + }), + ); + sceneResults.push(...parallelResults); + } + const sceneAssets = sceneResults.map((r) => r.path); const sceneSourceDurations = sceneResults.map((r) => r.durationSeconds); for (const r of sceneResults) { diff --git a/src/providers/factory.ts b/src/providers/factory.ts index eb17e9d..ecfab9c 100644 --- a/src/providers/factory.ts +++ b/src/providers/factory.ts @@ -10,12 +10,15 @@ import type { } from "../schema/providers.js"; import { GeminiImage } from "./image/gemini.js"; import { OpenAIImage } from "./image/openai.js"; +import { OllamaImage } from "./image/ollama.js"; import { AnthropicLLM } from "./llm/anthropic.js"; import { OpenAILLM } from "./llm/openai.js"; +import { OllamaLLM } from "./llm/ollama.js"; import { PexelsStock } from "./stock/pexels.js"; import { PixabayStock } from "./stock/pixabay.js"; import { ElevenLabsTTS } from "./tts/elevenlabs.js"; import { InworldTTS } from "./tts/inworld.js"; +import { ChatterboxTTS } from "./tts/chatterbox.js"; export interface ProviderConfig { llm: LLMProviderKey; @@ -23,6 +26,14 @@ export interface ProviderConfig { image: ImageProviderKey; stock?: StockProviderKey; keys?: Record; + /** Ollama-specific options */ + ollamaModel?: string; + ollamaImageModel?: string; + ollamaHost?: string; + /** Chatterbox-specific options */ + chatterboxDevice?: string; + chatterboxAudioPrompt?: string; + chatterboxPythonBin?: string; } export interface Providers { @@ -38,17 +49,27 @@ export function createProviders(config: ProviderConfig): Providers { const llm: LLMProvider = config.llm === "openai" ? new OpenAILLM(undefined, k["OPENAI_API_KEY"]) - : new AnthropicLLM(undefined, k["ANTHROPIC_API_KEY"]); + : config.llm === "ollama" + ? 
new OllamaLLM(config.ollamaModel, config.ollamaHost) + : new AnthropicLLM(undefined, k["ANTHROPIC_API_KEY"]); const tts: TTSProvider = config.tts === "inworld" ? new InworldTTS(undefined, undefined, k["INWORLD_TTS_API_KEY"]) - : new ElevenLabsTTS(undefined, k["ELEVENLABS_API_KEY"]); + : config.tts === "chatterbox" + ? new ChatterboxTTS({ + device: config.chatterboxDevice, + audioPrompt: config.chatterboxAudioPrompt, + pythonBin: config.chatterboxPythonBin, + }) + : new ElevenLabsTTS(undefined, k["ELEVENLABS_API_KEY"]); const imageGen: ImageProvider = config.image === "openai" ? new OpenAIImage(undefined, k["OPENAI_API_KEY"]) - : new GeminiImage(undefined, k["GOOGLE_API_KEY"]); + : config.image === "ollama" + ? new OllamaImage(config.ollamaImageModel, config.ollamaHost) + : new GeminiImage(undefined, k["GOOGLE_API_KEY"]); const stockKey = config.stock ?? "pexels"; const stock: StockProvider = diff --git a/src/providers/image/ollama.ts b/src/providers/image/ollama.ts new file mode 100644 index 0000000..b531b0a --- /dev/null +++ b/src/providers/image/ollama.ts @@ -0,0 +1,64 @@ +import type { ImageProvider } from "../../schema/providers.js"; + +const DEFAULT_MODEL = "x/flux2-klein:4b"; +const DEFAULT_HOST = "http://localhost:11434"; + +export class OllamaImage implements ImageProvider { + private model: string; + private host: string; + + constructor(model: string = DEFAULT_MODEL, host: string = DEFAULT_HOST) { + if (process.platform !== "darwin") { + throw new Error( + `Ollama image generation is currently macOS-only.\n` + + ` → Use --image-provider gemini or --image-provider openai on Linux/Windows.\n` + + ` → See: https://ollama.com/blog/image-generation`, + ); + } + this.model = model; + this.host = host.replace(/\/$/, ""); + } + + async generate(prompt: string, style?: string): Promise { + const fullPrompt = style + ? `${prompt}. Style: ${style}. Vertical 9:16 aspect ratio, portrait orientation. No text, no watermarks.` + : `${prompt}. Vertical 9:16 aspect ratio, portrait orientation. No text, no watermarks.`; + + const response = await fetch(`${this.host}/api/generate`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + model: this.model, + prompt: fullPrompt, + stream: false, + }), + }); + + if (!response.ok) { + const body = await response.text(); + throw new Error( + `Ollama image generation failed (${response.status}): ${body}\n` + + ` → Ensure Ollama is running: ollama serve\n` + + ` → Ensure the model is pulled: ollama pull ${this.model}`, + ); + } + + const data = (await response.json()) as OllamaGenerateResponse; + + // Ollama image models return the image as base64 in data.image (singular) + const imageData = data.image; + if (!imageData) { + throw new Error( + `Ollama returned no image data. ` + + `Ensure you are using an image-capable model (e.g. 
x/flux2-klein:4b or x/z-image-turbo:latest).`,
+      );
+    }
+
+    return Buffer.from(imageData, "base64");
+  }
+}
+
+interface OllamaGenerateResponse {
+  image?: string;
+  response: string;
+}
diff --git a/src/providers/llm/ollama.ts b/src/providers/llm/ollama.ts
new file mode 100644
index 0000000..65effcc
--- /dev/null
+++ b/src/providers/llm/ollama.ts
@@ -0,0 +1,105 @@
+import { z } from "zod";
+import type { LLMProvider, LLMResult } from "../../schema/providers.js";
+
+const DEFAULT_MODEL = "llama3.2";
+const DEFAULT_HOST = "http://localhost:11434";
+
+export class OllamaLLM implements LLMProvider {
+  readonly id = "ollama" as const;
+  private model: string;
+  private host: string;
+
+  constructor(model: string = DEFAULT_MODEL, host: string = DEFAULT_HOST) {
+    this.model = model;
+    this.host = host.replace(/\/$/, "");
+    console.info(
+      `ℹ Ollama mode: web search is disabled. You will be asked to provide topic context before the pipeline starts.`,
+    );
+  }
+
+  async generate<T extends z.ZodType>(opts: {
+    systemPrompt: string;
+    userMessage: string;
+    schema: T;
+    enableWebSearch?: boolean;
+  }): Promise<LLMResult<z.infer<T>>> {
+    // Ollama has no web search capability. When called with enableWebSearch=true
+    // (the research agent), we return a stub so the pipeline can continue with
+    // the user-provided topic brief injected via topicBrief in PipelineOptions.
+    if (opts.enableWebSearch) {
+      return {
+        data: {
+          summary: "",
+          key_facts: [],
+          mood: "neutral",
+          sources: [],
+        } as z.infer<T>,
+        usage: { inputTokens: 0, outputTokens: 0 },
+      };
+    }
+
+    return this.generateStructured(opts);
+  }
+
+  private async generateStructured<T extends z.ZodType>(opts: {
+    systemPrompt: string;
+    userMessage: string;
+    schema: T;
+  }): Promise<LLMResult<z.infer<T>>> {
+    const jsonSchema = z.toJSONSchema(opts.schema);
+
+    const systemWithSchema =
+      `${opts.systemPrompt}\n\n` +
+      `You MUST respond with a single valid JSON object that conforms to this JSON Schema:\n` +
+      `${JSON.stringify(jsonSchema, null, 2)}\n\n` +
+      `Do not include any explanation, markdown, or text outside the JSON object.`;
+
+    const response = await fetch(`${this.host}/api/chat`, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({
+        model: this.model,
+        format: "json",
+        stream: false,
+        messages: [
+          { role: "system", content: systemWithSchema },
+          { role: "user", content: opts.userMessage },
+        ],
+      }),
+    });
+
+    if (!response.ok) {
+      const body = await response.text();
+      throw new Error(`Ollama API error (${response.status}): ${body}`);
+    }
+
+    const raw = (await response.json()) as OllamaChatResponse;
+    const content = raw.message?.content ?? "";
+
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(content);
+    } catch {
+      throw new Error(`Ollama returned non-JSON content: ${content.slice(0, 200)}`);
+    }
+
+    const result = opts.schema.safeParse(parsed);
+    if (!result.success) {
+      throw new Error(`Ollama response failed schema validation: ${JSON.stringify(result.error)}`);
+    }
+
+    return {
+      data: result.data,
+      usage: {
+        inputTokens: raw.prompt_eval_count ?? 0,
+        outputTokens: raw.eval_count ??
0, + }, + }; + } +} + +interface OllamaChatResponse { + message?: { role: string; content: string }; + prompt_eval_count?: number; + eval_count?: number; +} diff --git a/src/providers/tts/chatterbox.ts b/src/providers/tts/chatterbox.ts new file mode 100644 index 0000000..9652984 --- /dev/null +++ b/src/providers/tts/chatterbox.ts @@ -0,0 +1,127 @@ +import { execFileSync, spawn, spawnSync } from "node:child_process"; +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; +import type { TTSProvider, TTSResult, WordTimestamp } from "../../schema/providers.js"; + +const SCRIPT_PATH = path.join(process.cwd(), "scripts", "chatterbox_tts.py"); + +export class ChatterboxTTS implements TTSProvider { + private pythonBin: string; + private device: string; + private audioPrompt: string | null; + + /** + * @param opts.pythonBin Venv Python path returned by validateEnv. When omitted + * (e.g. tests), falls back to searching PATH for python3.12/3.11. + */ + constructor(opts: { device?: string; audioPrompt?: string; pythonBin?: string } = {}) { + this.device = opts.device ?? this.detectDevice(); + this.audioPrompt = opts.audioPrompt ?? null; + this.pythonBin = opts.pythonBin ?? this.resolvePythonBin(); + } + + async generate(text: string): Promise { + const tmpDir = os.tmpdir(); + const id = `openreels-tts-${Date.now()}-${Math.random().toString(36).slice(2)}`; + const wavPath = path.join(tmpDir, `${id}.wav`); + const mp3Path = path.join(tmpDir, `${id}.mp3`); + const tsPath = path.join(tmpDir, `${id}.json`); + + try { + const args = [ + SCRIPT_PATH, + "--text", text, + "--out", wavPath, + "--timestamps", tsPath, + "--device", this.device, + ]; + if (this.audioPrompt) { + args.push("--audio-prompt", this.audioPrompt); + } + + console.info( + `\nChatterbox Turbo: generating audio` + + (this.device !== "cpu" ? ` (device: ${this.device})` : ` (device: cpu — may be slow)`) + + `...\n`, + ); + + // Use async spawn so stderr streams live to the terminal (shows model load progress) + // and the Node.js event loop is not blocked during the potentially long model load. + await spawnAsync(this.pythonBin, args); + + if (!fs.existsSync(wavPath)) { + throw new Error(`Chatterbox TTS did not produce output file: ${wavPath}`); + } + + // Convert WAV → MP3 using ffmpeg (already required by the pipeline) + execFileSync("ffmpeg", [ + "-y", "-i", wavPath, + "-codec:a", "libmp3lame", "-q:a", "2", + mp3Path, + ], { stdio: "pipe" }); + + const audio = fs.readFileSync(mp3Path); + + const rawTimestamps = JSON.parse(fs.readFileSync(tsPath, "utf-8")) as unknown[]; + const words: WordTimestamp[] = rawTimestamps + .filter((t): t is { word: string; start: number; end: number } => + typeof (t as Record)["word"] === "string") + .map((t) => ({ word: t.word, start: t.start, end: t.end })); + + return { audio, words }; + } finally { + for (const f of [wavPath, mp3Path, tsPath]) { + try { fs.unlinkSync(f); } catch { /* ignore cleanup errors */ } + } + } + } + + private resolvePythonBin(): string { + // Mirror the preference order in validate-env: prefer 3.12/3.11 over generic python3 + for (const bin of ["python3.12", "python3.11", "python3", "python"]) { + const probe = spawnSync(bin, ["--version"], { encoding: "utf-8" }); + if (probe.status === 0) return bin; + } + throw new Error( + `Python not found. 
Chatterbox Turbo requires Python 3.11 or 3.12.\n` + + ` → macOS: brew install python@3.12\n` + + ` → Then: pip install chatterbox-tts`, + ); + } + + private detectDevice(): string { + // Prefer MPS on Apple Silicon, fall back to CPU + const platform = process.platform; + if (platform === "darwin") { + const arch = process.arch; + if (arch === "arm64") return "mps"; + } + return "cpu"; + } +} + +/** + * Async wrapper around child_process.spawn. + * Streams stderr live to the terminal so users can see Chatterbox model loading progress. + * Rejects with a descriptive error if the process exits non-zero. + */ +function spawnAsync(bin: string, args: string[]): Promise { + return new Promise((resolve, reject) => { + const child = spawn(bin, args, { + stdio: ["ignore", "pipe", "inherit"], // stdout captured (not used), stderr → terminal live + }); + + child.on("error", (err) => { + reject(new Error(`Failed to start Chatterbox TTS process: ${err.message}`)); + }); + + child.on("close", (code) => { + if (code === 0) { + resolve(); + } else { + reject(new Error(`Chatterbox TTS script exited with code ${code ?? "unknown"}.`)); + } + }); + }); +} diff --git a/src/schema/providers.ts b/src/schema/providers.ts index cf52e05..a3c91fd 100644 --- a/src/schema/providers.ts +++ b/src/schema/providers.ts @@ -1,8 +1,8 @@ import type { z } from "zod"; -export type LLMProviderKey = "anthropic" | "openai"; -export type TTSProviderKey = "elevenlabs" | "inworld"; -export type ImageProviderKey = "gemini" | "openai"; +export type LLMProviderKey = "anthropic" | "openai" | "ollama"; +export type TTSProviderKey = "elevenlabs" | "inworld" | "chatterbox"; +export type ImageProviderKey = "gemini" | "openai" | "ollama"; export type StockProviderKey = "pexels" | "pixabay"; export interface LLMUsage { From eee77ecff36e8f8619ed2b0514dea451fb1e63ff Mon Sep 17 00:00:00 2001 From: Al Jami Islam Anik Date: Thu, 2 Apr 2026 17:18:51 +0600 Subject: [PATCH 2/2] refactor: enhance Ollama integration and improve user interaction in topic context collection - Updated `collectTopicBrief` function to include LLM provider as a parameter, allowing for dynamic question generation based on the selected LLM. - Improved user interaction flow for providing topic context, offering options for guided questions, freeform input, or skipping. - Enhanced cost estimation logic to accommodate the new LLM provider parameter. - Refined error handling and output messages in the Chatterbox TTS provider for better user experience. - Updated relevant interfaces and types to ensure consistency across the pipeline. These changes significantly improve the flexibility and usability of the pipeline for local development. 
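For reviewers, a quick sketch of the new `estimateCost` call shape (illustrative only — `score` stands for any `DirectorScore`, and real call sites are expected to forward the CLI-selected providers):

    // Sketch: assumes a DirectorScore named `score` is in scope (e.g. in src/index.ts).
    import { estimateCost } from "./cli/cost-estimator.js";

    // The LLM provider is now the optional fourth argument, defaulting to "anthropic"
    // so existing call sites keep compiling.
    const cloud = estimateCost(score, "gemini", "elevenlabs", "anthropic"); // cloud token pricing
    const local = estimateCost(score, "ollama", "chatterbox", "ollama");    // LLM tokens priced at $0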
--- src/cli/cost-estimator.ts | 5 ++- src/cli/ollama-setup.ts | 30 ++++++++++++++--- src/index.ts | 7 +--- src/providers/tts/chatterbox.ts | 57 ++++++++++++++++++++++++++++----- 4 files changed, 80 insertions(+), 19 deletions(-) diff --git a/src/cli/cost-estimator.ts b/src/cli/cost-estimator.ts index 25d7552..1346125 100644 --- a/src/cli/cost-estimator.ts +++ b/src/cli/cost-estimator.ts @@ -70,12 +70,15 @@ export function estimateCost( score: DirectorScore, imageProvider: ImageProviderKey = "gemini", ttsProvider: TTSProviderKey = "elevenlabs", + provider: LLMProviderKey = "anthropic", ): CostBreakdown { const aiImages = score.scenes.filter((s) => s.visual_type === "ai_image").length; const ttsCharacters = score.scenes.reduce((sum, s) => sum + s.script_line.length, 0); const llmCalls = 3 + aiImages; // research + CD + critic + 1 per ai_image - const p = PRICING.anthropic; // conservative estimate + const p = provider === "ollama" + ? { perInputToken: 0, perOutputToken: 0 } + : PRICING[provider]; const callCost = (est: { input: number; output: number }) => est.input * p.perInputToken + est.output * p.perOutputToken; diff --git a/src/cli/ollama-setup.ts b/src/cli/ollama-setup.ts index 736c2d0..a37ad4c 100644 --- a/src/cli/ollama-setup.ts +++ b/src/cli/ollama-setup.ts @@ -81,8 +81,10 @@ export async function selectOllamaModel( label: string, lockToKnown = false, ): Promise { - // Build a lookup: base name → full pulled tag (e.g. "gemma3" → "gemma3:27b") - // When the user has multiple tags of the same base, prefer non-"latest" tags (more specific). + // Build two lookups: + // pulledExact: full tag → true (e.g. "x/flux2-klein:4b" → true) + // pulledByBase: base → full tag (e.g. "gemma3" → "gemma3:27b") for base-only entries + const pulledExact = new Set(pulledModels); const pulledByBase = new Map(); for (const fullTag of pulledModels) { const base = fullTag.replace(/:.*$/, ""); @@ -92,13 +94,33 @@ export async function selectOllamaModel( } } + /** + * Resolve display name for a known model entry: + * 1. If the exact tag is pulled, show it as-is. + * 2. If only a different tag of the same base is pulled, show the pulled tag. + * 3. Otherwise show the known entry unchanged (unpulled, recommended tag). + */ const resolveDisplayName = (knownEntry: string): string => { + if (pulledExact.has(knownEntry)) return knownEntry; const base = knownEntry.replace(/:.*$/, ""); return pulledByBase.get(base) ?? knownEntry; }; - const isPulled = (knownEntry: string): boolean => - pulledByBase.has(knownEntry.replace(/:.*$/, "")); + /** + * A known entry is considered "pulled" if: + * - Its exact tag is pulled (e.g. "x/flux2-klein:4b" pulled → "x/flux2-klein:4b" ✓) + * - OR it has no explicit tag in the known list (bare name like "gemma3") and any tag + * of that base is pulled (e.g. user pulled "gemma3:27b" → "gemma3" ✓) + * This prevents x/flux2-klein:9b from showing as pulled just because :4b is pulled. + */ + const isPulled = (knownEntry: string): boolean => { + if (pulledExact.has(knownEntry)) return true; + // Only fall back to base matching when the known entry has no tag (no ":") + // or uses ":latest" — i.e. it's not a specific version tag + const tag = knownEntry.includes(":") ? 
knownEntry.split(":")[1] : null; + if (tag && tag !== "latest") return false; + return pulledByBase.has(knownEntry.replace(/:.*$/, "")); + }; const pulledKnown = knownModels.filter(isPulled); const unpulledKnown = knownModels.filter((m) => !isPulled(m)); diff --git a/src/index.ts b/src/index.ts index 92e74e6..14e52c7 100644 --- a/src/index.ts +++ b/src/index.ts @@ -47,7 +47,7 @@ async function main(): Promise { } // Build CLI callbacks (wraps ProgressDisplay + cost/log printing) - const { callbacks, progress } = createCliCallbacks(opts.yes); + const { callbacks } = createCliCallbacks(opts.yes); // Run pipeline const result = await runPipeline( @@ -70,8 +70,6 @@ async function main(): Promise { callbacks, ); - progress.summary(); - if (result.videoPath) { console.log(`\nDone! Video saved to: ${result.videoPath}`); } else if (opts.dryRun) { @@ -146,7 +144,6 @@ async function collectTopicBrief( const text = await ask(` > `); rl.close(); console.info(""); - // summary carries the full user text; key_facts left empty to avoid duplication. return { summary: text || `Topic: ${topic}`, key_facts: [], @@ -195,8 +192,6 @@ async function collectTopicBrief( rl.close(); - // key_facts = each non-empty answer as a standalone atomic fact. - // summary = prose joining all answers so the director has full narrative context. const key_facts = answers.filter(Boolean); const summary = key_facts.length > 0 ? key_facts.join(". ") diff --git a/src/providers/tts/chatterbox.ts b/src/providers/tts/chatterbox.ts index 9652984..7221e01 100644 --- a/src/providers/tts/chatterbox.ts +++ b/src/providers/tts/chatterbox.ts @@ -40,14 +40,14 @@ export class ChatterboxTTS implements TTSProvider { args.push("--audio-prompt", this.audioPrompt); } - console.info( - `\nChatterbox Turbo: generating audio` + - (this.device !== "cpu" ? ` (device: ${this.device})` : ` (device: cpu — may be slow)`) + + process.stderr.write( + `\n Chatterbox Turbo: synthesising audio` + + (this.device !== "cpu" ? ` on ${this.device}` : ` on cpu (may be slow)`) + `...\n`, ); - // Use async spawn so stderr streams live to the terminal (shows model load progress) - // and the Node.js event loop is not blocked during the potentially long model load. + // Async spawn — stderr is captured and filtered so raw tqdm progress bars + // don't pollute the terminal alongside the Node.js pipeline progress spinner. await spawnAsync(this.pythonBin, args); if (!fs.existsSync(wavPath)) { @@ -103,13 +103,46 @@ export class ChatterboxTTS implements TTSProvider { /** * Async wrapper around child_process.spawn. - * Streams stderr live to the terminal so users can see Chatterbox model loading progress. + * + * Captures Python's stderr and filters it so that: + * - Raw tqdm progress bars (the noisy `0%|█ | 0/1000` lines) are suppressed + * - Key status messages (model loading, "Audio saved") are forwarded as clean lines + * - Any unexpected errors are still surfaced on stderr for debugging + * * Rejects with a descriptive error if the process exits non-zero. 
  */
 function spawnAsync(bin: string, args: string[]): Promise<void> {
   return new Promise((resolve, reject) => {
     const child = spawn(bin, args, {
-      stdio: ["ignore", "pipe", "inherit"], // stdout captured (not used), stderr → terminal live
+      stdio: ["ignore", "ignore", "pipe"], // discard unused stdout; capture stderr for filtering below
+    });
+
+    // Accumulate stderr to surface on failure, and filter noisy lines in real-time
+    const stderrLines: string[] = [];
+    let stderrBuf = "";
+
+    child.stderr?.on("data", (chunk: Buffer) => {
+      stderrBuf += chunk.toString();
+      const lines = stderrBuf.split("\n");
+      stderrBuf = lines.pop() ?? ""; // keep incomplete last line in buffer
+
+      for (const line of lines) {
+        stderrLines.push(line);
+        const cleaned = line.trim();
+        if (!cleaned) continue;
+
+        // Suppress tqdm progress bars: they contain "|" and "it/s" or "%|"
+        if (/\d+%\|/.test(cleaned) || /it\/s\]/.test(cleaned)) continue;
+        // Suppress diffusers FutureWarning noise
+        if (/FutureWarning|LoRACompatible|deprecate/.test(cleaned)) continue;
+        // Suppress tqdm's carriage-return redraw lines (check the raw line; trim() strips "\r")
+        if (/^\r/.test(line)) continue;
+
+        // Forward meaningful status messages cleanly
+        if (/Loading|loaded|Fetching|S3 Token|Audio saved|Timestamps/.test(cleaned)) {
+          process.stderr.write(`  ${cleaned}\n`);
+        }
+      }
     });
 
     child.on("error", (err) => {
@@ -117,10 +150,18 @@ function spawnAsync(bin: string, args: string[]): Promise<void> {
     });
 
     child.on("close", (code) => {
+      // Flush any remaining buffered stderr
+      if (stderrBuf.trim()) stderrLines.push(stderrBuf);
+
       if (code === 0) {
+        process.stderr.write(`  Chatterbox Turbo: audio ready.\n\n`);
         resolve();
       } else {
-        reject(new Error(`Chatterbox TTS script exited with code ${code ?? "unknown"}.`));
+        // On failure, print the last few lines of stderr to aid debugging
+        const tail = stderrLines.slice(-10).join("\n");
+        reject(new Error(
+          `Chatterbox TTS script exited with code ${code ?? "unknown"}.\n${tail}`,
+        ));
       }
     });
   });