8 changes: 7 additions & 1 deletion .gitignore
@@ -3,4 +3,10 @@ dist/
*.tsbuildinfo
.env
staging_docs/
output/

# macOS
.DS_Store

# Python version managers
.python-version
98 changes: 95 additions & 3 deletions README.md
@@ -70,7 +70,7 @@ docker compose run worker npx tsx src/index.ts --yes "5 stoic lessons that chang

### Local development

**Prerequisites:** Node.js 22+, pnpm, ffprobe (for stock video duration detection)
**Prerequisites:** Node.js 22+, pnpm, ffprobe (for stock video duration detection), Python 3.11 or 3.12 (only required for `--tts-provider chatterbox` β€” Python 3.13+ is not supported due to PyTorch/OpenMP issues)

```bash
git clone https://github.com/tsensei/OpenReels.git
@@ -99,18 +99,110 @@ pnpm start "your topic" --archetype anime_illustration --provider openai

**Optional:** `PEXELS_API_KEY` ([Pexels](https://www.pexels.com/api/)), `PIXABAY_API_KEY` ([Pixabay](https://pixabay.com/api/docs/)) for stock footage (free registration)

### Zero-API-key local mode (Ollama + Chatterbox)

Run the full pipeline with **no API keys** using local open-source models.

#### One-time setup

```bash
# 1. Install and start Ollama (macOS)
brew install ollama
ollama serve # keep running in a separate terminal

# 2. Pull your preferred LLM model (one-time β€” pick one)
ollama pull llama3.1:8b # ~5 GB, fast and reliable
ollama pull gemma3:9b # ~6 GB, good quality
ollama pull qwen2.5:7b # ~5 GB, multilingual

# 3. Pull an image generation model β€” macOS only (one-time β€” pick one)
ollama pull x/flux2-klein:4b # ~6 GB, fastest
ollama pull x/flux2-klein:9b # ~12 GB, higher quality
ollama pull x/z-image-turbo:fp8 # ~13 GB, photorealistic
```

> **Interactive model selection:** When you run with `--provider ollama`, OpenReels will show you all pulled models and let you choose interactively. No need to memorise model names.
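For context, here is a minimal sketch (not the project's actual code) of how such a picker can enumerate pulled models, using Ollama's standard `/api/tags` listing endpoint; the helper name is illustrative:

```ts
// Sketch: enumerate locally pulled Ollama models for an interactive picker.
// GET /api/tags is Ollama's standard endpoint for listing local models.
async function listLocalModels(host = "http://localhost:11434"): Promise<string[]> {
  const res = await fetch(`${host}/api/tags`);
  if (!res.ok) {
    throw new Error(`Ollama not reachable at ${host} (is \`ollama serve\` running?)`);
  }
  const body = (await res.json()) as { models: { name: string }[] };
  return body.models.map((m) => m.name); // e.g. ["llama3.1:8b", "qwen2.5:7b"]
}
```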

> **Chatterbox is auto-installed:** OpenReels automatically creates an isolated Python venv at `~/.openreels/chatterbox-venv` and installs `chatterbox-tts` on first use. You only need Python 3.11 or 3.12 on your system (`brew install python@3.12` on macOS). Python 3.13+ is not supported.

> **First run note:** Chatterbox Turbo downloads ~1.5 GB of model weights on first use. This is automatic and cached locally (`~/.cache/huggingface/`). Expect 2–5 minutes on a typical connection.

> **GPU recommended:** Chatterbox is significantly faster on Apple Silicon (MPS) or a CUDA GPU. CPU generation works but is slow (10–30Γ— slower than real-time).
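To make the auto-install note above concrete, a hedged sketch of the venv bootstrap follows. It assumes a POSIX venv layout and a `python3.12` binary on PATH; the helper name and the minimal error handling are illustrative rather than OpenReels' actual implementation:

```ts
import { execFileSync } from "node:child_process";
import { existsSync } from "node:fs";
import { homedir } from "node:os";
import { join } from "node:path";

// Sketch: create ~/.openreels/chatterbox-venv once and install chatterbox-tts into it.
// On Windows the venv binaries live under Scripts\ instead of bin/.
function ensureChatterboxVenv(python = "python3.12"): string {
  const venv = join(homedir(), ".openreels", "chatterbox-venv");
  if (!existsSync(venv)) {
    execFileSync(python, ["-m", "venv", venv], { stdio: "inherit" });
    execFileSync(join(venv, "bin", "pip"), ["install", "chatterbox-tts"], { stdio: "inherit" });
  }
  return join(venv, "bin", "python"); // interpreter used to run the TTS bridge
}
```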

#### Running with no API keys

```bash
# Interactive β€” OpenReels will prompt you to choose a model and describe your topic
pnpm start "your topic" \
--provider ollama \
--tts-provider chatterbox \
--image-provider ollama

# Non-interactive β€” supply context via --brief and pin specific models
pnpm start "your topic" \
--provider ollama \
--tts-provider chatterbox \
--image-provider ollama \
--ollama-model llama3.1:8b \
--ollama-image-model x/flux2-klein:4b \
--brief "Solar panels cost $10k upfront but save $50k over 20 years. Mood: informative."
```

> **Linux/Windows users:** Ollama image generation is currently macOS-only. Use `--image-provider gemini` (free tier available) or `--image-provider openai` instead, and provide the relevant API key.

#### Mix and match β€” combine free and paid providers

Each provider is independent. You can freely mix local and cloud options to get the best trade-off between cost, speed, and quality.

```bash
# Best quality script + free TTS + free images (macOS for now)
# Requires: ANTHROPIC_API_KEY
pnpm start "your topic" \
--provider anthropic \
--tts-provider chatterbox \
--image-provider ollama

# Free script + paid TTS for higher quality voice + free images (macOS for now)
# Requires: ELEVENLABS_API_KEY
pnpm start "your topic" \
--provider ollama \
--tts-provider elevenlabs \
--image-provider ollama

# Free everything on Linux/Windows (Ollama image gen is macOS-only, use Gemini instead)
# Requires: GOOGLE_API_KEY
pnpm start "your topic" \
--provider ollama \
--tts-provider chatterbox \
--image-provider gemini

# Free script + free TTS + best image quality (OpenAI DALL-E)
# Requires: OPENAI_API_KEY
pnpm start "your topic" \
--provider ollama \
--tts-provider chatterbox \
--image-provider openai
```

### CLI flags

| Flag | Description | Default |
|------|-------------|---------|
| `--archetype <name>` | Override visual archetype | LLM chooses |
| `--provider <name>` | LLM provider (`anthropic` or `openai`) | `anthropic` |
| `--tts-provider <name>` | TTS provider (`elevenlabs` or `inworld`) | `elevenlabs` |
| `--provider <name>` | LLM provider (`anthropic`, `openai`, `ollama`) | `anthropic` |
| `--tts-provider <name>` | TTS provider (`elevenlabs`, `inworld`, `chatterbox`) | `elevenlabs` |
| `-i, --image-provider <name>` | Image provider (`gemini`, `openai`, `ollama`) | `gemini` |
| `--platform <name>` | Target platform (`youtube`, `tiktok`, `instagram`) | `youtube` |
| `--dry-run` | Output DirectorScore JSON without generating assets | off |
| `--preview` | Open Remotion Studio after rendering | off |
| `-o, --output <dir>` | Output directory | `./output` |
| `-y, --yes` | Auto-confirm cost estimation (for Docker/CI) | off |
| `--brief <text>` | Topic context for Ollama mode (skips interactive prompt) | β€” |
| `--ollama-model <name>` | Ollama LLM model name | interactive selection |
| `--ollama-image-model <name>` | Ollama image generation model name | interactive selection |
| `--ollama-host <url>` | Ollama API host URL | `http://localhost:11434` |
| `--chatterbox-device <device>` | PyTorch device for Chatterbox (`cpu`, `cuda`, `mps`) | auto-detected |
| `--chatterbox-audio-prompt <path>` | Reference WAV for Chatterbox voice cloning (5–10s) | β€” |

## Archetypes

Expand Down
89 changes: 89 additions & 0 deletions scripts/chatterbox_tts.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""
Chatterbox Turbo TTS bridge for OpenReels.

Generates speech from text using ResembleAI's Chatterbox Turbo model and writes:
- A WAV audio file
- A JSON file containing approximate word-level timestamps

Usage:
python scripts/chatterbox_tts.py \
--text "Your script here" \
--out /tmp/output.wav \
--timestamps /tmp/timestamps.json \
[--device cpu|cuda|mps] \
[--audio-prompt /path/to/reference.wav]

First run will download ~1.5 GB of model weights automatically.
Subsequent runs use the cached weights (usually ~/.cache/huggingface/).

Requirements:
pip install chatterbox-tts
"""

import argparse
import json
import sys


def main() -> None:
    parser = argparse.ArgumentParser(description="Chatterbox Turbo TTS bridge")
    parser.add_argument("--text", required=True, help="Text to synthesize")
    parser.add_argument("--out", required=True, help="Output WAV file path")
    parser.add_argument("--timestamps", required=True, help="Output JSON timestamps file path")
    parser.add_argument(
        "--device",
        default="cpu",
        choices=["cpu", "cuda", "mps"],
        help="PyTorch device (default: cpu; use mps on Apple Silicon, cuda on NVIDIA GPU)",
    )
    parser.add_argument(
        "--audio-prompt",
        default=None,
        help="Optional path to a reference WAV file for zero-shot voice cloning (5–10s recommended)",
    )
    args = parser.parse_args()

    try:
        import torchaudio as ta
        from chatterbox.tts_turbo import ChatterboxTurboTTS
    except ImportError as e:
        print(
            f"ERROR: {e}\n"
            "Chatterbox Turbo is not installed.\n"
            "Install it with: pip install chatterbox-tts\n"
            "Python 3.11 is strongly recommended.",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Loading Chatterbox Turbo model (first run downloads ~1.5 GB)...", file=sys.stderr)
    model = ChatterboxTurboTTS.from_pretrained(device=args.device)

    generate_kwargs: dict = {"audio_prompt_path": args.audio_prompt} if args.audio_prompt else {}
    wav = model.generate(args.text, **generate_kwargs)

    ta.save(args.out, wav, model.sr)
    print(f"Audio saved to: {args.out}", file=sys.stderr)

    # Chatterbox Turbo does not expose word-level timestamps natively.
    # We approximate by distributing words evenly across the total audio duration.
    # Caption timing will be approximate but functional.
    duration_sec: float = wav.shape[-1] / model.sr
    words = args.text.split()
    if not words:
        timestamps = []
    else:
        step = duration_sec / len(words)
        timestamps = [
            {"word": word, "start": round(i * step, 4), "end": round((i + 1) * step, 4)}
            for i, word in enumerate(words)
        ]

    with open(args.timestamps, "w", encoding="utf-8") as f:
        json.dump(timestamps, f)
    print(f"Timestamps saved to: {args.timestamps}", file=sys.stderr)


if __name__ == "__main__":
    main()
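For orientation (not part of this diff): the Node side can drive this bridge roughly as below. `pythonBin` would be the venv interpreter from the auto-install step, and the actual wiring in `src/` may differ.

```ts
import { execFileSync } from "node:child_process";
import { readFileSync } from "node:fs";

// Sketch: run the Python bridge, then read back the approximate word timestamps.
function runChatterbox(pythonBin: string, text: string, wavPath: string, jsonPath: string) {
  execFileSync(
    pythonBin,
    ["scripts/chatterbox_tts.py", "--text", text, "--out", wavPath, "--timestamps", jsonPath],
    { stdio: "inherit" },
  );
  return JSON.parse(readFileSync(jsonPath, "utf-8")) as {
    word: string;
    start: number;
    end: number;
  }[];
}
```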
15 changes: 13 additions & 2 deletions src/agents/creative-director.ts
@@ -34,7 +34,11 @@ export async function generateDirectorScore(
  llm: LLMProvider,
  topic: string,
  researchContext: ResearchResult,
  options?: { archetype?: string },
  options?: {
    archetype?: string;
    /** Restrict which visual types the director may use. Defaults to all four. */
    allowedVisualTypes?: VisualType[];
  },
): Promise<DirectorScoreOutput> {
  let systemPrompt = buildDefaultPrompt();

@@ -57,6 +61,13 @@
    ? `Use the "${options.archetype}" archetype.`
    : `Choose from: ${archetypes.join(", ")}`;

  const allVisualTypes = VisualType.options;
  const allowed: VisualType[] = options?.allowedVisualTypes ?? [...allVisualTypes];
  const visualTypeConstraint = allowed.length === allVisualTypes.length
    ? `Use all 4 visual types (ai_image, stock_image, stock_video, text_card).`
    : `IMPORTANT: You may ONLY use these visual types: ${allowed.join(", ")}. ` +
      `Do NOT use ${allVisualTypes.filter((t) => !allowed.includes(t)).join(" or ")} — those providers are not available.\n`;

  const userMessage = `Topic: ${topic}

Research context:
@@ -69,7 +80,7 @@ Mood: ${researchContext.mood}

${archetypeInstruction}

Create a DirectorScore with 4-7 scenes. Use all 4 visual types (ai_image, stock_image, stock_video, text_card).
Create a DirectorScore with 4-7 scenes. ${visualTypeConstraint}
CRITICAL RULE: Never use the same visual_type more than 2 times in a row.
Every scene MUST have a script_line (the voiceover text).
The first scene should be a strong hook.
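A hypothetical call site for the new option, restricting the director when stock providers are unconfigured (the gating condition is illustrative; the real pipeline may decide differently):

```ts
// Sketch: only offer stock visual types when a stock-footage key is configured.
const allowedVisualTypes: VisualType[] = process.env["PEXELS_API_KEY"]
  ? ["ai_image", "stock_image", "stock_video", "text_card"]
  : ["ai_image", "text_card"];

const score = await generateDirectorScore(llm, topic, research, { allowedVisualTypes });
```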
6 changes: 6 additions & 0 deletions src/cli/args.test.ts
@@ -15,6 +15,9 @@ describe("CLIOptions type", () => {
      preview: false,
      output: "./output",
      yes: true,
      ollamaModel: "llama3.2",
      ollamaImageModel: "x/flux2-klein",
      ollamaHost: "http://localhost:11434",
    };
    expect(opts.yes).toBe(true);
  });
@@ -30,6 +33,9 @@ describe("CLIOptions type", () => {
      preview: false,
      output: "./output",
      yes: false,
      ollamaModel: "llama3.2",
      ollamaImageModel: "x/flux2-klein",
      ollamaHost: "http://localhost:11434",
    };
    expect(opts.yes).toBe(false);
  });
26 changes: 23 additions & 3 deletions src/cli/args.ts
@@ -16,6 +16,14 @@ export interface CLIOptions {
  preview: boolean;
  output: string;
  yes: boolean;
  brief?: string;
  /** Explicitly provided via --ollama-model. When undefined, model is selected interactively. */
  ollamaModel?: string;
  /** Explicitly provided via --ollama-image-model. When undefined, model is selected interactively. */
  ollamaImageModel?: string;
  ollamaHost: string;
  chatterboxDevice?: string;
  chatterboxAudioPrompt?: string;
}

export function parseArgs(): CLIOptions {
@@ -28,17 +36,17 @@
    .argument("<topic>", "The topic for your video")
    .addOption(
      new Option("-p, --provider <provider>", "LLM provider")
        .choices(["anthropic", "openai"])
        .choices(["anthropic", "openai", "ollama"])
        .default("anthropic"),
    )
    .addOption(
      new Option("-i, --image-provider <provider>", "Image generation provider")
        .choices(["gemini", "openai"])
        .choices(["gemini", "openai", "ollama"])
        .default("gemini"),
    )
    .addOption(
      new Option("--tts-provider <provider>", "TTS provider")
        .choices(["elevenlabs", "inworld"])
        .choices(["elevenlabs", "inworld", "chatterbox"])
        .default("elevenlabs"),
    )
    .option("-a, --archetype <archetype>", "Visual archetype override")
@@ -47,6 +55,12 @@
    .option("--preview", "Open Remotion Studio preview after rendering", false)
    .option("-o, --output <dir>", "Output directory", "./output")
    .option("-y, --yes", "Auto-confirm cost estimation prompt (non-interactive mode)", false)
    .option("--brief <text>", "Topic context for Ollama mode (skips interactive prompt)")
    .option("--ollama-model <name>", "Ollama LLM model name (default: interactive selection)")
    .option("--ollama-image-model <name>", "Ollama image generation model name (default: interactive selection)")
    .option("--ollama-host <url>", "Ollama API host", "http://localhost:11434")
    .option("--chatterbox-device <device>", "PyTorch device for Chatterbox TTS (cpu, cuda, mps)")
    .option("--chatterbox-audio-prompt <path>", "Path to reference WAV for Chatterbox voice cloning")
    .parse();

  const topic = program.args[0] ?? "";
@@ -67,5 +81,11 @@
    preview: opts["preview"] as boolean,
    output: opts["output"] as string,
    yes: opts["yes"] as boolean,
    brief: opts["brief"] as string | undefined,
    ollamaModel: opts["ollamaModel"] as string | undefined,
    ollamaImageModel: opts["ollamaImageModel"] as string | undefined,
    ollamaHost: opts["ollamaHost"] as string,
    chatterboxDevice: opts["chatterboxDevice"] as string | undefined,
    chatterboxAudioPrompt: opts["chatterboxAudioPrompt"] as string | undefined,
  };
}