From 789b161fa2dba1df52ded15a284a5e9e79b65c56 Mon Sep 17 00:00:00 2001 From: Hiroshi Nishio Date: Fri, 17 Apr 2026 14:31:30 -0700 Subject: [PATCH] Add Claude Opus 4.7 as default paid model, remove dead strict-tool guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add OPUS_4_7 enum, registry entry, pricing, context/output limits - Move OPUS_4_6 to fallback-only (user_selectable=False) - Update fallback chain: 4.7 → 4.6 → 4.5 → Sonnet 4.6 → 4.5 - Remove strip_strict_from_tools guard (all 4.5+ models support strict) - Add branch guard to pre-commit hook (block commits to main) - Remove list_changed_files.sh, simplify LGTM workflow - Add/update tests for all models across registry, pricing, fallback, quality checks --- CLAUDE.md | 23 ++++---- constants/claude.py | 2 + constants/models.py | 14 ++++- constants/test_models.py | 16 +++++ pyproject.toml | 2 +- scripts/git/list_changed_files.sh | 2 - services/claude/chat_with_claude.py | 9 --- services/claude/evaluate_condition.py | 4 +- services/claude/is_code_untestable.py | 4 +- services/claude/test_chat_with_claude.py | 37 ++++++++++++ services/claude/test_evaluate_condition.py | 12 ++++ .../claude/test_evaluate_quality_checks.py | 59 +++++++++++++------ services/claude/test_is_code_untestable.py | 13 ++++ services/get_fallback_models.py | 1 + .../supabase/credits/test_get_credit_price.py | 26 +++++++- .../supabase/llm_requests/calculate_costs.py | 1 + .../llm_requests/test_calculate_costs.py | 9 +++ services/test_chat_with_agent.py | 59 ++++++++++--------- services/test_chat_with_model.py | 2 +- services/test_get_fallback_models.py | 7 ++- services/webhook/setup_handler.py | 6 +- services/webhook/test_setup_handler.py | 4 ++ .../webhook/utils/test_get_preferred_model.py | 24 ++++++-- uv.lock | 2 +- 24 files changed, 245 insertions(+), 93 deletions(-) delete mode 100755 scripts/git/list_changed_files.sh diff --git a/CLAUDE.md b/CLAUDE.md index 4a885e0ee..cd2bbc832 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -119,30 +119,27 @@ assert find_test_files("foo.ts", all_files, None) == ["foo.test.ts"] **CRITICAL: NEVER start without explicit user request. PR must be clean — don't ignore failures.** -1. `git status` to see ALL changes -2. `scripts/git/list_changed_files.sh` — store for staging reference -3. Verify not on main: `git branch --show-current` -4. `git fetch origin main && git merge origin/main` -5. `git commit -m "descriptive message"` — user has already run `git add` before saying "lgtm" +1. `git fetch origin main && git merge origin/main` +2. `git commit -m "descriptive message"` — user has already run `git add` before saying "lgtm" - Pre-commit hook runs automatically (see `scripts/git/pre_commit_hook.sh`): pip-freeze, generate-types, black, ruff, print/logging checks, then pylint + pyright + pytest concurrently - Install: `ln -sf ../../scripts/git/pre_commit_hook.sh .git/hooks/pre-commit` - **If hooks fail**: fix, re-stage, commit again. Don't stage other sessions' files. - **`--no-verify`** only for trivial non-code changes - Unused mock params: `# pyright: reportUnusedVariable=false` at top - NO co-author lines or `[skip ci]` -6. Check for existing PR: `gh pr list --head $(git branch --show-current) --state open` — if exists, **STOP and ask** -7. `git push` -8. `gh pr create --title "PR title" --body "" --assignee @me` — create PR immediately, no body -9. Check recent posts: `scripts/git/recent_social_posts.sh gitauto` and `scripts/git/recent_social_posts.sh wes` -10. `gh pr edit --body "..."` — add summary and social posts after checking recent posts +3. Check for existing PR: `gh pr list --head $(git branch --show-current) --state open` — if exists, **STOP and ask** +4. `git push` +5. `gh pr create --title "PR title" --body "" --assignee @me` — create PR immediately, no body +6. Check recent posts: `scripts/git/recent_social_posts.sh gitauto` and `scripts/git/recent_social_posts.sh wes` +7. `gh pr edit --body "..."` — add summary and social posts after checking recent posts - Technical, descriptive title. **No `## Test plan`**. - **Two posts** (last section, customer-facing only): GitAuto (changelog) + Wes (personal voice, don't emphasize "GitAuto") - Format: `## Social Media Post (GitAuto)` and `## Social Media Post (Wes)` headers (parsed by `extract-social-posts.js`) - **GitAuto post**: Changelog format — one-liner headline + change bullets. No storytelling. - **Wes post**: Honest stories. Vary openers — check recent posts first. - Guidelines: No em dashes (—). Under 280 chars. No marketing keywords. No negative framing. No internal names. No small numbers — use relative language. -11. If Sentry issue: `python3 scripts/sentry/get_issue.py AGENT-XXX` then `python3 scripts/sentry/resolve_issue.py AGENT-XXX ...` -12. **Blog post** in `../website/app/blog/posts/`: +8. If Sentry issue: `python3 scripts/sentry/get_issue.py AGENT-XXX` then `python3 scripts/sentry/resolve_issue.py AGENT-XXX ...` +9. **Blog post** in `../website/app/blog/posts/`: - `YYYY-MM-DD-kebab-case-title.mdx`. Universal dev lesson, not GitAuto internals (exception: deep technical content). - **Skip if lesson is thin** — argue back if no real insight. - `metadata.title`: **34-44 chars** (layout appends `- GitAuto Blog` for 50-60 total). Verify no duplicate slug. @@ -172,7 +169,7 @@ assert find_test_files("foo.ts", all_files, None) == ["foo.test.ts"] - Unsplash API: `source .env && curl "https://api.unsplash.com/search/photos?query=QUERY&orientation=landscape&client_id=$UNSPLASH_ACCESS_KEY"`, download with `?w=1200&h=630&fit=crop&crop=entropy` - Convert to PNG: `sips -s format png downloaded.jpg --out ../website/public/og/blog/{slug}.png` - Dev.to crops to 1000x420 — keep important content centered. -13. **Docs page** in `../website/app/docs/`: Create new or update existing. Browse for best-fit category. New pages: 3 files (`page.tsx`, `layout.tsx`, `jsonld.ts`). +10. **Docs page** in `../website/app/docs/`: Create new or update existing. Browse for best-fit category. New pages: 3 files (`page.tsx`, `layout.tsx`, `jsonld.ts`). ## CRITICAL: Fixing Foxquilt PRs diff --git a/constants/claude.py b/constants/claude.py index 9da3980f3..89964be2c 100644 --- a/constants/claude.py +++ b/constants/claude.py @@ -7,6 +7,7 @@ # https://platform.claude.com/docs/en/docs/about-claude/models/all-models#model-comparison-table CONTEXT_WINDOW: dict[ClaudeModelId, int] = { + ClaudeModelId.OPUS_4_7: 1_000_000, ClaudeModelId.OPUS_4_6: 1_000_000, ClaudeModelId.SONNET_4_6: 1_000_000, ClaudeModelId.OPUS_4_5: 200_000, @@ -15,6 +16,7 @@ } MAX_OUTPUT_TOKENS: dict[ClaudeModelId, int] = { + ClaudeModelId.OPUS_4_7: 128_000, ClaudeModelId.OPUS_4_6: 128_000, ClaudeModelId.SONNET_4_6: 64_000, ClaudeModelId.OPUS_4_5: 64_000, diff --git a/constants/models.py b/constants/models.py index 9ec76cc00..a52d91054 100644 --- a/constants/models.py +++ b/constants/models.py @@ -5,6 +5,7 @@ class ClaudeModelId(StrEnum): """Claude models — user-selectable and fallback-only.""" + OPUS_4_7 = "claude-opus-4-7" OPUS_4_6 = "claude-opus-4-6" SONNET_4_6 = "claude-sonnet-4-6" OPUS_4_5 = "claude-opus-4-5" @@ -41,9 +42,9 @@ class ModelInfo(TypedDict): MODEL_REGISTRY: dict[ModelId, ModelInfo] = { # Claude (user-selectable) - ClaudeModelId.OPUS_4_6: ModelInfo( + ClaudeModelId.OPUS_4_7: ModelInfo( provider=ModelProvider.CLAUDE, - display_name="Claude Opus 4.6", + display_name="Claude Opus 4.7", credit_cost_usd=8, user_selectable=True, free_tier=False, @@ -56,6 +57,13 @@ class ModelInfo(TypedDict): free_tier=True, ), # Claude (fallback-only, same cost as their newer versions) + ClaudeModelId.OPUS_4_6: ModelInfo( + provider=ModelProvider.CLAUDE, + display_name="Claude Opus 4.6", + credit_cost_usd=8, + user_selectable=False, + free_tier=False, + ), ClaudeModelId.OPUS_4_5: ModelInfo( provider=ModelProvider.CLAUDE, display_name="Claude Opus 4.5", @@ -100,6 +108,6 @@ class ModelInfo(TypedDict): m for m, r in MODEL_REGISTRY.items() if r["user_selectable"] and r["free_tier"] ] DEFAULT_FREE_MODEL = GoogleModelId.GEMMA_4_31B -DEFAULT_PAID_MODEL = ClaudeModelId.OPUS_4_6 +DEFAULT_PAID_MODEL = ClaudeModelId.OPUS_4_7 MAX_CREDIT_COST_USD = max(entry["credit_cost_usd"] for entry in MODEL_REGISTRY.values()) CREDIT_GRANT_AMOUNT_USD = MAX_CREDIT_COST_USD * 3 diff --git a/constants/test_models.py b/constants/test_models.py index b821a3372..ea70e52d2 100644 --- a/constants/test_models.py +++ b/constants/test_models.py @@ -64,3 +64,19 @@ def test_anthropic_models_exist(): m for m, r in MODEL_REGISTRY.items() if r["provider"] == ModelProvider.CLAUDE ] assert len(anthropic_models) >= 1 + + +def test_opus_47_is_user_selectable(): + info = MODEL_REGISTRY[ClaudeModelId.OPUS_4_7] + assert info["user_selectable"] is True + assert info["credit_cost_usd"] == 8 + + +def test_opus_46_is_fallback_only(): + info = MODEL_REGISTRY[ClaudeModelId.OPUS_4_6] + assert info["user_selectable"] is False + assert info["credit_cost_usd"] == 8 + + +def test_default_paid_model_is_opus_47(): + assert DEFAULT_PAID_MODEL == ClaudeModelId.OPUS_4_7 diff --git a/pyproject.toml b/pyproject.toml index 1cc9c8bbf..68787fded 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "GitAuto" -version = "1.6.6" +version = "1.6.10" requires-python = ">=3.14" dependencies = [ "annotated-doc==0.0.4", diff --git a/scripts/git/list_changed_files.sh b/scripts/git/list_changed_files.sh deleted file mode 100755 index 5bde394bb..000000000 --- a/scripts/git/list_changed_files.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -(git diff --name-only; git diff --name-only --staged; git ls-files --others --exclude-standard) | sort -u diff --git a/services/claude/chat_with_claude.py b/services/claude/chat_with_claude.py index d204b6b67..cb567f105 100644 --- a/services/claude/chat_with_claude.py +++ b/services/claude/chat_with_claude.py @@ -14,7 +14,6 @@ from services.claude.remove_outdated_file_edit_attempts import ( remove_outdated_file_edit_attempts, ) -from services.claude.strip_strict_from_tools import strip_strict_from_tools from services.claude.trim_messages import trim_messages_to_token_limit from services.llm_result import LlmResult, ToolCall from services.supabase.llm_requests.insert_llm_request import insert_llm_request @@ -43,14 +42,6 @@ def chat_with_claude( messages=messages, client=claude, model=model_id, max_input=max_input ) - # Strip "strict" from tools for models that don't support it - if model_id not in ( - ClaudeModelId.SONNET_4_6, - ClaudeModelId.SONNET_4_5, - ClaudeModelId.OPUS_4_6, - ): - tools = strip_strict_from_tools(tools) - # https://docs.anthropic.com/en/api/messages start_time = time.time() try: diff --git a/services/claude/evaluate_condition.py b/services/claude/evaluate_condition.py index b9fd771b1..5107066d0 100644 --- a/services/claude/evaluate_condition.py +++ b/services/claude/evaluate_condition.py @@ -46,8 +46,8 @@ def evaluate_condition( return EvaluationResult(False, "empty input") response = claude.beta.messages.create( - model=ClaudeModelId.OPUS_4_6, - max_tokens=MAX_OUTPUT_TOKENS[ClaudeModelId.OPUS_4_6], + model=ClaudeModelId.OPUS_4_7, + max_tokens=MAX_OUTPUT_TOKENS[ClaudeModelId.OPUS_4_7], temperature=0, system=system_prompt, messages=[{"role": "user", "content": content}], diff --git a/services/claude/is_code_untestable.py b/services/claude/is_code_untestable.py index 209a36a1c..703ed2c18 100644 --- a/services/claude/is_code_untestable.py +++ b/services/claude/is_code_untestable.py @@ -106,8 +106,8 @@ def is_code_untestable( Is this code dead (unreachable/redundant) or genuinely untestable (reachable at runtime but impossible to test)?""" response = claude.beta.messages.create( - model=ClaudeModelId.OPUS_4_6, - max_tokens=MAX_OUTPUT_TOKENS[ClaudeModelId.OPUS_4_6], + model=ClaudeModelId.OPUS_4_7, + max_tokens=MAX_OUTPUT_TOKENS[ClaudeModelId.OPUS_4_7], temperature=0, system=SYSTEM_PROMPT, messages=[{"role": "user", "content": content}], diff --git a/services/claude/test_chat_with_claude.py b/services/claude/test_chat_with_claude.py index 8caf65b8f..2f716c5e3 100644 --- a/services/claude/test_chat_with_claude.py +++ b/services/claude/test_chat_with_claude.py @@ -147,3 +147,40 @@ def test_chat_with_claude_calls_optimization_functions( ) mock_remove_outdated_file_edit_attempts.assert_called_once() + + +@patch("services.claude.chat_with_claude.insert_llm_request") +@patch("services.claude.chat_with_claude.claude") +def test_strict_tools_passed_through_unchanged(mock_claude, mock_insert_llm_request): + """Strict tools must not be stripped — all current models support strict.""" + mock_response = Mock() + mock_response.content = [Mock(type="text", text="ok")] + mock_response.usage = Mock(output_tokens=5) + mock_insert_llm_request.return_value = {"total_cost_usd": 0.01} + mock_claude.messages.create.return_value = mock_response + mock_claude.messages.count_tokens.return_value = Mock(input_tokens=10) + + tools = cast( + list[ToolUnionParam], + [ + { + "name": "test_tool", + "description": "Test", + "strict": True, + "input_schema": {"type": "object", "properties": {}}, + } + ], + ) + + chat_with_claude( + messages=cast(list[MessageParam], [{"role": "user", "content": "test"}]), + system_content="system", + tools=tools, + model_id=ClaudeModelId.HAIKU_4_5, + usage_id=999, + created_by="4:test-user", + ) + + call_args = mock_claude.messages.create.call_args + passed_tools = call_args.kwargs["tools"] + assert passed_tools[0]["strict"] is True diff --git a/services/claude/test_evaluate_condition.py b/services/claude/test_evaluate_condition.py index 2ab604acf..09ef29b74 100644 --- a/services/claude/test_evaluate_condition.py +++ b/services/claude/test_evaluate_condition.py @@ -66,6 +66,18 @@ def test_returns_evaluation_failed_on_invalid_json(self, mock_claude): assert result == EvaluationResult(False, "evaluation failed") + def test_uses_opus_47_model(self, mock_claude): + mock_response = MagicMock() + mock_response.content = [ + MagicMock(text='{"result": true, "reason": "testable"}') + ] + mock_claude.beta.messages.create.return_value = mock_response + + evaluate_condition(content="code", system_prompt="Check this.") + + call_args = mock_claude.beta.messages.create.call_args + assert call_args.kwargs["model"] == "claude-opus-4-7" + def test_uses_structured_output_schema(self, mock_claude): mock_response = MagicMock() mock_response.content = [ diff --git a/services/claude/test_evaluate_quality_checks.py b/services/claude/test_evaluate_quality_checks.py index d70c2b5b8..e18736935 100644 --- a/services/claude/test_evaluate_quality_checks.py +++ b/services/claude/test_evaluate_quality_checks.py @@ -1,12 +1,12 @@ # pyright: reportArgumentType=false from unittest.mock import MagicMock, patch +from constants.claude import MAX_OUTPUT_TOKENS from constants.models import ClaudeModelId from services.claude.evaluate_quality_checks import evaluate_quality_checks -@patch("services.claude.evaluate_quality_checks.claude") -def test_uses_opus_model(mock_claude): +def _mock_claude_call(mock_claude, model: ClaudeModelId): mock_content = MagicMock() mock_content.text = '{"business_logic": {}}' mock_claude.messages.create.return_value = MagicMock(content=[mock_content]) @@ -15,26 +15,49 @@ def test_uses_opus_model(mock_claude): source_content="const x = 1;", source_path="src/foo.ts", test_files=[("test/foo.spec.ts", "it('works', () => {})")], - model=ClaudeModelId.OPUS_4_6, + model=model, ) - call_kwargs = mock_claude.messages.create.call_args.kwargs - assert call_kwargs["model"] == "claude-opus-4-6" + return mock_claude.messages.create.call_args.kwargs @patch("services.claude.evaluate_quality_checks.claude") -def test_uses_max_tokens_matching_model(mock_claude): - mock_content = MagicMock() - mock_content.text = '{"business_logic": {}}' - mock_claude.messages.create.return_value = MagicMock(content=[mock_content]) +def test_opus_47_passes_model_and_max_tokens(mock_claude): + kwargs = _mock_claude_call(mock_claude, ClaudeModelId.OPUS_4_7) + assert kwargs["model"] == "claude-opus-4-7" + assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.OPUS_4_7] - evaluate_quality_checks( - source_content="const x = 1;", - source_path="src/foo.ts", - test_files=[("test/foo.spec.ts", "it('works', () => {})")], - model=ClaudeModelId.OPUS_4_6, - ) - call_kwargs = mock_claude.messages.create.call_args.kwargs - # Opus 4.6 has 128_000 max tokens - assert call_kwargs["max_tokens"] == 128_000 +@patch("services.claude.evaluate_quality_checks.claude") +def test_opus_46_passes_model_and_max_tokens(mock_claude): + kwargs = _mock_claude_call(mock_claude, ClaudeModelId.OPUS_4_6) + assert kwargs["model"] == "claude-opus-4-6" + assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.OPUS_4_6] + + +@patch("services.claude.evaluate_quality_checks.claude") +def test_sonnet_46_passes_model_and_max_tokens(mock_claude): + kwargs = _mock_claude_call(mock_claude, ClaudeModelId.SONNET_4_6) + assert kwargs["model"] == "claude-sonnet-4-6" + assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.SONNET_4_6] + + +@patch("services.claude.evaluate_quality_checks.claude") +def test_opus_45_passes_model_and_max_tokens(mock_claude): + kwargs = _mock_claude_call(mock_claude, ClaudeModelId.OPUS_4_5) + assert kwargs["model"] == "claude-opus-4-5" + assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.OPUS_4_5] + + +@patch("services.claude.evaluate_quality_checks.claude") +def test_sonnet_45_passes_model_and_max_tokens(mock_claude): + kwargs = _mock_claude_call(mock_claude, ClaudeModelId.SONNET_4_5) + assert kwargs["model"] == "claude-sonnet-4-5" + assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.SONNET_4_5] + + +@patch("services.claude.evaluate_quality_checks.claude") +def test_haiku_45_passes_model_and_max_tokens(mock_claude): + kwargs = _mock_claude_call(mock_claude, ClaudeModelId.HAIKU_4_5) + assert kwargs["model"] == "claude-haiku-4-5" + assert kwargs["max_tokens"] == MAX_OUTPUT_TOKENS[ClaudeModelId.HAIKU_4_5] diff --git a/services/claude/test_is_code_untestable.py b/services/claude/test_is_code_untestable.py index 1b54a8e5e..263e6c8ca 100644 --- a/services/claude/test_is_code_untestable.py +++ b/services/claude/test_is_code_untestable.py @@ -28,6 +28,19 @@ def _set_mock_response(mock_claude, result: bool, category: str, reason: str): ] +def test_uses_opus_47_model(mock_claude): + _set_mock_response(mock_claude, False, "testable", "testable") + + is_code_untestable( + file_path="src/app.tsx", + file_content="const x = 1;", + uncovered_lines="1", + ) + + call_args = mock_claude.beta.messages.create.call_args + assert call_args.kwargs["model"] == "claude-opus-4-7" + + def test_returns_testable_when_no_uncovered_code(mock_claude): result = is_code_untestable( file_path="src/app.tsx", diff --git a/services/get_fallback_models.py b/services/get_fallback_models.py index 3a692ea79..bf1d48383 100644 --- a/services/get_fallback_models.py +++ b/services/get_fallback_models.py @@ -10,6 +10,7 @@ # Full fallback chains for resilience — includes non-user-selectable models CLAUDE_FALLBACK_MODELS: list[ModelId] = [ + ClaudeModelId.OPUS_4_7, ClaudeModelId.OPUS_4_6, ClaudeModelId.OPUS_4_5, ClaudeModelId.SONNET_4_6, diff --git a/services/supabase/credits/test_get_credit_price.py b/services/supabase/credits/test_get_credit_price.py index 391f16787..d031b4d67 100644 --- a/services/supabase/credits/test_get_credit_price.py +++ b/services/supabase/credits/test_get_credit_price.py @@ -16,9 +16,33 @@ def test_returns_max_cost_for_none(): assert get_credit_price(None) == MAX_CREDIT_COST_USD -def test_opus_costs_8(): +def test_opus_47_costs_8(): + assert get_credit_price(ClaudeModelId.OPUS_4_7) == 8 + + +def test_opus_46_costs_8(): assert get_credit_price(ClaudeModelId.OPUS_4_6) == 8 +def test_sonnet_46_costs_4(): + assert get_credit_price(ClaudeModelId.SONNET_4_6) == 4 + + +def test_opus_45_costs_8(): + assert get_credit_price(ClaudeModelId.OPUS_4_5) == 8 + + +def test_sonnet_45_costs_4(): + assert get_credit_price(ClaudeModelId.SONNET_4_5) == 4 + + +def test_haiku_45_costs_2(): + assert get_credit_price(ClaudeModelId.HAIKU_4_5) == 2 + + +def test_gemini_25_flash_costs_4(): + assert get_credit_price(GoogleModelId.GEMINI_2_5_FLASH) == 4 + + def test_gemma_costs_2(): assert get_credit_price(GoogleModelId.GEMMA_4_31B) == 2 diff --git a/services/supabase/llm_requests/calculate_costs.py b/services/supabase/llm_requests/calculate_costs.py index 701c9f493..1914965c2 100644 --- a/services/supabase/llm_requests/calculate_costs.py +++ b/services/supabase/llm_requests/calculate_costs.py @@ -11,6 +11,7 @@ def calculate_costs( # Pricing per 1M tokens (input/output) pricing = { "claude": { + "claude-opus-4-7": {"input": 5.00, "output": 25.00}, "claude-opus-4-6": {"input": 5.00, "output": 25.00}, "claude-opus-4-5": {"input": 5.00, "output": 25.00}, "claude-sonnet-4-6": {"input": 3.00, "output": 15.00}, diff --git a/services/supabase/llm_requests/test_calculate_costs.py b/services/supabase/llm_requests/test_calculate_costs.py index e86eddd13..0ad2e7398 100644 --- a/services/supabase/llm_requests/test_calculate_costs.py +++ b/services/supabase/llm_requests/test_calculate_costs.py @@ -3,6 +3,15 @@ from services.supabase.llm_requests.calculate_costs import calculate_costs +def test_calculate_costs_claude_opus_47(): + input_cost, output_cost = calculate_costs("claude", "claude-opus-4-7", 1000, 500) + expected_input = (1000 / 1_000_000) * 5.00 + expected_output = (500 / 1_000_000) * 25.00 + + assert input_cost == expected_input + assert output_cost == expected_output + + def test_calculate_costs_claude_opus_46(): input_cost, output_cost = calculate_costs("claude", "claude-opus-4-6", 1000, 500) expected_input = (1000 / 1_000_000) * 5.00 diff --git a/services/test_chat_with_agent.py b/services/test_chat_with_agent.py index 230fdc37e..9bcf7ba9a 100644 --- a/services/test_chat_with_agent.py +++ b/services/test_chat_with_agent.py @@ -81,7 +81,7 @@ async def test_cost_usd_computed_for_claude_model( cost_usd=0.1625, ) - base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_6) + base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_7) result = await chat_with_agent( messages=[{"role": "user", "content": "test"}], @@ -89,7 +89,7 @@ async def test_cost_usd_computed_for_claude_model( base_args=base_args, tools=[], usage_id=789, - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, ) assert result.cost_usd == 0.1625 @@ -211,7 +211,7 @@ async def test_delete_file_logging( cost_usd=0.05, ) - base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_6) + base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_7) with patch("services.chat_with_agent.tools_to_call") as mock_tools: mock_tools.__getitem__.return_value = Mock( @@ -224,7 +224,7 @@ async def test_delete_file_logging( base_args=base_args, tools=[], usage_id=123, - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, ) call_args = mock_update_comment.call_args_list @@ -435,7 +435,7 @@ async def test_verify_task_is_complete_with_pr_changes_returns_is_completed_true ) base_args = create_test_base_args( - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, owner="test-owner", repo="test-repo", pr_number=123, @@ -448,7 +448,7 @@ async def test_verify_task_is_complete_with_pr_changes_returns_is_completed_true base_args=base_args, tools=[], usage_id=123, - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, ) is_completed = result.is_completed @@ -667,7 +667,7 @@ async def test_file_write_result_success_includes_formatted_content( cost_usd=0.05, ) - base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_6) + base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_7) with patch("services.chat_with_agent.tools_to_call") as mock_tools: mock_tools.__contains__.return_value = True @@ -686,7 +686,7 @@ async def test_file_write_result_success_includes_formatted_content( base_args=base_args, tools=[], usage_id=123, - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, ) messages = result.messages @@ -927,7 +927,7 @@ async def test_full_file_read_calls_replace_with_is_full_file_read_true( cost_usd=0.05, ) - base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_6) + base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_7) with patch("services.chat_with_agent.tools_to_call") as mock_tools: mock_tools.__getitem__.return_value = Mock( @@ -941,7 +941,7 @@ async def test_full_file_read_calls_replace_with_is_full_file_read_true( base_args=base_args, tools=[], usage_id=123, - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, ) mock_replace.assert_called_once() @@ -1164,19 +1164,16 @@ async def test_gitauto_md_edit_always_allowed( @pytest.mark.asyncio @patch("services.chat_with_agent.chat_with_model") -async def test_opus_falls_back_to_opus_45_on_error( +async def test_opus_falls_back_through_chain_on_error( mock_chat_with_model, create_test_base_args ): - """Opus 4.6 ($8) falls back to Opus 4.5 ($8), not Sonnet.""" - call_count = 0 + """Opus 4.7 ($8) falls back through 4.6 → 4.5, not Sonnet.""" + models_tried: list[ModelId] = [] def side_effect(**kwargs): - nonlocal call_count - call_count += 1 - if call_count == 1: - assert kwargs["model_id"] == ClaudeModelId.OPUS_4_6 - raise RuntimeError("Opus 4.6 down") - assert kwargs["model_id"] == ClaudeModelId.OPUS_4_5 + models_tried.append(kwargs["model_id"]) + if len(models_tried) < 3: + raise RuntimeError("model down") return LlmResult( assistant_message={"role": "assistant", "content": "ok"}, tool_calls=[], @@ -1186,7 +1183,7 @@ def side_effect(**kwargs): ) mock_chat_with_model.side_effect = side_effect - base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_6) + base_args = create_test_base_args(model_id=ClaudeModelId.OPUS_4_7) await chat_with_agent( messages=[{"role": "user", "content": "test"}], @@ -1194,10 +1191,14 @@ def side_effect(**kwargs): base_args=base_args, tools=[], usage_id=1, - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, ) - assert call_count == 2 + assert models_tried == [ + ClaudeModelId.OPUS_4_7, + ClaudeModelId.OPUS_4_6, + ClaudeModelId.OPUS_4_5, + ] @pytest.mark.asyncio @@ -1236,7 +1237,7 @@ def side_effect(**kwargs): for model in models_tried: assert model not in ( - ClaudeModelId.OPUS_4_6, + ClaudeModelId.OPUS_4_7, ClaudeModelId.OPUS_4_5, ), f"Sonnet fallback tried Opus model {model}" @@ -1276,18 +1277,18 @@ def side_effect(**kwargs): async def test_overload_retries_then_falls_back( mock_chat_with_model, _mock_sleep, create_test_base_args ): - """Overload retries exhaust on Opus 4.6, then falls back to Opus 4.5.""" + """Overload retries exhaust on Opus 4.7, then falls back to Opus 4.6.""" call_count = 0 def side_effect(**kwargs): nonlocal call_count call_count += 1 - # First 3 calls: Opus 4.6 overloaded (1 initial + 2 retries) + # First 3 calls: Opus 4.7 overloaded (1 initial + 2 retries) if call_count <= 3: - assert kwargs["model_id"] == ClaudeModelId.OPUS_4_6 + assert kwargs["model_id"] == ClaudeModelId.OPUS_4_7 raise ClaudeOverloadedError("529") - # 4th call: Opus 4.5 succeeds - assert kwargs["model_id"] == ClaudeModelId.OPUS_4_5 + # 4th call: falls back to Opus 4.6 and succeeds + assert kwargs["model_id"] == ClaudeModelId.OPUS_4_6 return LlmResult( assistant_message={"role": "assistant", "content": "ok"}, tool_calls=[], @@ -1305,7 +1306,7 @@ def side_effect(**kwargs): base_args=base_args, tools=[], usage_id=1, - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, ) assert call_count == 4 diff --git a/services/test_chat_with_model.py b/services/test_chat_with_model.py index 42445d602..dcc1620df 100644 --- a/services/test_chat_with_model.py +++ b/services/test_chat_with_model.py @@ -24,7 +24,7 @@ def test_routes_to_anthropic_for_opus(mock_claude: MagicMock): messages=[{"role": "user", "content": "test"}], system_content="system", tools=[], - model_id=ClaudeModelId.OPUS_4_6, + model_id=ClaudeModelId.OPUS_4_7, usage_id=1, created_by="test", ) diff --git a/services/test_get_fallback_models.py b/services/test_get_fallback_models.py index f29be7c21..a75f50f23 100644 --- a/services/test_get_fallback_models.py +++ b/services/test_get_fallback_models.py @@ -9,6 +9,7 @@ def test_claude_chain_order(): """Claude chain has newest Opus first, then fallback models.""" assert CLAUDE_FALLBACK_MODELS == [ + ClaudeModelId.OPUS_4_7, ClaudeModelId.OPUS_4_6, ClaudeModelId.OPUS_4_5, ClaudeModelId.SONNET_4_6, @@ -26,9 +27,10 @@ def test_google_chain_order(): def test_get_fallback_models_opus_excludes_self(): """Opus fallbacks don't include Opus itself.""" - fallbacks = get_fallback_models(ClaudeModelId.OPUS_4_6) - assert ClaudeModelId.OPUS_4_6 not in fallbacks + fallbacks = get_fallback_models(ClaudeModelId.OPUS_4_7) + assert ClaudeModelId.OPUS_4_7 not in fallbacks assert fallbacks == [ + ClaudeModelId.OPUS_4_6, ClaudeModelId.OPUS_4_5, ClaudeModelId.SONNET_4_6, ClaudeModelId.SONNET_4_5, @@ -39,6 +41,7 @@ def test_get_fallback_models_sonnet_46_excludes_opus(): """Sonnet ($4) never falls back to Opus ($8).""" fallbacks = get_fallback_models(ClaudeModelId.SONNET_4_6) assert fallbacks == [ClaudeModelId.SONNET_4_5] + assert ClaudeModelId.OPUS_4_7 not in fallbacks assert ClaudeModelId.OPUS_4_6 not in fallbacks assert ClaudeModelId.OPUS_4_5 not in fallbacks diff --git a/services/webhook/setup_handler.py b/services/webhook/setup_handler.py index 60913f0a6..8c4124208 100644 --- a/services/webhook/setup_handler.py +++ b/services/webhook/setup_handler.py @@ -59,9 +59,9 @@ async def setup_handler( ): set_owner_repo(owner_name, repo_name) set_trigger("setup") - model_id = ( - ClaudeModelId.OPUS_4_6 - ) # Setup runs once per repo, needs reliable tool-use + + # Setup runs once per repo, needs reliable tool-use + model_id = ClaudeModelId.OPUS_4_7 logger.info( "Setup triggered by sender_name=%s sender_id=%d source=%s for %s/%s", sender_name, diff --git a/services/webhook/test_setup_handler.py b/services/webhook/test_setup_handler.py index ee00d56c5..691cc6ded 100644 --- a/services/webhook/test_setup_handler.py +++ b/services/webhook/test_setup_handler.py @@ -95,6 +95,10 @@ async def test_not_completed_closes_pr_and_deletes_branch( mock_close_pr.assert_called_once() mock_delete_branch.assert_called_once() + # Setup handler hardcodes Opus 4.7 for reliable tool-use + agent_kwargs = mock_agent.call_args.kwargs + assert agent_kwargs["model_id"] == "claude-opus-4-7" + @pytest.mark.asyncio @patch(f"{MODULE}.slack_notify") diff --git a/services/webhook/utils/test_get_preferred_model.py b/services/webhook/utils/test_get_preferred_model.py index 38df2256e..f82c53dcd 100644 --- a/services/webhook/utils/test_get_preferred_model.py +++ b/services/webhook/utils/test_get_preferred_model.py @@ -94,6 +94,18 @@ def test_typo_model_paid_user(): # --- Non-user-selectable models (fallback-only) --- +def test_non_selectable_opus_4_6_free_user(): + settings = cast(Repositories, {"preferred_model": ClaudeModelId.OPUS_4_6}) + result = get_preferred_model(repo_settings=settings, is_paid=False) + assert result == DEFAULT_FREE_MODEL + + +def test_non_selectable_opus_4_6_paid_user(): + settings = cast(Repositories, {"preferred_model": ClaudeModelId.OPUS_4_6}) + result = get_preferred_model(repo_settings=settings, is_paid=True) + assert result == DEFAULT_PAID_MODEL + + def test_non_selectable_opus_4_5_free_user(): settings = cast(Repositories, {"preferred_model": ClaudeModelId.OPUS_4_5}) result = get_preferred_model(repo_settings=settings, is_paid=False) @@ -154,19 +166,19 @@ def test_free_user_selects_sonnet_4_6(): # --- Free user with premium models (DB setting honored regardless of tier) --- -def test_free_user_selects_opus_4_6_honored(): - settings = cast(Repositories, {"preferred_model": ClaudeModelId.OPUS_4_6}) +def test_free_user_selects_opus_4_7_honored(): + settings = cast(Repositories, {"preferred_model": ClaudeModelId.OPUS_4_7}) result = get_preferred_model(repo_settings=settings, is_paid=False) - assert result == ClaudeModelId.OPUS_4_6 + assert result == ClaudeModelId.OPUS_4_7 # --- Paid user with each user-selectable model --- -def test_paid_user_selects_opus_4_6(): - settings = cast(Repositories, {"preferred_model": ClaudeModelId.OPUS_4_6}) +def test_paid_user_selects_opus_4_7(): + settings = cast(Repositories, {"preferred_model": ClaudeModelId.OPUS_4_7}) result = get_preferred_model(repo_settings=settings, is_paid=True) - assert result == ClaudeModelId.OPUS_4_6 + assert result == ClaudeModelId.OPUS_4_7 def test_paid_user_selects_sonnet_4_6(): diff --git a/uv.lock b/uv.lock index c5bb6a8a1..59808db6b 100644 --- a/uv.lock +++ b/uv.lock @@ -596,7 +596,7 @@ wheels = [ [[package]] name = "gitauto" -version = "1.6.6" +version = "1.6.10" source = { virtual = "." } dependencies = [ { name = "annotated-doc" },