-
Notifications
You must be signed in to change notification settings - Fork 10
Evaluation: Show cost #746
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| """add cost tracking to evaluation_run | ||
|
|
||
| Revision ID: 050 | ||
| Revises: 049 | ||
| Create Date: 2026-04-09 12:00:00.000000 | ||
|
|
||
| """ | ||
|
|
||
| import sqlalchemy as sa | ||
| from alembic import op | ||
| from sqlalchemy.dialects import postgresql | ||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = "050" | ||
| down_revision = "049" | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
def upgrade() -> None:
    """Add the nullable ``cost`` JSONB column to ``evaluation_run``.

    The column stores per-run cost tracking data (response/embedding token
    counts and USD totals); it is nullable so existing rows need no backfill.
    """
    op.add_column(
        "evaluation_run",
        sa.Column(
            "cost",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
            comment="Cost tracking (response/embedding tokens and USD)",
        ),
    )
|
|
||
|
|
||
def downgrade() -> None:
    """Drop the ``cost`` column added by :func:`upgrade`. Data is lost."""
    op.drop_column("evaluation_run", "cost")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,225 @@ | ||
| """ | ||
| Pricing utilities for evaluation cost tracking. | ||
|
|
||
| This module provides model pricing data and cost calculation functions | ||
| for both response generation and embedding stages of evaluation runs. | ||
|
|
||
| Pricing uses OpenAI Batch API rates (50% cheaper than real-time). | ||
| Source: https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json | ||
| """ | ||
|
|
||
| import logging | ||
| from collections.abc import Callable, Iterable | ||
| from typing import Any | ||
|
|
||
| from app.crud.evaluations.embeddings import EMBEDDING_MODEL | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
# Number of decimals to round USD cost values to.
COST_USD_DECIMALS = 6

# Batch API pricing in USD per token.
# NOTE(review): lookups against this table are exact-key; version-suffixed
# model IDs (e.g. "gpt-4o-2024-08-06") will not match — confirm callers pass
# base model names, or add a normalization step.
MODEL_PRICING: dict[str, dict[str, float]] = {
    # GPT-4o (batch pricing)
    "gpt-4o": {
        "input_cost_per_token": 1.25e-06,
        "output_cost_per_token": 5e-06,
    },
    "gpt-4o-mini": {
        "input_cost_per_token": 7.5e-08,
        "output_cost_per_token": 3e-07,
    },
    # GPT-4.1 (batch pricing)
    "gpt-4.1": {
        "input_cost_per_token": 1e-06,
        "output_cost_per_token": 4e-06,
    },
    # GPT-5 (batch pricing)
    "gpt-5": {
        "input_cost_per_token": 6.25e-07,
        "output_cost_per_token": 5e-06,
    },
    "gpt-5-mini": {
        "input_cost_per_token": 1.25e-07,
        "output_cost_per_token": 1e-06,
    },
    "gpt-5-nano": {
        "input_cost_per_token": 2.5e-08,
        "output_cost_per_token": 2e-07,
    },
    # GPT-5.4 (batch pricing)
    "gpt-5.4": {
        "input_cost_per_token": 1.25e-06,
        "output_cost_per_token": 7.5e-06,
    },
    "gpt-5.4-pro": {
        "input_cost_per_token": 1.5e-05,
        "output_cost_per_token": 9e-05,
    },
    "gpt-5.4-mini": {
        "input_cost_per_token": 3.75e-07,
        "output_cost_per_token": 2.25e-06,
    },
    "gpt-5.4-nano": {
        "input_cost_per_token": 1e-07,
        "output_cost_per_token": 6.25e-07,
    },
    # Embedding models (batch pricing); key is the imported EMBEDDING_MODEL
    # constant's value, not the literal string "EMBEDDING_MODEL".
    EMBEDDING_MODEL: {
        "input_cost_per_token": 6.5e-08,
    },
}
|
|
||
|
|
||
def calculate_token_cost(
    model: str, input_tokens: int, output_tokens: int = 0
) -> float:
    """
    Calculate USD cost for a model call given input and output token counts.

    Used for both response generation (input + output tokens) and embeddings
    (input tokens only — pass output_tokens=0 or omit).

    Args:
        model: OpenAI model name (e.g., "gpt-4o", "text-embedding-3-large")
        input_tokens: Number of input/prompt tokens
        output_tokens: Number of output tokens (default 0 for embeddings)

    Returns:
        Cost in USD. Returns 0.0 if model is unknown.
    """
    pricing = MODEL_PRICING.get(model)
    if not pricing:
        # Unknown models are treated as free rather than raising, so a model
        # missing from MODEL_PRICING never breaks an evaluation run; the
        # warning surfaces the gap in logs. Lookup is exact-key — version
        # suffixes are not normalized (see note on MODEL_PRICING).
        logger.warning(
            f"[calculate_token_cost] Unknown model '{model}', returning cost 0.0"
        )
        return 0.0

    # Missing per-direction rates default to 0 (embedding entries have no
    # output_cost_per_token).
    input_cost = input_tokens * pricing.get("input_cost_per_token", 0)
    output_cost = output_tokens * pricing.get("output_cost_per_token", 0)
    return input_cost + output_cost
|
|
||
|
|
||
| def _sum_usage( | ||
| items: Iterable[dict[str, Any]], | ||
| usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None], | ||
| fields: tuple[str, ...], | ||
| ) -> dict[str, int]: | ||
| """ | ||
| Sum named token fields across items, using a caller-supplied extractor | ||
| to locate the per-item usage dict. | ||
|
|
||
| Args: | ||
| items: Iterable of items to aggregate | ||
| usage_extractor: Function returning the usage dict for an item, or None | ||
| fields: Token field names to sum (e.g., "input_tokens", "total_tokens") | ||
|
|
||
| Returns: | ||
| Mapping of field name to summed value | ||
| """ | ||
| totals: dict[str, int] = {field: 0 for field in fields} | ||
| for item in items: | ||
| usage = usage_extractor(item) | ||
| if not usage: | ||
| continue | ||
| for field in fields: | ||
| totals[field] += usage.get(field, 0) | ||
| return totals | ||
|
|
||
|
|
||
def build_response_cost_entry(
    model: str, results: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Aggregate token usage from parsed evaluation results and calculate cost.

    Args:
        model: OpenAI model name used for response generation
        results: Parsed evaluation results from parse_evaluation_output(),
            each containing a "usage" dict with
            input_tokens/output_tokens/total_tokens

    Returns:
        Response cost entry for the cost JSONB field, with token totals and
        cost_usd rounded to COST_USD_DECIMALS
    """
    totals = _sum_usage(
        items=results,
        usage_extractor=lambda r: r.get("usage"),
        fields=("input_tokens", "output_tokens", "total_tokens"),
    )

    cost_usd = calculate_token_cost(
        model=model,
        input_tokens=totals["input_tokens"],
        output_tokens=totals["output_tokens"],
    )

    return {
        "model": model,
        "input_tokens": totals["input_tokens"],
        "output_tokens": totals["output_tokens"],
        "total_tokens": totals["total_tokens"],
        "cost_usd": round(cost_usd, COST_USD_DECIMALS),
    }
|
|
||
|
|
||
def build_embedding_cost_entry(
    model: str, raw_results: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Aggregate token usage from raw embedding batch results and calculate cost.

    Args:
        model: OpenAI embedding model name
        raw_results: Raw JSONL lines from embedding batch output,
            each containing response.body.usage with prompt_tokens/total_tokens

    Returns:
        Embedding cost entry for the cost JSONB field, with token totals and
        cost_usd rounded to COST_USD_DECIMALS
    """
    totals = _sum_usage(
        items=raw_results,
        # Usage is nested in the batch response envelope; missing levels
        # resolve to an empty dict so the item is simply skipped.
        usage_extractor=lambda r: r.get("response", {}).get("body", {}).get("usage"),
        fields=("prompt_tokens", "total_tokens"),
    )

    # Embeddings have no output tokens; omit output_tokens (defaults to 0).
    cost_usd = calculate_token_cost(model=model, input_tokens=totals["prompt_tokens"])

    return {
        "model": model,
        "prompt_tokens": totals["prompt_tokens"],
        "total_tokens": totals["total_tokens"],
        "cost_usd": round(cost_usd, COST_USD_DECIMALS),
    }
|
|
||
|
|
||
def build_cost_dict(
    response_entry: dict[str, Any] | None = None,
    embedding_entry: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Combine response and embedding cost entries into the final cost JSONB structure.

    Either entry may be None (e.g., an evaluation run with no embedding
    stage); absent entries are omitted from the result and contribute 0.0
    to the total.

    Args:
        response_entry: Response cost entry from build_response_cost_entry()
        embedding_entry: Embedding cost entry from build_embedding_cost_entry()

    Returns:
        Combined cost dict with total_cost_usd (always present, rounded to
        COST_USD_DECIMALS)
    """
    cost: dict[str, Any] = {}

    response_cost = 0.0
    embedding_cost = 0.0

    if response_entry:
        cost["response"] = response_entry
        response_cost = response_entry.get("cost_usd", 0.0)

    if embedding_entry:
        cost["embedding"] = embedding_entry
        embedding_cost = embedding_entry.get("cost_usd", 0.0)

    cost["total_cost_usd"] = round(response_cost + embedding_cost, COST_USD_DECIMALS)

    return cost
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: ProjectTech4DevAI/kaapi-backend
Length of output: 112
Add explicit return type hints to migration functions.
Lines 20 and 32 define
upgrade() and downgrade() without return annotations, which violates the repository's Python typing guideline. ✅ Minimal fix
📝 Committable suggestion
🤖 Prompt for AI Agents