Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions backend/app/alembic/versions/050_add_cost_to_evaluation_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""add cost tracking to evaluation_run

Revision ID: 050
Revises: 049
Create Date: 2026-04-09 12:00:00.000000

"""

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "050"  # this migration's unique ID
down_revision = "049"  # parent revision this migration applies on top of
branch_labels = None  # no named branch labels for this revision
depends_on = None  # no cross-branch dependency


def upgrade() -> None:
    """Add the nullable JSONB ``cost`` column to ``evaluation_run``.

    The column stores per-run cost tracking data (response/embedding token
    counts and USD totals). It is nullable so existing rows need no backfill.
    """
    op.add_column(
        "evaluation_run",
        sa.Column(
            "cost",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
            comment="Cost tracking (response/embedding tokens and USD)",
        ),
    )


def downgrade() -> None:
    """Drop the ``cost`` column added by this revision."""
    op.drop_column("evaluation_run", "cost")
Comment on lines +20 to +33
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify migration functions lacking return annotations
rg -nP '^def\s+(upgrade|downgrade)\s*\(\s*\)\s*:' backend/app/alembic/versions/050_add_cost_to_evaluation_run.py

Repository: ProjectTech4DevAI/kaapi-backend

Length of output: 112


Add explicit return type hints to migration functions.

Lines 20 and 32 define upgrade() and downgrade() without return annotations, which violates the repository's Python typing guideline.

✅ Minimal fix
-def upgrade():
+def upgrade() -> None:
     op.add_column(
         "evaluation_run",
         sa.Column(
             "cost",
             postgresql.JSONB(astext_type=sa.Text()),
             nullable=True,
             comment="Cost tracking (response/embedding tokens and USD)",
         ),
     )

-def downgrade():
+def downgrade() -> None:
     op.drop_column("evaluation_run", "cost")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def upgrade():
op.add_column(
"evaluation_run",
sa.Column(
"cost",
postgresql.JSONB(astext_type=sa.Text()),
nullable=True,
comment="Cost tracking (response/embedding tokens and USD)",
),
)
def downgrade():
op.drop_column("evaluation_run", "cost")
def upgrade() -> None:
op.add_column(
"evaluation_run",
sa.Column(
"cost",
postgresql.JSONB(astext_type=sa.Text()),
nullable=True,
comment="Cost tracking (response/embedding tokens and USD)",
),
)
def downgrade() -> None:
op.drop_column("evaluation_run", "cost")
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/alembic/versions/050_add_cost_to_evaluation_run.py` around lines
20 - 33, The migration functions upgrade and downgrade are missing return type
annotations; update the function signatures for upgrade() and downgrade() to
include explicit return type hints (-> None) to satisfy the repository's typing
guidelines, keeping the existing bodies unchanged and only modifying the
function declarations.

11 changes: 11 additions & 0 deletions backend/app/crud/evaluations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@
update_traces_with_cosine_scores,
upload_dataset_to_langfuse,
)
from app.crud.evaluations.pricing import (
build_cost_dict,
build_embedding_cost_entry,
build_response_cost_entry,
calculate_token_cost,
)
from app.crud.evaluations.processing import (
check_and_process_evaluation,
poll_all_pending_evaluations,
Expand Down Expand Up @@ -74,6 +80,11 @@
"calculate_average_similarity",
"calculate_cosine_similarity",
"start_embedding_batch",
# Pricing
"build_cost_dict",
"build_embedding_cost_entry",
"build_response_cost_entry",
"calculate_token_cost",
# Langfuse
"create_langfuse_dataset_run",
"fetch_trace_scores_from_langfuse",
Expand Down
5 changes: 5 additions & 0 deletions backend/app/crud/evaluations/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ def update_evaluation_run(
object_store_url: str | None = None,
score_trace_url: str | None = None,
score: dict | None = None,
cost: dict | None = None,
embedding_batch_job_id: int | None = None,
) -> EvaluationRun:
"""
Expand All @@ -211,7 +212,9 @@ def update_evaluation_run(
status: New status value (optional)
error_message: New error message (optional)
object_store_url: New object store URL (optional)
score_trace_url: New per-trace score S3 URL (optional)
score: New score dict (optional)
cost: New cost dict (optional)
embedding_batch_job_id: New embedding batch job ID (optional)

Returns:
Expand All @@ -226,6 +229,8 @@ def update_evaluation_run(
eval_run.object_store_url = object_store_url
if score is not None:
eval_run.score = score
if cost is not None:
eval_run.cost = cost
if embedding_batch_job_id is not None:
eval_run.embedding_batch_job_id = embedding_batch_job_id
if score_trace_url is not None:
Expand Down
225 changes: 225 additions & 0 deletions backend/app/crud/evaluations/pricing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
"""
Pricing utilities for evaluation cost tracking.

This module provides model pricing data and cost calculation functions
for both response generation and embedding stages of evaluation runs.

Pricing uses OpenAI Batch API rates (50% cheaper than real-time).
Source: https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
"""

import logging
from collections.abc import Callable, Iterable
from typing import Any

from app.crud.evaluations.embeddings import EMBEDDING_MODEL

logger = logging.getLogger(__name__)

# Number of decimals to round USD cost values to.
COST_USD_DECIMALS = 6

# Batch API pricing in USD per token.
# Keys are OpenAI model names; each value carries "input_cost_per_token"
# and, for chat models, "output_cost_per_token". Embedding models bill
# input tokens only, so their entries omit the output rate.
# NOTE(review): models absent from this table are costed at 0.0 by
# calculate_token_cost — keep this table in sync with the models the
# evaluation pipeline can actually run.
MODEL_PRICING: dict[str, dict[str, float]] = {
    # GPT-4o (batch pricing)
    "gpt-4o": {
        "input_cost_per_token": 1.25e-06,
        "output_cost_per_token": 5e-06,
    },
    "gpt-4o-mini": {
        "input_cost_per_token": 7.5e-08,
        "output_cost_per_token": 3e-07,
    },
    # GPT-4.1 (batch pricing)
    "gpt-4.1": {
        "input_cost_per_token": 1e-06,
        "output_cost_per_token": 4e-06,
    },
    # GPT-5 (batch pricing)
    "gpt-5": {
        "input_cost_per_token": 6.25e-07,
        "output_cost_per_token": 5e-06,
    },
    "gpt-5-mini": {
        "input_cost_per_token": 1.25e-07,
        "output_cost_per_token": 1e-06,
    },
    "gpt-5-nano": {
        "input_cost_per_token": 2.5e-08,
        "output_cost_per_token": 2e-07,
    },
    # GPT-5.4 (batch pricing)
    "gpt-5.4": {
        "input_cost_per_token": 1.25e-06,
        "output_cost_per_token": 7.5e-06,
    },
    "gpt-5.4-pro": {
        "input_cost_per_token": 1.5e-05,
        "output_cost_per_token": 9e-05,
    },
    "gpt-5.4-mini": {
        "input_cost_per_token": 3.75e-07,
        "output_cost_per_token": 2.25e-06,
    },
    "gpt-5.4-nano": {
        "input_cost_per_token": 1e-07,
        "output_cost_per_token": 6.25e-07,
    },
    # Embedding models (batch pricing). The imported EMBEDDING_MODEL
    # constant is used as the dict key here, not a literal string.
    EMBEDDING_MODEL: {
        "input_cost_per_token": 6.5e-08,
    },
}


def _normalize_pricing_model(model: str) -> str:
    """Map a version-suffixed model ID to its base pricing key.

    OpenAI batch results can report dated model IDs (e.g.
    "gpt-4o-2024-08-06") that are not literal keys in MODEL_PRICING.
    Exact matches are returned as-is; otherwise the longest base key that
    the model name extends (base + "-") is used, so "gpt-4o-mini-…"
    resolves to "gpt-4o-mini" rather than "gpt-4o". Unknown names are
    returned unchanged and handled by the caller's 0.0 fallback.
    """
    if model in MODEL_PRICING:
        return model
    # Longest base first so more specific variants win over shorter prefixes.
    for base in sorted(MODEL_PRICING, key=len, reverse=True):
        if model.startswith(base + "-"):
            return base
    return model


def calculate_token_cost(
    model: str, input_tokens: int, output_tokens: int = 0
) -> float:
    """
    Calculate USD cost for a model call given input and output token counts.

    Used for both response generation (input + output tokens) and embeddings
    (input tokens only — pass output_tokens=0 or omit).

    Args:
        model: OpenAI model name (e.g., "gpt-4o", "text-embedding-3-large");
            version-suffixed IDs are normalized to their base pricing key
        input_tokens: Number of input/prompt tokens
        output_tokens: Number of output tokens (default 0 for embeddings)

    Returns:
        Cost in USD. Returns 0.0 if model is unknown.
    """
    pricing = MODEL_PRICING.get(_normalize_pricing_model(model))
    if not pricing:
        logger.warning(
            f"[calculate_token_cost] Unknown model '{model}', returning cost 0.0"
        )
        return 0.0

    # Embedding entries have no output rate; .get(..., 0) keeps them at 0.
    input_cost = input_tokens * pricing.get("input_cost_per_token", 0)
    output_cost = output_tokens * pricing.get("output_cost_per_token", 0)
    return input_cost + output_cost


def _sum_usage(
items: Iterable[dict[str, Any]],
usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None],
fields: tuple[str, ...],
) -> dict[str, int]:
"""
Sum named token fields across items, using a caller-supplied extractor
to locate the per-item usage dict.

Args:
items: Iterable of items to aggregate
usage_extractor: Function returning the usage dict for an item, or None
fields: Token field names to sum (e.g., "input_tokens", "total_tokens")

Returns:
Mapping of field name to summed value
"""
totals: dict[str, int] = {field: 0 for field in fields}
for item in items:
usage = usage_extractor(item)
if not usage:
continue
for field in fields:
totals[field] += usage.get(field, 0)
return totals


def build_response_cost_entry(
    model: str, results: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Aggregate token usage from parsed evaluation results and calculate cost.

    Args:
        model: OpenAI model name used for response generation
        results: Parsed evaluation results from parse_evaluation_output(),
            each containing a "usage" dict with input_tokens/output_tokens/total_tokens

    Returns:
        Response cost entry for the cost JSONB field
    """
    token_fields = ("input_tokens", "output_tokens", "total_tokens")
    usage_totals = _sum_usage(
        items=results,
        usage_extractor=lambda result: result.get("usage"),
        fields=token_fields,
    )

    usd = calculate_token_cost(
        model=model,
        input_tokens=usage_totals["input_tokens"],
        output_tokens=usage_totals["output_tokens"],
    )

    entry: dict[str, Any] = {"model": model}
    for field in token_fields:
        entry[field] = usage_totals[field]
    entry["cost_usd"] = round(usd, COST_USD_DECIMALS)
    return entry


def build_embedding_cost_entry(
    model: str, raw_results: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Aggregate token usage from raw embedding batch results and calculate cost.

    Args:
        model: OpenAI embedding model name
        raw_results: Raw JSONL lines from embedding batch output,
            each containing response.body.usage with prompt_tokens/total_tokens

    Returns:
        Embedding cost entry for the cost JSONB field
    """

    def _extract_usage(line: dict[str, Any]) -> dict[str, Any] | None:
        # Usage lives at response.body.usage inside each raw JSONL record.
        return line.get("response", {}).get("body", {}).get("usage")

    usage_totals = _sum_usage(
        items=raw_results,
        usage_extractor=_extract_usage,
        fields=("prompt_tokens", "total_tokens"),
    )

    # Embeddings bill input tokens only, so output_tokens is omitted.
    usd = calculate_token_cost(model=model, input_tokens=usage_totals["prompt_tokens"])

    return {
        "model": model,
        "prompt_tokens": usage_totals["prompt_tokens"],
        "total_tokens": usage_totals["total_tokens"],
        "cost_usd": round(usd, COST_USD_DECIMALS),
    }


def build_cost_dict(
    response_entry: dict[str, Any] | None = None,
    embedding_entry: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Combine response and embedding cost entries into the final cost JSONB structure.

    Args:
        response_entry: Response cost entry from build_response_cost_entry()
        embedding_entry: Embedding cost entry from build_embedding_cost_entry()

    Returns:
        Combined cost dict with total_cost_usd
    """
    combined: dict[str, Any] = {}
    stage_costs: list[float] = []

    # Include only the stages that were actually provided; missing stages
    # simply contribute nothing to the total.
    for key, entry in (("response", response_entry), ("embedding", embedding_entry)):
        if entry:
            combined[key] = entry
            stage_costs.append(entry.get("cost_usd", 0.0))

    # Float start value keeps total_cost_usd a float even with no stages.
    combined["total_cost_usd"] = round(sum(stage_costs, 0.0), COST_USD_DECIMALS)
    return combined
Loading
Loading