From b4bfffe7bd96f6fa4b14b5f266286763172344f7 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 12:49:35 +0400
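Subject: [PATCH 01/18] feat: rank retrieval seeds by evidence tier

Replace the inline seed score with an explicit breakdown (exact label
match, label tokens, source path at 0.25x, community label at 0.1x
capped at 0.2) and sort by evidence tier before score and degree, so a
label match always outranks a path-only or community-only match.
Add unit tests that lock this ordering.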

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/runtime/retrieve.ts     |  96 +++++++++++++++++++------
 tests/unit/retrieve.test.ts | 137 ++++++++++++++++++++++++++++++++++++
 2 files changed, 210 insertions(+), 23 deletions(-)

diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts
index 194089a..ca0083d 100644
--- a/src/runtime/retrieve.ts
+++ b/src/runtime/retrieve.ts
@@ -190,6 +190,14 @@ function storedCommunityLabelsFromGraph(graph: KnowledgeGraph): Record<number, s
   )
 }
 
+interface SeedScoreBreakdown {
+  labelExactScore: number
+  labelTokenScore: number
+  sourcePathScore: number
+  communityScore: number
+  total: number
+}
+
 interface ScoredNode {
   id: string
   label: string
@@ -198,10 +206,57 @@ interface ScoredNode {
   nodeKind: string
   fileType: string
   community: number | null
+  evidenceTier: 0 | 1 | 2
   score: number
   relevanceBand: 'direct' | 'related' | 'peripheral'
 }
 
+function normalizeSeedText(value: string): string {
+  return tokenizeLabel(value).join('')
+}
+
+function evidenceTierForSeedScore(score: SeedScoreBreakdown): 0 | 1 | 2 {
+  if (score.labelExactScore > 0 || score.labelTokenScore > 0) {
+    return 2
+  }
+  if (score.sourcePathScore > 0 || score.communityScore > 0) {
+    return 1
+  }
+  return 0
+}
+
+function compareScoredNodes(graph: KnowledgeGraph, left: ScoredNode, right: ScoredNode): number {
+  return (
+    right.evidenceTier - left.evidenceTier ||
+    right.score - left.score ||
+    graph.degree(right.id) - graph.degree(left.id)
+  )
+}
+
+function scoreSeedCandidate(
+  question: string,
+  questionTokens: readonly string[],
+  label: string,
+  sourceFile: string,
+  communityLabel: string | null,
+  tokenWeights: ReadonlyMap<string, number>,
+): SeedScoreBreakdown {
+  const labelExactScore = normalizeSeedText(question) !== '' && normalizeSeedText(question) === normalizeSeedText(label) ? 2 : 0
+  const labelTokenScore = scoreNode(questionTokens, tokenizeLabel(label), tokenWeights)
+  const sourcePathScore = scoreNode(questionTokens, tokenizeLabel(sourceFile), tokenWeights) * 0.25
+  const communityScore = communityLabel
+    ? Math.min(scoreNode(questionTokens, tokenizeLabel(communityLabel)) * 0.1, 0.2)
+    : 0
+
+  return {
+    labelExactScore,
+    labelTokenScore,
+    sourcePathScore,
+    communityScore,
+    total: labelExactScore + labelTokenScore + sourcePathScore + communityScore,
+  }
+}
+
 export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions): RetrieveResult {
   const { question, budget } = options
   const questionTokens = tokenizeQuestion(question)
@@ -217,23 +272,14 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
     }
   }
 
-  // Pre-compute community label scores so nodes in matching communities get a boost
+  // Pre-compute community labels so seed scoring can treat them as secondary evidence.
   const communities = communitiesFromGraph(graph)
   const communityLabels: Record<number, string> = {
     ...buildCommunityLabels(graph, communities),
     ...storedCommunityLabelsFromGraph(graph),
   }
-  const communityBoost = new Map<number, number>()
-  for (const [idStr, label] of Object.entries(communityLabels)) {
-    const id = Number(idStr)
-    const communityTokens = tokenizeLabel(label)
-    const score = scoreNode(questionTokens, communityTokens)
-    if (score > 0) {
-      communityBoost.set(id, Math.min(score * 0.1, 0.3))
-    }
-  }
 
-  // Step 1+2: Score all nodes with TF-IDF-weighted tokens + community boost
+  // Step 1+2: Score all nodes with explicit seed evidence weights.
   const tokenWeights = tokenWeightsForQuestion(graph, questionTokens)
   const scored: ScoredNode[] = []
   for (const [id, attributes] of graph.nodeEntries()) {
@@ -248,15 +294,17 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
     }
 
     const label = String(attributes.label ?? '')
-    const labelTokens = tokenizeLabel(label)
     const sourceFile = String(attributes.source_file ?? '')
-    const sourceTokens = tokenizeLabel(sourceFile)
-    const labelScore = scoreNode(questionTokens, labelTokens, tokenWeights)
-    const sourceScore = scoreNode(questionTokens, sourceTokens, tokenWeights) * 0.5
-    const comBoost = community !== null ? (communityBoost.get(community) ?? 0) : 0
-    const totalScore = labelScore + sourceScore + comBoost
-
-    if (totalScore > 0) {
+    const score = scoreSeedCandidate(
+      question,
+      questionTokens,
+      label,
+      sourceFile,
+      community !== null ? (communityLabels[community] ?? null) : null,
+      tokenWeights,
+    )
+
+    if (score.total > 0) {
       scored.push({
         id,
         label,
@@ -265,13 +313,14 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
         nodeKind: String(attributes.node_kind ?? ''),
         fileType,
         community,
-        score: totalScore,
-        relevanceBand: labelScore + sourceScore > 0 ? 'direct' : 'related',
+        evidenceTier: evidenceTierForSeedScore(score),
+        score: score.total,
+        relevanceBand: score.labelExactScore > 0 || score.labelTokenScore > 0 ? 'direct' : 'related',
       })
     }
   }
 
-  scored.sort((a, b) => b.score - a.score || graph.degree(b.id) - graph.degree(a.id))
+  scored.sort((a, b) => compareScoredNodes(graph, a, b))
 
   // Step 3: Multi-hop expansion — take top seeds, expand 2 hops with decaying scores
   const seedCount = Math.min(scored.length, 10)
@@ -333,6 +382,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
       nodeKind: String(attributes.node_kind ?? ''),
       fileType,
       community,
+      evidenceTier: 0,
       score: hopScore,
       relevanceBand: hopDistances.get(nodeId) === 1 ? 'related' : 'peripheral',
     })
@@ -352,7 +402,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
   }
 
   // Re-sort: seeds first by score, then neighbors by degree
-  scored.sort((a, b) => b.score - a.score || graph.degree(b.id) - graph.degree(a.id))
+  scored.sort((a, b) => compareScoredNodes(graph, a, b))
 
   // Step 4+5: Read snippets and assemble within budget
   const matchedNodes: RetrieveMatchedNode[] = []
diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts
index ae2f612..6310104 100644
--- a/tests/unit/retrieve.test.ts
+++ b/tests/unit/retrieve.test.ts
@@ -138,6 +138,143 @@ describe('retrieve', () => {
       expect(labels).toContain('authenticateUser')
     })
 
+    it('keeps direct symbol matches above path-only matches after structural boosts', () => {
+      const graph = new KnowledgeGraph()
+      graph.addNode('direct_symbol', {
+        label: 'LoginController',
+        source_file: '/src/controllers.ts',
+        line_number: 1,
+        node_kind: 'function',
+        file_type: 'code',
+        community: 0,
+      })
+      graph.addNode('path_only', {
+        label: 'RenderPage',
+        source_file: '/src/login/handler.ts',
+        line_number: 2,
+        node_kind: 'function',
+        file_type: 'code',
+        community: 0,
+      })
+      graph.addNode('guide_a', {
+        label: 'LoginHandlerGuideA',
+        source_file: '/docs/login-a.md',
+        line_number: 3,
+        node_kind: 'section',
+        file_type: 'document',
+        community: 0,
+      })
+      graph.addNode('guide_b', {
+        label: 'LoginHandlerGuideB',
+        source_file: '/docs/login-b.md',
+        line_number: 4,
+        node_kind: 'section',
+        file_type: 'document',
+        community: 0,
+      })
+      graph.addNode('guide_c', {
+        label: 'LoginHandlerGuideC',
+        source_file: '/docs/login-c.md',
+        line_number: 5,
+        node_kind: 'section',
+        file_type: 'document',
+        community: 0,
+      })
+      graph.addNode('guide_d', {
+        label: 'LoginHandlerGuideD',
+        source_file: '/docs/login-d.md',
+        line_number: 6,
+        node_kind: 'section',
+        file_type: 'document',
+        community: 1,
+      })
+      graph.addEdge('path_only', 'guide_a', {
+        relation: 'calls',
+        confidence: 'EXTRACTED',
+        source_file: '/src/login/handler.ts',
+      })
+      graph.addEdge('path_only', 'guide_d', {
+        relation: 'calls',
+        confidence: 'EXTRACTED',
+        source_file: '/src/login/handler.ts',
+      })
+
+      const result = retrieveContext(graph, { question: 'login', budget: 5000, fileType: 'code' })
+
+      expect(result.matched_nodes.map((node) => node.label).slice(0, 2)).toEqual(['LoginController', 'RenderPage'])
+      expect(result.matched_nodes.find((node) => node.label === 'LoginController')?.relevance_band).toBe('direct')
+      expect(result.matched_nodes.find((node) => node.label === 'RenderPage')?.relevance_band).toBe('related')
+    })
+
+    it('keeps direct symbol matches above community-only matches after structural boosts', () => {
+      const graph = new KnowledgeGraph()
+      graph.addNode('direct_symbol', {
+        label: 'AuthGateway',
+        source_file: '/src/auth.ts',
+        line_number: 1,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 0,
+      })
+      graph.addNode('community_only', {
+        label: 'SessionCoordinator',
+        source_file: '/src/session.ts',
+        line_number: 2,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 0,
+      })
+      graph.addNode('guide_a', {
+        label: 'AuthGuideA',
+        source_file: '/docs/auth-a.md',
+        line_number: 3,
+        node_kind: 'section',
+        file_type: 'document',
+        community: 0,
+      })
+      graph.addNode('guide_b', {
+        label: 'AuthGuideB',
+        source_file: '/docs/auth-b.md',
+        line_number: 4,
+        node_kind: 'section',
+        file_type: 'document',
+        community: 0,
+      })
+      graph.addNode('guide_c', {
+        label: 'AuthGuideC',
+        source_file: '/docs/auth-c.md',
+        line_number: 5,
+        node_kind: 'section',
+        file_type: 'document',
+        community: 0,
+      })
+      graph.addNode('guide_d', {
+        label: 'AuthGuideD',
+        source_file: '/docs/auth-d.md',
+        line_number: 6,
+        node_kind: 'section',
+        file_type: 'document',
+        community: 1,
+      })
+      graph.addEdge('community_only', 'guide_a', {
+        relation: 'depends_on',
+        confidence: 'EXTRACTED',
+        source_file: '/src/session.ts',
+      })
+      graph.addEdge('community_only', 'guide_d', {
+        relation: 'depends_on',
+        confidence: 'EXTRACTED',
+        source_file: '/src/session.ts',
+      })
+      graph.graph.community_labels = { 0: 'Auth' }
+
+      const result = retrieveContext(graph, { question: 'auth', budget: 5000, fileType: 'code' })
+
+      expect(result.matched_nodes.map((node) => node.label).slice(0, 2)).toEqual(['AuthGateway', 'SessionCoordinator'])
+      expect(result.matched_nodes.find((node) => node.label === 'AuthGateway')?.relevance_band).toBe('direct')
+      expect(result.matched_nodes.find((node) => node.label === 'SessionCoordinator')?.relevance_band).toBe('related')
+    })
+
     it('includes neighbors of matched nodes', () => {
       const graph = buildTestGraph()
       const result = retrieveContext(graph, { question: 'auth', budget: 5000 })

From bdcc7518973ea4bc9cf3bd0494765541ed1bac3b Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 13:34:07 +0400
Subject: [PATCH 02/18] feat: make retrieval expansion relation-aware

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/runtime/retrieve.ts     |  76 +++++++++++-----
 tests/unit/retrieve.test.ts | 169 ++++++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+), 20 deletions(-)

diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts
index ca0083d..51cdc79 100644
--- a/src/runtime/retrieve.ts
+++ b/src/runtime/retrieve.ts
@@ -257,6 +257,20 @@ function scoreSeedCandidate(
   }
 }
 
+function relationWeight(relation: string): number {
+  switch (relation) {
+    case 'calls':
+    case 'imports_from':
+    case 'defines':
+      return 1
+    case 'uses':
+    case 'depends_on':
+      return 0.7
+    default:
+      return 0.35
+  }
+}
+
 export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions): RetrieveResult {
   const { question, budget } = options
   const questionTokens = tokenizeQuestion(question)
@@ -325,36 +339,58 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
   // Step 3: Multi-hop expansion — take top seeds, expand 2 hops with decaying scores
   const seedCount = Math.min(scored.length, 10)
   const seedIds = new Set(scored.slice(0, seedCount).map((node) => node.id))
-  const directSeedIds = scored
+  const directSeeds = scored
     .filter((node) => node.relevanceBand === 'direct')
-    .slice(0, seedCount)
-    .map((node) => node.id)
-  const expansionSeedIds = new Set(directSeedIds.length > 0 ? directSeedIds : [...seedIds])
+    .slice(0, 4)
+  const expansionSeedIds = new Set((directSeeds.length > 0 ? directSeeds : scored.slice(0, seedCount)).map((node) => node.id))
   const hopScores = new Map<string, number>()
   const hopDistances = new Map<string, 1 | 2>()
+  const hopEvidenceTiers = new Map<string, 0 | 1>()
+  const hop1Ids = new Set<string>()
 
-  // Hop 1: direct neighbors get 0.5x of best seed score
-  const bestSeedScore = scored.length > 0 ? scored[0]?.score ?? 0 : 0
-  for (const seedId of expansionSeedIds) {
-    for (const neighborId of graph.neighbors(seedId)) {
+  // Hop 1: direct neighbors inherit a relation-weighted slice of each strong seed's score.
+  for (const seed of directSeeds.length > 0 ? directSeeds : scored.slice(0, seedCount)) {
+    for (const neighborId of graph.neighbors(seed.id)) {
       if (!expansionSeedIds.has(neighborId)) {
-        hopScores.set(neighborId, Math.max(hopScores.get(neighborId) ?? 0, bestSeedScore * 0.5))
-        hopDistances.set(neighborId, 1)
+        const relation = String(graph.edgeAttributes(seed.id, neighborId).relation ?? 'related_to')
+        const hopScore = seed.score * 0.5 * relationWeight(relation)
+        const hopEvidenceTier = relationWeight(relation) === 1 ? 1 : 0
+        const existingHopScore = hopScores.get(neighborId) ?? 0
+        const existingHopEvidenceTier = hopEvidenceTiers.get(neighborId) ?? 0
+        if (hopScore > existingHopScore || (hopScore === existingHopScore && hopEvidenceTier > existingHopEvidenceTier)) {
+          hopScores.set(neighborId, hopScore)
+          hopDistances.set(neighborId, 1)
+          hopEvidenceTiers.set(neighborId, hopEvidenceTier)
+        }
+        hop1Ids.add(neighborId)
       }
     }
   }
 
-  // Hop 2: neighbors-of-neighbors get 0.25x (skip if budget is tight)
+  // Hop 2: neighbors-of-neighbors decay again, but keep this pool small and relation-aware.
   if (budget >= 2000) {
-    const hop1Ids = new Set(hopScores.keys())
+    const hop2Scores = new Map<string, number>()
     for (const hop1Id of hop1Ids) {
+      const hop1Score = hopScores.get(hop1Id) ?? 0
+      if (hop1Score <= 0) continue
       for (const hop2Id of graph.neighbors(hop1Id)) {
         if (!seedIds.has(hop2Id) && !hop1Ids.has(hop2Id)) {
-          hopScores.set(hop2Id, Math.max(hopScores.get(hop2Id) ?? 0, bestSeedScore * 0.25))
-          hopDistances.set(hop2Id, 2)
+          const relation = String(graph.edgeAttributes(hop1Id, hop2Id).relation ?? 'related_to')
+          const hop2Score = hop1Score * 0.5 * relationWeight(relation)
+          if (hop2Score > (hop2Scores.get(hop2Id) ?? 0)) {
+            hop2Scores.set(hop2Id, hop2Score)
+          }
         }
       }
     }
+
+    const maxSecondHopAdds = budget >= 5000 ? 6 : 3
+    for (const [hop2Id, hop2Score] of [...hop2Scores.entries()]
+      .sort(([leftId, leftScore], [rightId, rightScore]) => rightScore - leftScore || graph.degree(rightId) - graph.degree(leftId))
+      .slice(0, maxSecondHopAdds)) {
+      hopScores.set(hop2Id, Math.max(hopScores.get(hop2Id) ?? 0, hop2Score))
+      hopDistances.set(hop2Id, 2)
+    }
   }
 
   // Add expanded nodes not already scored
@@ -382,7 +418,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
       nodeKind: String(attributes.node_kind ?? ''),
       fileType,
       community,
-      evidenceTier: 0,
+      evidenceTier: hopDistances.get(nodeId) === 1 ? (hopEvidenceTiers.get(nodeId) ?? 0) : 0,
       score: hopScore,
       relevanceBand: hopDistances.get(nodeId) === 1 ? 'related' : 'peripheral',
     })
@@ -408,12 +444,12 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
   const matchedNodes: RetrieveMatchedNode[] = []
   const includedIds = new Set<string>()
   let tokenCount = 0
+  const inclusionOrder = [
+    ...scored.filter((node) => (seedIds.has(node.id) || hopScores.has(node.id)) && node.relevanceBand !== 'peripheral'),
+    ...scored.filter((node) => (seedIds.has(node.id) || hopScores.has(node.id)) && node.relevanceBand === 'peripheral'),
+  ]
 
-  for (const node of scored) {
-    if (!seedIds.has(node.id) && !hopScores.has(node.id)) {
-      continue
-    }
-
+  for (const node of inclusionOrder) {
     const snippet = readSnippet(node.sourceFile, node.lineNumber)
     const nodeText = `${node.label} ${node.sourceFile}:${node.lineNumber} ${snippet ?? ''}`
     const nodeTokens = estimateTokens(nodeText)
diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts
index 6310104..146a7e0 100644
--- a/tests/unit/retrieve.test.ts
+++ b/tests/unit/retrieve.test.ts
@@ -119,6 +119,153 @@ describe('retrieve', () => {
       return graph
     }
 
+    function buildExpansionGraph(): KnowledgeGraph {
+      const graph = new KnowledgeGraph()
+
+      graph.addNode('auth_user', {
+        label: 'authenticateUser',
+        source_file: '/src/auth.ts',
+        line_number: 10,
+        node_kind: 'function',
+        file_type: 'code',
+        community: 0,
+      })
+      graph.addNode('auth_flow_controller', {
+        label: 'AuthFlowController',
+        source_file: '/src/auth/flow-controller.ts',
+        line_number: 20,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 0,
+      })
+      graph.addNode('auth_guard', {
+        label: 'AuthGuard',
+        source_file: '/src/auth/guard.ts',
+        line_number: 30,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 0,
+      })
+      graph.addNode('auth_policy', {
+        label: 'AuthPolicy',
+        source_file: '/src/auth/policy.ts',
+        line_number: 40,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 0,
+      })
+
+      graph.addNode('session_mgr', {
+        label: 'SessionManager',
+        source_file: '/src/session.ts',
+        line_number: 5,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 2,
+      })
+      graph.addNode('session_validator', {
+        label: 'SessionValidator',
+        source_file: '/src/session-validator.ts',
+        line_number: 6,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 2,
+      })
+      graph.addNode('session_router', {
+        label: 'SessionRouter',
+        source_file: '/src/session-router.ts',
+        line_number: 7,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 2,
+      })
+      graph.addNode('session_policy', {
+        label: 'SessionPolicy',
+        source_file: '/src/session-policy.ts',
+        line_number: 8,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 2,
+      })
+
+      graph.addNode('billing_store', {
+        label: 'BillingStore',
+        source_file: '/src/billing.ts',
+        line_number: 9,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 1,
+      })
+      graph.addNode('billing_cache', {
+        label: 'BillingCache',
+        source_file: '/src/billing-cache.ts',
+        line_number: 10,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 1,
+      })
+      graph.addNode('invoice_ledger', {
+        label: 'InvoiceLedger',
+        source_file: '/src/invoice-ledger.ts',
+        line_number: 11,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 1,
+      })
+      graph.addNode('tax_rules', {
+        label: 'TaxRules',
+        source_file: '/src/tax-rules.ts',
+        line_number: 12,
+        node_kind: 'class',
+        file_type: 'code',
+        community: 1,
+      })
+
+      graph.addEdge('auth_user', 'session_mgr', { relation: 'calls', confidence: 'EXTRACTED', source_file: '/src/auth.ts' })
+      graph.addEdge('auth_flow_controller', 'session_validator', {
+        relation: 'imports_from',
+        confidence: 'EXTRACTED',
+        source_file: '/src/auth/flow-controller.ts',
+      })
+      graph.addEdge('auth_guard', 'session_router', {
+        relation: 'calls',
+        confidence: 'EXTRACTED',
+        source_file: '/src/auth/guard.ts',
+      })
+      graph.addEdge('auth_policy', 'session_policy', {
+        relation: 'defines',
+        confidence: 'EXTRACTED',
+        source_file: '/src/auth/policy.ts',
+      })
+      graph.addEdge('auth_guard', 'billing_store', {
+        relation: 'depends_on',
+        confidence: 'EXTRACTED',
+        source_file: '/src/auth/guard.ts',
+      })
+      graph.addEdge('billing_store', 'billing_cache', {
+        relation: 'depends_on',
+        confidence: 'EXTRACTED',
+        source_file: '/src/billing.ts',
+      })
+      graph.addEdge('billing_store', 'invoice_ledger', {
+        relation: 'uses',
+        confidence: 'EXTRACTED',
+        source_file: '/src/billing.ts',
+      })
+      graph.addEdge('billing_store', 'tax_rules', {
+        relation: 'uses',
+        confidence: 'EXTRACTED',
+        source_file: '/src/billing.ts',
+      })
+      graph.graph.community_labels = {
+        0: 'Authentication',
+        1: 'Billing',
+        2: 'Session',
+      }
+
+      return graph
+    }
+
     it('returns empty result for no matching tokens', () => {
       const graph = buildTestGraph()
       const result = retrieveContext(graph, { question: 'how does the', budget: 5000 })
@@ -304,6 +451,28 @@ describe('retrieve', () => {
       expect(community0).toBeDefined()
     })
 
+    it('prefers calls and imports edges over generic second-hop expansion', () => {
+      const graph = buildExpansionGraph()
+
+      const result = retrieveContext(graph, { question: 'auth', budget: 5000 })
+      const labels = result.matched_nodes.map((node) => node.label)
+
+      expect(labels.indexOf('SessionManager')).toBeLessThan(labels.indexOf('BillingStore'))
+    })
+
+    it('avoids promoting weak peripheral nodes when budget is tight', () => {
+      const graph = buildExpansionGraph()
+
+      const result = retrieveContext(graph, { question: 'auth flow', budget: 80 })
+
+      expect(result.matched_nodes.map((node) => node.label)).toEqual(
+        expect.arrayContaining(['authenticateUser']),
+      )
+      expect(result.matched_nodes).not.toEqual(
+        expect.arrayContaining([expect.objectContaining({ label: 'BillingStore' })]),
+      )
+    })
+
     it('respects community filter', () => {
       const graph = buildTestGraph()
       const result = retrieveContext(graph, { question: 'database', budget: 5000, community: 1 })

From 286db3902e288e0b6ce125c053871b8f99a80177 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 13:44:31 +0400
Subject: [PATCH 03/18] test: assert real second-hop retrieval nodes

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/unit/retrieve.test.ts | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts
index 146a7e0..50542e1 100644
--- a/tests/unit/retrieve.test.ts
+++ b/tests/unit/retrieve.test.ts
@@ -457,20 +457,28 @@ describe('retrieve', () => {
       const result = retrieveContext(graph, { question: 'auth', budget: 5000 })
       const labels = result.matched_nodes.map((node) => node.label)
 
-      expect(labels.indexOf('SessionManager')).toBeLessThan(labels.indexOf('BillingStore'))
+      expect(labels).toContain('SessionValidator')
+      expect(labels).toContain('SessionRouter')
+      expect(labels).toContain('SessionManager')
+      expect(labels).toContain('BillingCache')
+      expect(labels).toContain('InvoiceLedger')
+      expect(labels).toContain('TaxRules')
+      expect(labels.indexOf('SessionValidator')).toBeLessThan(labels.indexOf('BillingCache'))
+      expect(labels.indexOf('SessionRouter')).toBeLessThan(labels.indexOf('InvoiceLedger'))
+      expect(labels.indexOf('SessionManager')).toBeLessThan(labels.indexOf('TaxRules'))
+      expect(result.matched_nodes.find((node) => node.label === 'BillingCache')?.relevance_band).toBe('peripheral')
     })
 
     it('avoids promoting weak peripheral nodes when budget is tight', () => {
       const graph = buildExpansionGraph()
 
       const result = retrieveContext(graph, { question: 'auth flow', budget: 80 })
+      const labels = result.matched_nodes.map((node) => node.label)
 
-      expect(result.matched_nodes.map((node) => node.label)).toEqual(
-        expect.arrayContaining(['authenticateUser']),
-      )
-      expect(result.matched_nodes).not.toEqual(
-        expect.arrayContaining([expect.objectContaining({ label: 'BillingStore' })]),
-      )
+      expect(labels).toEqual(expect.arrayContaining(['authenticateUser']))
+      expect(labels).not.toContain('BillingCache')
+      expect(labels).not.toContain('InvoiceLedger')
+      expect(labels).not.toContain('TaxRules')
     })
 
     it('respects community filter', () => {

From 2672fb14766259aa4c9e67ecd53e1400a21dc6a3 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 14:00:44 +0400
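Subject: [PATCH 04/18] feat: boost connected seeds and add retrieval quality guardrails

Scored nodes that are also hop-1 neighbors of an expansion seed now add
their hop score; one with a source-path match reached over a primary
relation (calls, imports_from, defines, contains) is promoted to the
direct band with a +0.5 bonus. `contains` edges weigh 1.2, and
second-hop expansion is skipped when any scored node matches the
question label exactly. Add gold questions and MRR/recall tests as
guardrails.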

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/infrastructure/benchmark/quality.ts | 12 ++++++++
 src/runtime/retrieve.ts                 | 40 +++++++++++++++++++++++--
 tests/unit/benchmark-quality.test.ts    | 23 ++++++++++++++
 3 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/src/infrastructure/benchmark/quality.ts b/src/infrastructure/benchmark/quality.ts
index eb5d4cd..680e4b2 100644
--- a/src/infrastructure/benchmark/quality.ts
+++ b/src/infrastructure/benchmark/quality.ts
@@ -55,6 +55,18 @@ export const GOLD_QUESTIONS: GoldQuestion[] = [
     question: 'how does the retrieve MCP tool find relevant nodes',
     expected_labels: ['retrievecontext', 'scorenode'],
   },
+  {
+    question: 'retrieveContext',
+    expected_labels: ['retrievecontext'],
+  },
+  {
+    question: 'how does retrieveContext build community labels',
+    expected_labels: ['retrievecontext', 'buildcommunitylabels'],
+  },
+  {
+    question: 'scoreNode',
+    expected_labels: ['scorenode'],
+  },
   {
     question: 'how does javascript extraction work',
     expected_labels: ['extractjs', 'extractionnode'],
diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts
index 51cdc79..64458fd 100644
--- a/src/runtime/retrieve.ts
+++ b/src/runtime/retrieve.ts
@@ -206,6 +206,8 @@ interface ScoredNode {
   nodeKind: string
   fileType: string
   community: number | null
+  exactLabelMatch: boolean
+  sourcePathMatch: boolean
   evidenceTier: 0 | 1 | 2
   score: number
   relevanceBand: 'direct' | 'related' | 'peripheral'
@@ -263,6 +265,8 @@ function relationWeight(relation: string): number {
     case 'imports_from':
     case 'defines':
       return 1
+    case 'contains':
+      return 1.2
     case 'uses':
     case 'depends_on':
       return 0.7
@@ -271,6 +275,10 @@ function relationWeight(relation: string): number {
   }
 }
 
+function isPrimaryExpansionRelation(relation: string): boolean {
+  return relation === 'calls' || relation === 'imports_from' || relation === 'defines' || relation === 'contains'
+}
+
 export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions): RetrieveResult {
   const { question, budget } = options
   const questionTokens = tokenizeQuestion(question)
@@ -327,6 +335,8 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
         nodeKind: String(attributes.node_kind ?? ''),
         fileType,
         community,
+        exactLabelMatch: score.labelExactScore > 0,
+        sourcePathMatch: score.sourcePathScore > 0,
         evidenceTier: evidenceTierForSeedScore(score),
         score: score.total,
         relevanceBand: score.labelExactScore > 0 || score.labelTokenScore > 0 ? 'direct' : 'related',
@@ -338,6 +348,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
 
   // Step 3: Multi-hop expansion — take top seeds, expand 2 hops with decaying scores
   const seedCount = Math.min(scored.length, 10)
+  const hasExactSeedMatch = scored.some((node) => node.exactLabelMatch)
   const seedIds = new Set(scored.slice(0, seedCount).map((node) => node.id))
   const directSeeds = scored
     .filter((node) => node.relevanceBand === 'direct')
@@ -354,7 +365,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
       if (!expansionSeedIds.has(neighborId)) {
         const relation = String(graph.edgeAttributes(seed.id, neighborId).relation ?? 'related_to')
         const hopScore = seed.score * 0.5 * relationWeight(relation)
-        const hopEvidenceTier = relationWeight(relation) === 1 ? 1 : 0
+        const hopEvidenceTier = isPrimaryExpansionRelation(relation) ? 1 : 0
         const existingHopScore = hopScores.get(neighborId) ?? 0
         const existingHopEvidenceTier = hopEvidenceTiers.get(neighborId) ?? 0
         if (hopScore > existingHopScore || (hopScore === existingHopScore && hopEvidenceTier > existingHopEvidenceTier)) {
@@ -367,8 +378,31 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
     }
   }
 
+  for (const node of scored) {
+    const hopScore = hopScores.get(node.id)
+    if (!hopScore) {
+      continue
+    }
+
+    node.score += hopScore
+    const hopEvidenceTier = hopEvidenceTiers.get(node.id) ?? 0
+    if (node.sourcePathMatch && hopEvidenceTier > 0) {
+      node.evidenceTier = 2
+      node.relevanceBand = 'direct'
+      node.score += 0.5
+      continue
+    }
+
+    if (hopEvidenceTier > node.evidenceTier) {
+      node.evidenceTier = hopEvidenceTier
+      if (node.relevanceBand === 'peripheral') {
+        node.relevanceBand = 'related'
+      }
+    }
+  }
+
   // Hop 2: neighbors-of-neighbors decay again, but keep this pool small and relation-aware.
-  if (budget >= 2000) {
+  if (budget >= 2000 && !hasExactSeedMatch) {
     const hop2Scores = new Map<string, number>()
     for (const hop1Id of hop1Ids) {
       const hop1Score = hopScores.get(hop1Id) ?? 0
@@ -418,6 +452,8 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
       nodeKind: String(attributes.node_kind ?? ''),
       fileType,
       community,
+      exactLabelMatch: false,
+      sourcePathMatch: false,
       evidenceTier: hopDistances.get(nodeId) === 1 ? (hopEvidenceTiers.get(nodeId) ?? 0) : 0,
       score: hopScore,
       relevanceBand: hopDistances.get(nodeId) === 1 ? 'related' : 'peripheral',
diff --git a/tests/unit/benchmark-quality.test.ts b/tests/unit/benchmark-quality.test.ts
index e6e34ad..258bcf3 100644
--- a/tests/unit/benchmark-quality.test.ts
+++ b/tests/unit/benchmark-quality.test.ts
@@ -63,6 +63,29 @@ describe('retrieval quality benchmark', () => {
     expect(report.questions[0]!.missing_labels).toEqual([])
   })
 
+  it('raises reciprocal rank when the expected direct node appears before supporting context', () => {
+    const graph = buildTestGraph()
+    const questions: GoldQuestion[] = [
+      { question: 'how does authentication work', expected_labels: ['loginhandler'] },
+    ]
+
+    const report = evaluateRetrievalQuality(graph, questions, 3000)
+
+    expect(report.mrr).toBe(1)
+  })
+
+  it('keeps recall while reducing unnecessary returned labels for narrow symbol queries', () => {
+    const graph = buildTestGraph()
+    const report = evaluateRetrievalQuality(
+      graph,
+      [{ question: 'login handler', expected_labels: ['loginhandler'] }],
+      3000,
+    )
+
+    expect(report.questions[0]?.recall).toBe(1)
+    expect(report.questions[0]?.returned_labels.length).toBeLessThanOrEqual(3)
+  })
+
   it('reports zero recall when no expected labels match', () => {
     const graph = buildTestGraph()
     const questions: GoldQuestion[] = [{ question: 'quantum entanglement physics', expected_labels: ['quantumprocessor'] }]

From 0ba24d0ab86fe16adf3961d6523d5b547fe939fa Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 14:20:44 +0400
Subject: [PATCH 05/18] feat: improve retrieval ranking quality

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d83967..ccc05a9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to the TypeScript package will be documented in this file.
 
 ## [Unreleased]
 
+### Improved
+
+- **Retrieval quality**: improved retrieval ranking with relation-aware expansion so connected evidence surfaces more effectively, and strengthened recall/MRR eval guardrails to prevent misleading benchmark results
+
 ## [0.8.7] - 2026-04-27
 
 ### Changed

From 8b7492bdac97178c7097c9ea6cfbb0a5a7699bc2 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 14:42:45 +0400
Subject: [PATCH 06/18] fix: include predecessors in directed retrieval expansion

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/runtime/retrieve.ts     | 20 ++++++++++++++++----
 tests/unit/retrieve.test.ts | 30 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts
index 64458fd..ff42184 100644
--- a/src/runtime/retrieve.ts
+++ b/src/runtime/retrieve.ts
@@ -275,6 +275,18 @@ function relationWeight(relation: string): number {
   }
 }
 
+function relationBetweenNodes(graph: KnowledgeGraph, source: string, target: string): string {
+  try {
+    return String(graph.edgeAttributes(source, target).relation ?? 'related_to')
+  } catch {
+    try {
+      return String(graph.edgeAttributes(target, source).relation ?? 'related_to')
+    } catch {
+      return 'related_to'
+    }
+  }
+}
+
 function isPrimaryExpansionRelation(relation: string): boolean {
   return relation === 'calls' || relation === 'imports_from' || relation === 'defines' || relation === 'contains'
 }
@@ -361,9 +373,9 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
 
   // Hop 1: direct neighbors inherit a relation-weighted slice of each strong seed's score.
   for (const seed of directSeeds.length > 0 ? directSeeds : scored.slice(0, seedCount)) {
-    for (const neighborId of graph.neighbors(seed.id)) {
+    for (const neighborId of graph.incidentNeighbors(seed.id)) {
       if (!expansionSeedIds.has(neighborId)) {
-        const relation = String(graph.edgeAttributes(seed.id, neighborId).relation ?? 'related_to')
+        const relation = relationBetweenNodes(graph, seed.id, neighborId)
         const hopScore = seed.score * 0.5 * relationWeight(relation)
         const hopEvidenceTier = isPrimaryExpansionRelation(relation) ? 1 : 0
         const existingHopScore = hopScores.get(neighborId) ?? 0
@@ -407,9 +419,9 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions)
     for (const hop1Id of hop1Ids) {
       const hop1Score = hopScores.get(hop1Id) ?? 0
       if (hop1Score <= 0) continue
-      for (const hop2Id of graph.neighbors(hop1Id)) {
+      for (const hop2Id of graph.incidentNeighbors(hop1Id)) {
         if (!seedIds.has(hop2Id) && !hop1Ids.has(hop2Id)) {
-          const relation = String(graph.edgeAttributes(hop1Id, hop2Id).relation ?? 'related_to')
+          const relation = relationBetweenNodes(graph, hop1Id, hop2Id)
           const hop2Score = hop1Score * 0.5 * relationWeight(relation)
           if (hop2Score > (hop2Scores.get(hop2Id) ?? 0)) {
             hop2Scores.set(hop2Id, hop2Score)
diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts
index 50542e1..a9cc965 100644
--- a/tests/unit/retrieve.test.ts
+++ b/tests/unit/retrieve.test.ts
@@ -432,6 +432,36 @@ describe('retrieve', () => {
       expect(labels).toContain('SessionManager')
     })
 
+    it('includes predecessors of matched nodes in directed graphs', () => {
+      const graph = new KnowledgeGraph({ directed: true })
+      graph.addNode('caller', {
+        label: 'CallerService',
+        source_file: '/src/caller.ts',
+        line_number: 1,
+        node_kind: 'function',
+        file_type: 'code',
+      })
+      graph.addNode('target', {
+        label: 'TargetHandler',
+        source_file: '/src/target.ts',
+        line_number: 2,
+        node_kind: 'function',
+        file_type: 'code',
+      })
+      graph.addEdge('caller', 'target', {
+        relation: 'calls',
+        confidence: 'EXTRACTED',
+        source_file: '/src/caller.ts',
+      })
+
+      const result = retrieveContext(graph, { question: 'target', budget: 5000 })
+      const labels = result.matched_nodes.map((node) => node.label)
+
+      expect(labels).toContain('TargetHandler')
+      expect(labels).toContain('CallerService')
+      expect(result.matched_nodes.find((node) => node.label === 'CallerService')?.relevance_band).toBe('related')
+    })
+
     it('includes relationships between matched nodes', () => {
       const graph = buildTestGraph()
       const result = retrieveContext(graph, { question: 'auth', budget: 5000 })

From 74f6aedf33618d50b4d3ab148b954b73387bc724 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 20:28:51 +0400
Subject: [PATCH 07/18] chore: port compare usage baseline

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/infrastructure/compare.ts | 192 ++++++++++++++++++++++++++++++++--
 tests/unit/compare.test.ts    | 125 ++++++++++++++++++++++
 2 files changed, 311 insertions(+), 6 deletions(-)

diff --git a/src/infrastructure/compare.ts b/src/infrastructure/compare.ts
index 58a5f33..fcd2ec3 100644
--- a/src/infrastructure/compare.ts
+++ b/src/infrastructure/compare.ts
@@ -15,6 +15,7 @@ export type CompareBaselineMode = 'full' | 'bounded'
 export type CompareRunMode = 'baseline' | 'graphify'
 export type CompareRunStatus = 'not_run' | 'succeeded' | 'failed' | 'context_overflow'
 export type CompareFailureReason = 'prompt_too_long' | 'runner_error' | 'exec_error'
+export type ComparePromptTokenSource = 'estimated_cl100k_base' | 'claude_reported_input'
 
 export interface ComparePromptPack {
   kind: 'baseline' | 'graphify'
@@ -60,6 +61,17 @@ export interface ComparePromptTokenEstimator {
   exact: boolean
 }
 
+export interface ComparePromptUsage {
+  provider: 'claude'
+  source: 'structured_stdout'
+  input_tokens: number
+  output_tokens: number
+  cache_creation_input_tokens: number
+  cache_read_input_tokens: number
+  input_total_tokens: number
+  total_tokens: number
+}
+
 export interface ComparePromptReport {
   question: string
   graph_path: string
@@ -68,10 +80,21 @@ export interface ComparePromptReport {
   baseline_prompt_tokens: number
   graphify_prompt_tokens: number
   reduction_ratio: number
+  baseline_total_tokens: number | null
+  graphify_total_tokens: number | null
+  total_reduction_ratio: number | null
   baseline_prompt_tokens_estimated: number
   graphify_prompt_tokens_estimated: number
   reduction_ratio_estimated: number
   prompt_token_estimator: ComparePromptTokenEstimator
+  prompt_token_source: {
+    baseline: ComparePromptTokenSource
+    graphify: ComparePromptTokenSource
+  }
+  usage: {
+    baseline: ComparePromptUsage | null
+    graphify: ComparePromptUsage | null
+  }
   started_at: string
   completed_at: string
   elapsed_ms: {
@@ -149,6 +172,11 @@ export interface ExecuteCompareRunsDependencies {
   now?: () => Date
 }
 
+interface ParsedCompareRunnerOutput {
+  answerText: string
+  usage: ComparePromptUsage | null
+}
+
 const DEFAULT_RETRIEVAL_BUDGET = 3_000
 const DEFAULT_BOUNDED_BASELINE_TOKENS = 4_000
 const EXEC_TEMPLATE_PLACEHOLDER_PATTERN = /\{[a-z_][a-z0-9_]*\}/gi
@@ -179,6 +207,80 @@ function summarizeExecTemplate(execTemplate: string): CompareExecCommandSummary
   }
 }
 
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null
+}
+
+function parseNonNegativeNumber(value: unknown): number | null {
+  return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : null
+}
+
+function parseStructuredCompareAnswer(payload: Record<string, unknown>): string | null {
+  if (typeof payload.result === 'string') {
+    return payload.result
+  }
+  if (typeof payload.completion === 'string') {
+    return payload.completion
+  }
+  return null
+}
+
+function parseClaudeStructuredUsage(payload: Record<string, unknown>): ComparePromptUsage | null {
+  if (!isRecord(payload.usage)) {
+    return null
+  }
+
+  const inputTokens = parseNonNegativeNumber(payload.usage.input_tokens)
+  const outputTokens = parseNonNegativeNumber(payload.usage.output_tokens)
+  if (inputTokens === null || outputTokens === null) {
+    return null
+  }
+
+  const cacheCreationInputTokens = parseNonNegativeNumber(payload.usage.cache_creation_input_tokens) ?? 0
+  const cacheReadInputTokens = parseNonNegativeNumber(payload.usage.cache_read_input_tokens) ?? 0
+  const inputTotalTokens = inputTokens + cacheCreationInputTokens + cacheReadInputTokens
+
+  return {
+    provider: 'claude',
+    source: 'structured_stdout',
+    input_tokens: inputTokens,
+    output_tokens: outputTokens,
+    cache_creation_input_tokens: cacheCreationInputTokens,
+    cache_read_input_tokens: cacheReadInputTokens,
+    input_total_tokens: inputTotalTokens,
+    total_tokens: inputTotalTokens + outputTokens,
+  }
+}
+
+function parseStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput | null {
+  const trimmed = stdout.trim()
+  if (!trimmed.startsWith('{') || !trimmed.endsWith('}')) {
+    return null
+  }
+
+  let payload: unknown
+  try {
+    payload = JSON.parse(trimmed)
+  } catch {
+    return null
+  }
+
+  if (!isRecord(payload)) {
+    return null
+  }
+
+  const answerText = parseStructuredCompareAnswer(payload)
+  const usage = parseClaudeStructuredUsage(payload)
+  if (answerText === null && usage === null) {
+    return null
+  }
+
+  return {
+    answerText: answerText ?? stdout,
+    usage,
+  }
+}
+
 function validateCompareExecTemplate(template: string): void {
   if (PROMPT_FILE_COMMAND_SUBSTITUTION_PATTERNS.some((pattern) => pattern.test(template))) {
     throw new Error(
@@ -413,6 +515,33 @@ function computeReductionRatio(baselinePromptTokens: number, graphifyPromptToken
   return Number((baselinePromptTokens / graphifyPromptTokens).toFixed(1))
 }
 
+function formatTokenComparison(baselineTokens: number, graphifyTokens: number): string {
+  if (baselineTokens <= 0 || graphifyTokens <= 0) {
+    return 'n/a'
+  }
+  if (baselineTokens === graphifyTokens) {
+    return 'same size'
+  }
+  if (baselineTokens > graphifyTokens) {
+    return `${computeReductionRatio(baselineTokens, graphifyTokens)}x smaller`
+  }
+  return `${Number((graphifyTokens / baselineTokens).toFixed(1))}x larger`
+}
+
+function syncComparePromptMetrics(report: ComparePromptReport): void {
+  report.baseline_prompt_tokens = report.usage.baseline?.input_total_tokens ?? report.baseline_prompt_tokens_estimated
+  report.graphify_prompt_tokens = report.usage.graphify?.input_total_tokens ?? report.graphify_prompt_tokens_estimated
+  report.reduction_ratio = computeReductionRatio(report.baseline_prompt_tokens, report.graphify_prompt_tokens)
+  report.baseline_total_tokens = report.usage.baseline?.total_tokens ?? null
+  report.graphify_total_tokens = report.usage.graphify?.total_tokens ?? null
+  report.total_reduction_ratio =
+    report.baseline_total_tokens !== null && report.graphify_total_tokens !== null
+      ? computeReductionRatio(report.baseline_total_tokens, report.graphify_total_tokens)
+      : null
+  report.prompt_token_source.baseline = report.usage.baseline === null ? 'estimated_cl100k_base' : 'claude_reported_input'
+  report.prompt_token_source.graphify = report.usage.graphify === null ? 'estimated_cl100k_base' : 'claude_reported_input'
+}
+
 function portablePath(path: string): string {
   return relative(process.cwd(), path) || '.'
 }
@@ -756,10 +885,21 @@ export function generateCompareArtifacts(input: GenerateCompareArtifactsInput):
       baseline_prompt_tokens: baselinePromptTokens,
       graphify_prompt_tokens: graphifyPromptTokens,
       reduction_ratio: computeReductionRatio(baselinePromptTokens, graphifyPromptTokens),
+      baseline_total_tokens: null,
+      graphify_total_tokens: null,
+      total_reduction_ratio: null,
       baseline_prompt_tokens_estimated: baselinePromptTokens,
       graphify_prompt_tokens_estimated: graphifyPromptTokens,
       reduction_ratio_estimated: computeReductionRatio(baselinePromptTokens, graphifyPromptTokens),
       prompt_token_estimator: QUERY_TOKEN_ESTIMATOR,
+      prompt_token_source: {
+        baseline: 'estimated_cl100k_base',
+        graphify: 'estimated_cl100k_base',
+      },
+      usage: {
+        baseline: null,
+        graphify: null,
+      },
       started_at: now.toISOString(),
       completed_at: now.toISOString(),
       elapsed_ms: {
@@ -790,6 +930,7 @@ export function generateCompareArtifacts(input: GenerateCompareArtifactsInput):
       paths,
     }
 
+    syncComparePromptMetrics(report)
     writeCompareReport(report)
     return report
   })
@@ -841,9 +982,11 @@ export async function executeCompareRuns(
           question: report.question,
           command,
         })
-        ensureCompareAnswerFile(execution.outputFile, executionResult.stdout)
+        const parsedOutput = parseStructuredCompareRunnerOutput(executionResult.stdout)
+        ensureCompareAnswerFile(execution.outputFile, parsedOutput?.answerText ?? executionResult.stdout)
         const contextOverflowEvidence =
           executionResult.exitCode === 0 ? null : extractContextOverflowEvidence(executionResult.stdout, executionResult.stderr)
+        report.usage[execution.mode] = executionResult.exitCode === 0 ? parsedOutput?.usage ?? null : null
         report.status[execution.mode] =
           executionResult.exitCode === 0 ? 'succeeded' : contextOverflowEvidence !== null ? 'context_overflow' : 'failed'
         report.elapsed_ms[execution.mode] = executionResult.elapsedMs
@@ -854,6 +997,7 @@ export async function executeCompareRuns(
         report.evidence[execution.mode] = contextOverflowEvidence
       } catch (error) {
         ensureCompareAnswerFile(execution.outputFile, '')
+        report.usage[execution.mode] = null
         const errorMessage = error instanceof Error ? error.message : String(error)
         const contextOverflowEvidence = extractContextOverflowEvidence(errorMessage)
         report.status[execution.mode] = contextOverflowEvidence !== null ? 'context_overflow' : 'failed'
@@ -864,6 +1008,7 @@ export async function executeCompareRuns(
         report.evidence[execution.mode] = contextOverflowEvidence
       }
 
+      syncComparePromptMetrics(report)
       report.completed_at = now().toISOString()
       writeCompareReport(report)
     }
@@ -876,6 +1021,18 @@ function sumPromptTokens(reports: readonly ComparePromptReport[], mode: CompareR
   return reports.reduce((total, report) => total + (mode === 'baseline' ? report.baseline_prompt_tokens : report.graphify_prompt_tokens), 0)
 }
 
+function sumTotalTokens(reports: readonly ComparePromptReport[], mode: CompareRunMode): number | null {
+  let total = 0
+  for (const report of reports) {
+    const value = mode === 'baseline' ? report.baseline_total_tokens : report.graphify_total_tokens
+    if (value === null) {
+      return null
+    }
+    total += value
+  }
+  return total
+}
+
 function countPromptRuns(reports: readonly ComparePromptReport[], status: Exclude<CompareRunStatus, 'not_run'>): number {
   return reports.reduce((total, report) => {
     const baseline = report.status.baseline === status ? 1 : 0
@@ -884,22 +1041,45 @@ function countPromptRuns(reports: readonly ComparePromptReport[], status: Exclud
   }, 0)
 }
 
+function countPromptUsageRuns(reports: readonly ComparePromptReport[]): number {
+  return reports.reduce((total, report) => total + (report.usage.baseline === null ? 0 : 1) + (report.usage.graphify === null ? 0 : 1), 0)
+}
+
 export function formatCompareSummary(result: GenerateCompareArtifactsResult): string {
   const baselineTokens = sumPromptTokens(result.reports, 'baseline')
   const graphifyTokens = sumPromptTokens(result.reports, 'graphify')
-  const reductionRatio = computeReductionRatio(baselineTokens, graphifyTokens)
+  const baselineTotalTokens = sumTotalTokens(result.reports, 'baseline')
+  const graphifyTotalTokens = sumTotalTokens(result.reports, 'graphify')
+  const totalReductionRatio =
+    baselineTotalTokens !== null && graphifyTotalTokens !== null ? computeReductionRatio(baselineTotalTokens, graphifyTotalTokens) : null
   const failedRuns = countPromptRuns(result.reports, 'failed')
   const contextOverflowRuns = countPromptRuns(result.reports, 'context_overflow')
   const succeededRuns = countPromptRuns(result.reports, 'succeeded')
-
-  return [
+  const usageRuns = countPromptUsageRuns(result.reports)
+  const totalRuns = result.reports.length * 2
+  const promptTokenLabel =
+    usageRuns === totalRuns
+      ? 'Input tokens (Claude reported)'
+      : usageRuns > 0
+        ? `Input tokens (Claude reported where available; ${QUERY_TOKEN_ESTIMATOR.model} estimate fallback)`
+        : `Prompt tokens (estimated ${QUERY_TOKEN_ESTIMATOR.model})`
+
+  const lines = [
     `[graphify compare] completed ${result.reports.length} question(s)`,
     `- Output: ${result.output_root}`,
-    `- Prompt tokens (estimated ${QUERY_TOKEN_ESTIMATOR.model}): baseline ${baselineTokens} · graphify ${graphifyTokens} · ${reductionRatio}x smaller`,
+    `- ${promptTokenLabel}: baseline ${baselineTokens} · graphify ${graphifyTokens} · ${formatTokenComparison(baselineTokens, graphifyTokens)}`,
     `- Prompt runs: ${succeededRuns} succeeded${contextOverflowRuns > 0 ? ` · ${contextOverflowRuns} context overflow` : ''}${
       failedRuns > 0 ? ` · ${failedRuns} failed` : ''
     }`,
-  ].join('\n')
+  ]
+
+  if (baselineTotalTokens !== null && graphifyTotalTokens !== null && totalReductionRatio !== null) {
+    lines.splice(3, 0, `- Total tokens (Claude reported): baseline ${baselineTotalTokens} · graphify ${graphifyTotalTokens} · ${formatTokenComparison(baselineTotalTokens, graphifyTotalTokens)}`)
+  } else if (usageRuns > 0 && usageRuns < totalRuns) {
+    lines.splice(3, 0, `- Usage capture: Claude reported usage for ${usageRuns}/${totalRuns} prompt runs; remaining runs used local estimate fallback`)
+  }
+
+  return lines.join('\n')
 }
 
 export async function runCompareCommand(
diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts
index f83e079..a8e9e2d 100644
--- a/tests/unit/compare.test.ts
+++ b/tests/unit/compare.test.ts
@@ -611,6 +611,131 @@ describe('compare runtime', () => {
     )
   })
 
+  it('captures Claude-reported usage from structured runner output and saves plain answers', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async (execution) => ({
+          exitCode: 0,
+          stdout: JSON.stringify({
+            type: 'result',
+            subtype: 'success',
+            result: `${execution.mode} answer\n`,
+            usage:
+              execution.mode === 'baseline'
+                ? {
+                    input_tokens: 1200,
+                    output_tokens: 90,
+                    cache_creation_input_tokens: 100,
+                    cache_read_input_tokens: 20,
+                  }
+                : {
+                    input_tokens: 400,
+                    output_tokens: 70,
+                    cache_creation_input_tokens: 0,
+                    cache_read_input_tokens: 10,
+                  },
+          }),
+          stderr: '',
+          elapsedMs: execution.mode === 'baseline' ? 11 : 17,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('baseline answer\n')
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n')
+    expect(report.baseline_prompt_tokens).toBe(1320)
+    expect(report.graphify_prompt_tokens).toBe(410)
+    expect(report.prompt_token_source).toEqual({
+      baseline: 'claude_reported_input',
+      graphify: 'claude_reported_input',
+    })
+    expect(report.usage).toEqual({
+      baseline: {
+        provider: 'claude',
+        source: 'structured_stdout',
+        input_tokens: 1200,
+        output_tokens: 90,
+        cache_creation_input_tokens: 100,
+        cache_read_input_tokens: 20,
+        input_total_tokens: 1320,
+        total_tokens: 1410,
+      },
+      graphify: {
+        provider: 'claude',
+        source: 'structured_stdout',
+        input_tokens: 400,
+        output_tokens: 70,
+        cache_creation_input_tokens: 0,
+        cache_read_input_tokens: 10,
+        input_total_tokens: 410,
+        total_tokens: 480,
+      },
+    })
+    expect(report.baseline_total_tokens).toBe(1410)
+    expect(report.graphify_total_tokens).toBe(480)
+    expect(formatCompareSummary(result)).toContain('Input tokens (Claude reported): baseline 1320 · graphify 410')
+    expect(formatCompareSummary(result)).toContain('Total tokens (Claude reported): baseline 1410 · graphify 480')
+  })
+
+  it('reports when graphify uses more Claude-reported tokens than the baseline', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async (execution) => ({
+          exitCode: 0,
+          stdout: JSON.stringify({
+            type: 'result',
+            subtype: 'success',
+            result: `${execution.mode} answer\n`,
+            usage:
+              execution.mode === 'baseline'
+                ? {
+                    input_tokens: 300,
+                    output_tokens: 50,
+                    cache_creation_input_tokens: 0,
+                    cache_read_input_tokens: 0,
+                  }
+                : {
+                    input_tokens: 500,
+                    output_tokens: 80,
+                    cache_creation_input_tokens: 0,
+                    cache_read_input_tokens: 0,
+                  },
+          }),
+          stderr: '',
+          elapsedMs: 1,
+        }),
+      },
+    )
+
+    expect(formatCompareSummary(result)).toContain('Input tokens (Claude reported): baseline 300 · graphify 500 · 1.7x larger')
+    expect(formatCompareSummary(result)).toContain('Total tokens (Claude reported): baseline 350 · graphify 580 · 1.7x larger')
+  })
+
   it('preserves partial compare results when one side fails', async () => {
     const graph = makeGraph()
     writeProjectFiles()

From b0312e1a559f4c9eba938348b90724311aede67f Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 20:38:22 +0400
Subject: [PATCH 08/18] fix: avoid JSON answer artifact fallback

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/infrastructure/compare.ts |  9 ++--
 tests/unit/compare.test.ts    | 77 +++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/src/infrastructure/compare.ts b/src/infrastructure/compare.ts
index fcd2ec3..58f12cf 100644
--- a/src/infrastructure/compare.ts
+++ b/src/infrastructure/compare.ts
@@ -173,7 +173,7 @@ export interface ExecuteCompareRunsDependencies {
 }
 
 interface ParsedCompareRunnerOutput {
-  answerText: string
+  answerText: string | null
   usage: ComparePromptUsage | null
 }
 
@@ -276,7 +276,7 @@ function parseStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunner
   }
 
   return {
-    answerText: answerText ?? stdout,
+    answerText,
     usage,
   }
 }
@@ -983,7 +983,10 @@ export async function executeCompareRuns(
           command,
         })
         const parsedOutput = parseStructuredCompareRunnerOutput(executionResult.stdout)
-        ensureCompareAnswerFile(execution.outputFile, parsedOutput?.answerText ?? executionResult.stdout)
+        ensureCompareAnswerFile(
+          execution.outputFile,
+          parsedOutput === null ? executionResult.stdout : parsedOutput.answerText ?? '',
+        )
         const contextOverflowEvidence =
           executionResult.exitCode === 0 ? null : extractContextOverflowEvidence(executionResult.stdout, executionResult.stderr)
         report.usage[execution.mode] = executionResult.exitCode === 0 ? parsedOutput?.usage ?? null : null
diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts
index a8e9e2d..cd4514c 100644
--- a/tests/unit/compare.test.ts
+++ b/tests/unit/compare.test.ts
@@ -690,6 +690,83 @@ describe('compare runtime', () => {
     expect(formatCompareSummary(result)).toContain('Total tokens (Claude reported): baseline 1410 · graphify 480')
   })
 
+  it('does not write structured stdout JSON into answer artifacts when usage is present without answer text', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async () => ({
+          exitCode: 0,
+          stdout: JSON.stringify({
+            type: 'result',
+            subtype: 'success',
+            usage: {
+              input_tokens: 1200,
+              output_tokens: 90,
+              cache_creation_input_tokens: 100,
+              cache_read_input_tokens: 20,
+            },
+          }),
+          stderr: '',
+          elapsedMs: 11,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('')
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('')
+    expect(report.usage.baseline?.total_tokens).toBe(1410)
+    expect(report.usage.graphify?.total_tokens).toBe(1410)
+  })
+
+  it('falls back to raw stdout for unrecognized structured JSON output', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const stdout = JSON.stringify({
+      type: 'result',
+      subtype: 'success',
+      message: 'runner emitted raw JSON without parsed answer metadata',
+    })
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async () => ({
+          exitCode: 0,
+          stdout,
+          stderr: '',
+          elapsedMs: 11,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe(stdout)
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe(stdout)
+    expect(report.usage.baseline).toBeNull()
+    expect(report.usage.graphify).toBeNull()
+  })
+
   it('reports when graphify uses more Claude-reported tokens than the baseline', async () => {
     const graph = makeGraph()
     writeProjectFiles()

From a3f703b53c1a8ce42545602b676b8d854f82b4b3 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 20:49:39 +0400
Subject: [PATCH 09/18] test: cover Gemini compare usage capture

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/unit/compare.test.ts | 128 +++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts
index cd4514c..8aad476 100644
--- a/tests/unit/compare.test.ts
+++ b/tests/unit/compare.test.ts
@@ -767,6 +767,134 @@ describe('compare runtime', () => {
     expect(report.usage.graphify).toBeNull()
   })
 
+  it('captures Gemini-reported usage from structured runner output and saves plain answers', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async (execution) => ({
+          exitCode: 0,
+          stdout: JSON.stringify({
+            candidates: [
+              {
+                content: {
+                  parts: [{ text: `${execution.mode} answer\n` }],
+                },
+              },
+            ],
+            usageMetadata: {
+              promptTokenCount: 400,
+              candidatesTokenCount: 80,
+              totalTokenCount: 480,
+            },
+          }),
+          stderr: '',
+          elapsedMs: execution.mode === 'baseline' ? 11 : 17,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('baseline answer\n')
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n')
+    expect(report.usage.baseline).toEqual(
+      expect.objectContaining({
+        provider: 'gemini',
+        input_tokens: 400,
+        output_tokens: 80,
+        total_tokens: 480,
+      }),
+    )
+    expect(report.usage.graphify).toEqual(
+      expect.objectContaining({
+        provider: 'gemini',
+        input_tokens: 400,
+        output_tokens: 80,
+        total_tokens: 480,
+      }),
+    )
+
+    const savedReport = JSON.parse(readFileSync(report.paths.report, 'utf8')) as {
+      usage: {
+        baseline: Record<string, unknown> | null
+        graphify: Record<string, unknown> | null
+      }
+    }
+    expect(savedReport.usage.baseline).toEqual(
+      expect.objectContaining({
+        provider: 'gemini',
+        input_tokens: 400,
+        output_tokens: 80,
+        total_tokens: 480,
+      }),
+    )
+    expect(savedReport.usage.graphify).toEqual(
+      expect.objectContaining({
+        provider: 'gemini',
+        input_tokens: 400,
+        output_tokens: 80,
+        total_tokens: 480,
+      }),
+    )
+  })
+
+  it('promotes Gemini-reported input and total tokens into compare summaries', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async (execution) => ({
+          exitCode: 0,
+          stdout: JSON.stringify({
+            candidates: [
+              {
+                content: {
+                  parts: [{ text: `${execution.mode} answer\n` }],
+                },
+              },
+            ],
+            usageMetadata: {
+              promptTokenCount: 400,
+              candidatesTokenCount: 80,
+              totalTokenCount: 480,
+            },
+          }),
+          stderr: '',
+          elapsedMs: execution.mode === 'baseline' ? 11 : 17,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(report.baseline_prompt_tokens).toBe(400)
+    expect(report.graphify_prompt_tokens).toBe(400)
+    expect(report.baseline_total_tokens).toBe(480)
+    expect(report.graphify_total_tokens).toBe(480)
+    expect(formatCompareSummary(result)).toContain('Input tokens (Gemini reported): baseline 400 · graphify 400')
+    expect(formatCompareSummary(result)).toContain('Total tokens (Gemini reported): baseline 480 · graphify 480')
+    expect(formatCompareSummary(result)).toContain('reported')
+  })
+
   it('reports when graphify uses more Claude-reported tokens than the baseline', async () => {
     const graph = makeGraph()
     writeProjectFiles()

From 78f1c01a7137dc586dd6a2ccd6a1bbdfdfe3c66b Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 20:58:42 +0400
Subject: [PATCH 10/18] test: strengthen Gemini compare regressions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/unit/compare.test.ts | 66 +++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts
index 8aad476..82386e9 100644
--- a/tests/unit/compare.test.ts
+++ b/tests/unit/compare.test.ts
@@ -792,11 +792,18 @@ describe('compare runtime', () => {
                 },
               },
             ],
-            usageMetadata: {
-              promptTokenCount: 400,
-              candidatesTokenCount: 80,
-              totalTokenCount: 480,
-            },
+            usageMetadata:
+              execution.mode === 'baseline'
+                ? {
+                    promptTokenCount: 1200,
+                    candidatesTokenCount: 90,
+                    totalTokenCount: 1290,
+                  }
+                : {
+                    promptTokenCount: 400,
+                    candidatesTokenCount: 70,
+                    totalTokenCount: 470,
+                  },
           }),
           stderr: '',
           elapsedMs: execution.mode === 'baseline' ? 11 : 17,
@@ -810,17 +817,17 @@ describe('compare runtime', () => {
     expect(report.usage.baseline).toEqual(
       expect.objectContaining({
         provider: 'gemini',
-        input_tokens: 400,
-        output_tokens: 80,
-        total_tokens: 480,
+        input_tokens: 1200,
+        output_tokens: 90,
+        total_tokens: 1290,
       }),
     )
     expect(report.usage.graphify).toEqual(
       expect.objectContaining({
         provider: 'gemini',
         input_tokens: 400,
-        output_tokens: 80,
-        total_tokens: 480,
+        output_tokens: 70,
+        total_tokens: 470,
       }),
     )
 
@@ -833,17 +840,17 @@ describe('compare runtime', () => {
     expect(savedReport.usage.baseline).toEqual(
       expect.objectContaining({
         provider: 'gemini',
-        input_tokens: 400,
-        output_tokens: 80,
-        total_tokens: 480,
+        input_tokens: 1200,
+        output_tokens: 90,
+        total_tokens: 1290,
       }),
     )
     expect(savedReport.usage.graphify).toEqual(
       expect.objectContaining({
         provider: 'gemini',
         input_tokens: 400,
-        output_tokens: 80,
-        total_tokens: 480,
+        output_tokens: 70,
+        total_tokens: 470,
       }),
     )
   })
@@ -873,11 +880,18 @@ describe('compare runtime', () => {
                 },
               },
             ],
-            usageMetadata: {
-              promptTokenCount: 400,
-              candidatesTokenCount: 80,
-              totalTokenCount: 480,
-            },
+            usageMetadata:
+              execution.mode === 'baseline'
+                ? {
+                    promptTokenCount: 1200,
+                    candidatesTokenCount: 90,
+                    totalTokenCount: 1290,
+                  }
+                : {
+                    promptTokenCount: 400,
+                    candidatesTokenCount: 70,
+                    totalTokenCount: 470,
+                  },
           }),
           stderr: '',
           elapsedMs: execution.mode === 'baseline' ? 11 : 17,
@@ -886,13 +900,13 @@ describe('compare runtime', () => {
     )
 
     const report = result.reports[0]!
-    expect(report.baseline_prompt_tokens).toBe(400)
+    expect(report.baseline_prompt_tokens).toBe(1200)
     expect(report.graphify_prompt_tokens).toBe(400)
-    expect(report.baseline_total_tokens).toBe(480)
-    expect(report.graphify_total_tokens).toBe(480)
-    expect(formatCompareSummary(result)).toContain('Input tokens (Gemini reported): baseline 400 · graphify 400')
-    expect(formatCompareSummary(result)).toContain('Total tokens (Gemini reported): baseline 480 · graphify 480')
-    expect(formatCompareSummary(result)).toContain('reported')
+    expect(report.baseline_total_tokens).toBe(1290)
+    expect(report.graphify_total_tokens).toBe(470)
+    const summary = formatCompareSummary(result)
+    expect(summary).toContain('Input tokens (Gemini reported): baseline 1200 · graphify 400')
+    expect(summary).toContain('Total tokens (Gemini reported): baseline 1290 · graphify 470')
   })
 
   it('reports when graphify uses more Claude-reported tokens than the baseline', async () => {

From 112bfe724a30193f4eebb8b0585f0ccee7aa4636 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 21:14:03 +0400
Subject: [PATCH 11/18] feat: capture Gemini compare usage

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/infrastructure/compare.ts | 162 +++++++++++++++++++++++++++++-----
 1 file changed, 141 insertions(+), 21 deletions(-)

diff --git a/src/infrastructure/compare.ts b/src/infrastructure/compare.ts
index 58f12cf..61a45d9 100644
--- a/src/infrastructure/compare.ts
+++ b/src/infrastructure/compare.ts
@@ -15,7 +15,7 @@ export type CompareBaselineMode = 'full' | 'bounded'
 export type CompareRunMode = 'baseline' | 'graphify'
 export type CompareRunStatus = 'not_run' | 'succeeded' | 'failed' | 'context_overflow'
 export type CompareFailureReason = 'prompt_too_long' | 'runner_error' | 'exec_error'
-export type ComparePromptTokenSource = 'estimated_cl100k_base' | 'claude_reported_input'
+export type ComparePromptTokenSource = 'estimated_cl100k_base' | 'claude_reported_input' | 'gemini_reported_input'
 
 export interface ComparePromptPack {
   kind: 'baseline' | 'graphify'
@@ -62,7 +62,7 @@ export interface ComparePromptTokenEstimator {
 }
 
 export interface ComparePromptUsage {
-  provider: 'claude'
+  provider: 'claude' | 'gemini'
   source: 'structured_stdout'
   input_tokens: number
   output_tokens: number
@@ -177,6 +177,8 @@ interface ParsedCompareRunnerOutput {
   usage: ComparePromptUsage | null
 }
 
+type CompareRunnerOutputParser = (stdout: string) => ParsedCompareRunnerOutput | null
+
 const DEFAULT_RETRIEVAL_BUDGET = 3_000
 const DEFAULT_BOUNDED_BASELINE_TOKENS = 4_000
 const EXEC_TEMPLATE_PLACEHOLDER_PATTERN = /\{[a-z_][a-z0-9_]*\}/gi
@@ -225,6 +227,22 @@ function parseStructuredCompareAnswer(payload: Record<string, unknown>): string
   return null
 }
 
+function parseJsonRecord(stdout: string): Record<string, unknown> | null {
+  const trimmed = stdout.trim()
+  if (!trimmed.startsWith('{') || !trimmed.endsWith('}')) {
+    return null
+  }
+
+  let payload: unknown
+  try {
+    payload = JSON.parse(trimmed)
+  } catch {
+    return null
+  }
+
+  return isRecord(payload) ? payload : null
+}
+
 function parseClaudeStructuredUsage(payload: Record<string, unknown>): ComparePromptUsage | null {
   if (!isRecord(payload.usage)) {
     return null
@@ -252,25 +270,75 @@ function parseClaudeStructuredUsage(payload: Record<string, unknown>): ComparePr
   }
 }
 
-function parseStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput | null {
-  const trimmed = stdout.trim()
-  if (!trimmed.startsWith('{') || !trimmed.endsWith('}')) {
+function parseClaudeStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput | null {
+  const payload = parseJsonRecord(stdout)
+  if (payload === null) {
     return null
   }
 
-  let payload: unknown
-  try {
-    payload = JSON.parse(trimmed)
-  } catch {
+  const answerText = parseStructuredCompareAnswer(payload)
+  const usage = parseClaudeStructuredUsage(payload)
+  if (answerText === null && usage === null) {
     return null
   }
 
-  if (!isRecord(payload)) {
+  return {
+    answerText,
+    usage,
+  }
+}
+
+function parseGeminiStructuredAnswer(payload: Record<string, unknown>): string | null {
+  if (!Array.isArray(payload.candidates) || payload.candidates.length === 0) {
     return null
   }
 
-  const answerText = parseStructuredCompareAnswer(payload)
-  const usage = parseClaudeStructuredUsage(payload)
+  const firstCandidate = payload.candidates[0]
+  if (!isRecord(firstCandidate) || !isRecord(firstCandidate.content) || !Array.isArray(firstCandidate.content.parts)) {
+    return null
+  }
+
+  for (const part of firstCandidate.content.parts) {
+    if (isRecord(part) && typeof part.text === 'string') {
+      return part.text
+    }
+  }
+
+  return null
+}
+
+function parseGeminiStructuredUsage(payload: Record<string, unknown>): ComparePromptUsage | null {
+  if (!isRecord(payload.usageMetadata)) {
+    return null
+  }
+
+  const inputTokens = parseNonNegativeNumber(payload.usageMetadata.promptTokenCount)
+  const outputTokens = parseNonNegativeNumber(payload.usageMetadata.candidatesTokenCount)
+  const totalTokens = parseNonNegativeNumber(payload.usageMetadata.totalTokenCount)
+  if (inputTokens === null || outputTokens === null || totalTokens === null) {
+    return null
+  }
+
+  return {
+    provider: 'gemini',
+    source: 'structured_stdout',
+    input_tokens: inputTokens,
+    output_tokens: outputTokens,
+    cache_creation_input_tokens: 0,
+    cache_read_input_tokens: 0,
+    input_total_tokens: inputTokens,
+    total_tokens: totalTokens,
+  }
+}
+
+function parseGeminiStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput | null {
+  const payload = parseJsonRecord(stdout)
+  if (payload === null) {
+    return null
+  }
+
+  const answerText = parseGeminiStructuredAnswer(payload)
+  const usage = parseGeminiStructuredUsage(payload)
   if (answerText === null && usage === null) {
     return null
   }
@@ -281,6 +349,29 @@ function parseStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunner
   }
 }
 
+const COMPARE_RUNNER_OUTPUT_PARSERS: readonly CompareRunnerOutputParser[] = [
+  parseClaudeStructuredCompareRunnerOutput,
+  parseGeminiStructuredCompareRunnerOutput,
+]
+
+function parsePlainTextCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput {
+  return {
+    answerText: stdout,
+    usage: null,
+  }
+}
+
+function parseCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput {
+  for (const parser of COMPARE_RUNNER_OUTPUT_PARSERS) {
+    const parsedOutput = parser(stdout)
+    if (parsedOutput !== null) {
+      return parsedOutput
+    }
+  }
+
+  return parsePlainTextCompareRunnerOutput(stdout)
+}
+
 function validateCompareExecTemplate(template: string): void {
   if (PROMPT_FILE_COMMAND_SUBSTITUTION_PATTERNS.some((pattern) => pattern.test(template))) {
     throw new Error(
@@ -538,8 +629,16 @@ function syncComparePromptMetrics(report: ComparePromptReport): void {
     report.baseline_total_tokens !== null && report.graphify_total_tokens !== null
       ? computeReductionRatio(report.baseline_total_tokens, report.graphify_total_tokens)
       : null
-  report.prompt_token_source.baseline = report.usage.baseline === null ? 'estimated_cl100k_base' : 'claude_reported_input'
-  report.prompt_token_source.graphify = report.usage.graphify === null ? 'estimated_cl100k_base' : 'claude_reported_input'
+  report.prompt_token_source.baseline = comparePromptTokenSource(report.usage.baseline)
+  report.prompt_token_source.graphify = comparePromptTokenSource(report.usage.graphify)
+}
+
+function comparePromptTokenSource(usage: ComparePromptUsage | null): ComparePromptTokenSource {
+  if (usage === null) {
+    return 'estimated_cl100k_base'
+  }
+
+  return usage.provider === 'claude' ? 'claude_reported_input' : 'gemini_reported_input'
 }
 
 function portablePath(path: string): string {
@@ -982,14 +1081,14 @@ export async function executeCompareRuns(
           question: report.question,
           command,
         })
-        const parsedOutput = parseStructuredCompareRunnerOutput(executionResult.stdout)
+        const parsedOutput = parseCompareRunnerOutput(executionResult.stdout)
         ensureCompareAnswerFile(
           execution.outputFile,
-          parsedOutput === null ? executionResult.stdout : parsedOutput.answerText ?? '',
+          parsedOutput.answerText ?? '',
         )
         const contextOverflowEvidence =
           executionResult.exitCode === 0 ? null : extractContextOverflowEvidence(executionResult.stdout, executionResult.stderr)
-        report.usage[execution.mode] = executionResult.exitCode === 0 ? parsedOutput?.usage ?? null : null
+        report.usage[execution.mode] = executionResult.exitCode === 0 ? parsedOutput.usage : null
         report.status[execution.mode] =
           executionResult.exitCode === 0 ? 'succeeded' : contextOverflowEvidence !== null ? 'context_overflow' : 'failed'
         report.elapsed_ms[execution.mode] = executionResult.elapsedMs
@@ -1048,6 +1147,26 @@ function countPromptUsageRuns(reports: readonly ComparePromptReport[]): number {
   return reports.reduce((total, report) => total + (report.usage.baseline === null ? 0 : 1) + (report.usage.graphify === null ? 0 : 1), 0)
 }
 
+function usageProviderSummaryLabel(reports: readonly ComparePromptReport[]): string {
+  const providers = new Set<ComparePromptUsage['provider']>()
+
+  for (const report of reports) {
+    if (report.usage.baseline !== null) {
+      providers.add(report.usage.baseline.provider)
+    }
+    if (report.usage.graphify !== null) {
+      providers.add(report.usage.graphify.provider)
+    }
+  }
+
+  if (providers.size !== 1) {
+    return 'Runner'
+  }
+
+  const [provider] = providers
+  return provider === 'gemini' ? 'Gemini' : 'Claude'
+}
+
 export function formatCompareSummary(result: GenerateCompareArtifactsResult): string {
   const baselineTokens = sumPromptTokens(result.reports, 'baseline')
   const graphifyTokens = sumPromptTokens(result.reports, 'graphify')
@@ -1060,11 +1179,12 @@ export function formatCompareSummary(result: GenerateCompareArtifactsResult): st
   const succeededRuns = countPromptRuns(result.reports, 'succeeded')
   const usageRuns = countPromptUsageRuns(result.reports)
   const totalRuns = result.reports.length * 2
+  const usageProviderLabel = usageProviderSummaryLabel(result.reports)
   const promptTokenLabel =
     usageRuns === totalRuns
-      ? 'Input tokens (Claude reported)'
+      ? `Input tokens (${usageProviderLabel} reported)`
       : usageRuns > 0
-        ? `Input tokens (Claude reported where available; ${QUERY_TOKEN_ESTIMATOR.model} estimate fallback)`
+        ? `Input tokens (${usageProviderLabel} reported where available; ${QUERY_TOKEN_ESTIMATOR.model} estimate fallback)`
         : `Prompt tokens (estimated ${QUERY_TOKEN_ESTIMATOR.model})`
 
   const lines = [
@@ -1077,9 +1197,9 @@ export function formatCompareSummary(result: GenerateCompareArtifactsResult): st
   ]
 
   if (baselineTotalTokens !== null && graphifyTotalTokens !== null && totalReductionRatio !== null) {
-    lines.splice(3, 0, `- Total tokens (Claude reported): baseline ${baselineTotalTokens} · graphify ${graphifyTotalTokens} · ${formatTokenComparison(baselineTotalTokens, graphifyTotalTokens)}`)
+    lines.splice(3, 0, `- Total tokens (${usageProviderLabel} reported): baseline ${baselineTotalTokens} · graphify ${graphifyTotalTokens} · ${formatTokenComparison(baselineTotalTokens, graphifyTotalTokens)}`)
   } else if (usageRuns > 0 && usageRuns < totalRuns) {
-    lines.splice(3, 0, `- Usage capture: Claude reported usage for ${usageRuns}/${totalRuns} prompt runs; remaining runs used local estimate fallback`)
+    lines.splice(3, 0, `- Usage capture: ${usageProviderLabel} reported usage for ${usageRuns}/${totalRuns} prompt runs; remaining runs used local estimate fallback`)
   }
 
   return lines.join('\n')

From 60cac7631165534947bace44a34b4e50b870c8b8 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 21:27:30 +0400
Subject: [PATCH 12/18] fix: concatenate Gemini compare answer parts

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/infrastructure/compare.ts |  5 ++--
 tests/unit/compare.test.ts    | 47 +++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/src/infrastructure/compare.ts b/src/infrastructure/compare.ts
index 61a45d9..5c61863 100644
--- a/src/infrastructure/compare.ts
+++ b/src/infrastructure/compare.ts
@@ -298,13 +298,14 @@ function parseGeminiStructuredAnswer(payload: Record<string, unknown>): string |
     return null
   }
 
+  let answerText = ''
   for (const part of firstCandidate.content.parts) {
     if (isRecord(part) && typeof part.text === 'string') {
-      return part.text
+      answerText += part.text
     }
   }
 
-  return null
+  return answerText.length > 0 ? answerText : null
 }
 
 function parseGeminiStructuredUsage(payload: Record<string, unknown>): ComparePromptUsage | null {
diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts
index 82386e9..2cceb5d 100644
--- a/tests/unit/compare.test.ts
+++ b/tests/unit/compare.test.ts
@@ -855,6 +855,53 @@ describe('compare runtime', () => {
     )
   })
 
+  it('concatenates Gemini text parts from the first candidate into answer artifacts', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async (execution) => ({
+          exitCode: 0,
+          stdout: JSON.stringify({
+            candidates: [
+              {
+                content: {
+                  parts: [{ text: `${execution.mode} ` }, { inlineData: { mimeType: 'text/plain' } }, { text: 'answer' }, { text: '\n' }],
+                },
+              },
+              {
+                content: {
+                  parts: [{ text: 'ignored candidate answer\n' }],
+                },
+              },
+            ],
+            usageMetadata: {
+              promptTokenCount: 1200,
+              candidatesTokenCount: 90,
+              totalTokenCount: 1290,
+            },
+          }),
+          stderr: '',
+          elapsedMs: execution.mode === 'baseline' ? 11 : 17,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('baseline answer\n')
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n')
+  })
+
   it('promotes Gemini-reported input and total tokens into compare summaries', async () => {
     const graph = makeGraph()
     writeProjectFiles()

From 75630eeb39919e5800b13409a9da68158178c7cf Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 21:35:42 +0400
Subject: [PATCH 13/18] test: lock Gemini compare fallback behavior

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/unit/compare.test.ts | 75 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts
index 2cceb5d..6674217 100644
--- a/tests/unit/compare.test.ts
+++ b/tests/unit/compare.test.ts
@@ -855,6 +855,50 @@ describe('compare runtime', () => {
     )
   })
 
+  it('saves Gemini answers when structured usage metadata is missing and keeps estimate summaries', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async (execution) => ({
+          exitCode: 0,
+          stdout: JSON.stringify({
+            candidates: [
+              {
+                content: {
+                  parts: [{ text: `${execution.mode} answer\n` }],
+                },
+              },
+            ],
+          }),
+          stderr: '',
+          elapsedMs: execution.mode === 'baseline' ? 11 : 17,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('baseline answer\n')
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n')
+    expect(report.usage.baseline).toBeNull()
+    expect(report.usage.graphify).toBeNull()
+    expect(report.prompt_token_source).toEqual({
+      baseline: 'estimated_cl100k_base',
+      graphify: 'estimated_cl100k_base',
+    })
+    expect(formatCompareSummary(result)).toContain('estimate')
+  })
+
   it('concatenates Gemini text parts from the first candidate into answer artifacts', async () => {
     const graph = makeGraph()
     writeProjectFiles()
@@ -902,6 +946,37 @@ describe('compare runtime', () => {
     expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n')
   })
 
+  it('preserves malformed Gemini JSON stdout as the answer artifact without capturing usage', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async () => ({
+          exitCode: 0,
+          stdout: '{not valid json',
+          stderr: '',
+          elapsedMs: 11,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toContain('{not valid json')
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toContain('{not valid json')
+    expect(report.usage.baseline).toBeNull()
+    expect(report.usage.graphify).toBeNull()
+  })
+
   it('promotes Gemini-reported input and total tokens into compare summaries', async () => {
     const graph = makeGraph()
     writeProjectFiles()

From 90f0c7acc2409df510943dc11ee5de40b283d3fc Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 21:38:31 +0400
Subject: [PATCH 14/18] test: tighten Gemini fallback assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/unit/compare.test.ts | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts
index 6674217..43a738e 100644
--- a/tests/unit/compare.test.ts
+++ b/tests/unit/compare.test.ts
@@ -950,6 +950,7 @@ describe('compare runtime', () => {
     const graph = makeGraph()
     writeProjectFiles()
     const graphPath = writeGraphFixture(graph)
+    const rawStdout = '{not valid json'
 
     const result = await executeCompareRuns(
       {
@@ -963,7 +964,7 @@ describe('compare runtime', () => {
       {
         runner: async () => ({
           exitCode: 0,
-          stdout: '{not valid json',
+          stdout: rawStdout,
           stderr: '',
           elapsedMs: 11,
         }),
@@ -971,8 +972,8 @@ describe('compare runtime', () => {
     )
 
     const report = result.reports[0]!
-    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toContain('{not valid json')
-    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toContain('{not valid json')
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe(rawStdout)
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe(rawStdout)
     expect(report.usage.baseline).toBeNull()
     expect(report.usage.graphify).toBeNull()
   })

From f5afba88a254eb35ca189d5aca39eb400f781e0a Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 22:02:34 +0400
Subject: [PATCH 15/18] test: cover Gemini usage-only artifacts

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/unit/compare.test.ts | 88 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts
index 43a738e..bee03b2 100644
--- a/tests/unit/compare.test.ts
+++ b/tests/unit/compare.test.ts
@@ -855,6 +855,94 @@ describe('compare runtime', () => {
     )
   })
 
+  it('does not write Gemini structured stdout JSON into answer artifacts when usage metadata is present without answer text', async () => {
+    const graph = makeGraph()
+    writeProjectFiles()
+    const graphPath = writeGraphFixture(graph)
+
+    const result = await executeCompareRuns(
+      {
+        graphPath,
+        question: 'how does login create a session',
+        outputDir: COMPARE_OUTPUT_ROOT,
+        execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}',
+        baselineMode: 'full',
+        now: new Date('2026-04-24T19:30:00.000Z'),
+      },
+      {
+        runner: async (execution) => ({
+          exitCode: 0,
+          stdout: JSON.stringify({
+            candidates: [
+              {
+                content: {
+                  parts: [{ inlineData: { mimeType: 'text/plain' } }],
+                },
+              },
+            ],
+            usageMetadata:
+              execution.mode === 'baseline'
+                ? {
+                    promptTokenCount: 1200,
+                    candidatesTokenCount: 90,
+                    totalTokenCount: 1290,
+                  }
+                : {
+                    promptTokenCount: 400,
+                    candidatesTokenCount: 70,
+                    totalTokenCount: 470,
+                  },
+          }),
+          stderr: '',
+          elapsedMs: execution.mode === 'baseline' ? 11 : 17,
+        }),
+      },
+    )
+
+    const report = result.reports[0]!
+    expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('')
+    expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('')
+    expect(report.usage.baseline).toEqual(
+      expect.objectContaining({
+        provider: 'gemini',
+        input_tokens: 1200,
+        output_tokens: 90,
+        total_tokens: 1290,
+      }),
+    )
+    expect(report.usage.graphify).toEqual(
+      expect.objectContaining({
+        provider: 'gemini',
+        input_tokens: 400,
+        output_tokens: 70,
+        total_tokens: 470,
+      }),
+    )
+
+    const savedReport = JSON.parse(readFileSync(report.paths.report, 'utf8')) as {
+      usage: {
+        baseline: Record<string, unknown> | null
+        graphify: Record<string, unknown> | null
+      }
+    }
+    expect(savedReport.usage.baseline).toEqual(
+      expect.objectContaining({
+        provider: 'gemini',
+        input_tokens: 1200,
+        output_tokens: 90,
+        total_tokens: 1290,
+      }),
+    )
+    expect(savedReport.usage.graphify).toEqual(
+      expect.objectContaining({
+        provider: 'gemini',
+        input_tokens: 400,
+        output_tokens: 70,
+        total_tokens: 470,
+      }),
+    )
+  })
+
   it('saves Gemini answers when structured usage metadata is missing and keeps estimate summaries', async () => {
     const graph = makeGraph()
     writeProjectFiles()

From 686e57150f35d8fa0b96c370fb1c8a217edf588f Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 22:09:42 +0400
Subject: [PATCH 16/18] feat: support Gemini compare usage capture

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 CHANGELOG.md             |  1 +
 README.md                | 13 +++++++++++--
 docs/proof-workflows.md  | 12 ++++++++++--
 examples/why-graphify.md | 19 ++++++++++++++++---
 4 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ccc05a9..7e90189 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ All notable changes to the TypeScript package will be documented in this file.
 ### Improved
 
 - **Retrieval quality**: improved retrieval ranking with relation-aware expansion so connected evidence surfaces more effectively, and strengthened recall/MRR eval guardrails to prevent misleading benchmark results
+- **Gemini compare docs**: documented the stdin-safe Gemini JSON runner (`cat {prompt_file} | gemini -p --output-format json`), clarified that `compare` uses reported Gemini/Claude usage when structured JSON includes it, falls back to labeled local estimates otherwise, and that `benchmark`/`eval` remain offline estimate surfaces
 
 ## [0.8.7] - 2026-04-27
 
diff --git a/README.md b/README.md
index 442c67c..d51fbc1 100644
--- a/README.md
+++ b/README.md
@@ -79,16 +79,25 @@ node dist/src/cli/bin.js compare "How does login create a session?" \
   --yes
 ```
 
+Gemini-safe installed-CLI invocation:
+
+```bash
+graphify-ts compare "How does auth work?" \
+  --exec 'cat {prompt_file} | gemini -p --output-format json' \
+  --yes
+```
+
 What `compare` does:
 
 - Prints a warning before execution because it may consume paid model tokens. Use `--yes` for non-interactive runs and CI.
 - Expands runner placeholders: `{prompt_file}`, `{question}`, `{mode}`, and `{output_file}`.
 - For large prompts, pass `{prompt_file}` through stdin or file redirection. Avoid shell command substitution around `{prompt_file}` (for example `$(cat {prompt_file})`), which can hit OS argument-length limits.
 - Writes a proof bundle under `graphify-out/compare/<timestamp>/` with `baseline-prompt.txt`, `graphify-prompt.txt`, `baseline-answer.txt`, `graphify-answer.txt`, and `report.json`.
-- Reports prompt-token counts as local `cl100k_base` estimates, not provider billing tokens.
+- Promotes provider-reported usage into `report.json` and the terminal summary when the runner emits structured JSON with usage (for Gemini, `usageMetadata` from `--output-format json`; for Claude, structured JSON with `usage`).
+- Falls back to labeled local `cl100k_base` prompt estimates when the runner only returns answer text or malformed JSON, so the token source stays explicit.
 - Preserves partial artifacts when one side fails, and classifies prompt-size failures such as `Prompt is too long` as `context_overflow` evidence in `report.json`.
 
-Use `compare` when you want a showcase or a customer-proof run. Use `benchmark` and `eval` when you want repeatable local measurements without calling a model.
+Use `compare` when you want a showcase or a customer-proof run. Use `benchmark` and `eval` when you want repeatable local measurements without calling a model; they remain offline estimate surfaces rather than provider-reported usage surfaces.
 
 ## Graph time travel (ref-to-ref graph compare)
 
diff --git a/docs/proof-workflows.md b/docs/proof-workflows.md
index c2f2360..2d35e3a 100644
--- a/docs/proof-workflows.md
+++ b/docs/proof-workflows.md
@@ -32,6 +32,14 @@ node dist/src/cli/bin.js compare "How does login create a session?" \
   --yes
 ```
 
+Gemini-safe installed-CLI invocation:
+
+```bash
+graphify-ts compare "How does auth work?" \
+  --exec 'cat {prompt_file} | gemini -p --output-format json' \
+  --yes
+```
+
 What gets saved under `graphify-out/compare/<timestamp>/`:
 
 - `baseline-prompt.txt`
@@ -40,7 +48,7 @@ What gets saved under `graphify-out/compare/<timestamp>/`:
 - `graphify-answer.txt`
 - `report.json`
 
-Use this when you need customer-proof or your own apples-to-apples answer comparison. It can spend paid model tokens, so it is intentionally separate from the local benchmark/eval path.
+When Gemini emits structured JSON with `usageMetadata`, `compare` captures real reported input and total tokens in `report.json` and the terminal summary. If the runner only returns answer text or malformed JSON, `compare` falls back to labeled local `cl100k_base` prompt estimates instead. Use `compare` when you need a customer-facing proof or your own apples-to-apples answer comparison. It can spend paid model tokens, so it is intentionally separate from the local benchmark/eval path. `benchmark` and `eval` remain offline estimate surfaces.
 
 ## 3. Production and multi-repo proof
 
@@ -78,7 +86,7 @@ What this proves that a single-repo demo cannot:
 |---|---|
 | "Does the graph improve retrieval quality on a labeled set?" | `eval` |
 | "Does the graph reduce prompt size while keeping expected evidence?" | `benchmark` |
-| "Will my actual model answer better with graphify than with a naive baseline?" | `compare` |
+| "Will my actual model answer better with graphify than with a naive baseline, and can I optionally capture provider-reported usage?" | `compare` |
 | "Can this work across frontend/backend/shared repos?" | `federate` + `serve --stdio` |
 
 For the narrative production benchmark and the GoValidate numbers, see [`examples/why-graphify.md`](../examples/why-graphify.md). For exact support coverage by language and file type, see [`language-capability-matrix.md`](./language-capability-matrix.md).
diff --git a/examples/why-graphify.md b/examples/why-graphify.md
index d7b5b19..da88406 100644
--- a/examples/why-graphify.md
+++ b/examples/why-graphify.md
@@ -141,14 +141,22 @@ node dist/src/cli/bin.js compare "How does login create a session?" \
   --yes
 ```
 
+Gemini-safe installed-CLI invocation:
+
+```bash
+graphify-ts compare "How does auth work?" \
+  --exec 'cat {prompt_file} | gemini -p --output-format json' \
+  --yes
+```
+
 What this gives you:
 
 - one baseline prompt and one graphify prompt for the same question
 - two real model answers from your own terminal runner
 - a saved proof bundle in `graphify-out/compare/<timestamp>/`
-- prompt-token counts and run statuses in `report.json`
+- prompt-token counts, usage-source labels, and run statuses in `report.json`
 
-Important: `compare` may spend paid model tokens. It prints a warning before execution and requires `--yes` in non-interactive runs. For large prompts, use stdin or file redirection with `{prompt_file}`; avoid shell command substitution around `{prompt_file}` (for example `$(cat {prompt_file})`) because shell argument expansion can fail on full-repo baselines.
+Important: `compare` may spend paid model tokens. It prints a warning before execution and requires `--yes` in non-interactive runs. For large prompts, use stdin or file redirection with `{prompt_file}`; avoid shell command substitution around `{prompt_file}` (for example `$(cat {prompt_file})`) because shell argument expansion can fail on full-repo baselines. If Gemini emits structured JSON with `usageMetadata`, `compare` records real reported input and total tokens. If the runner only returns answer text or malformed JSON, `compare` falls back to labeled local `cl100k_base` prompt estimates instead. `benchmark` and `eval` stay offline estimate surfaces.
 
 ## Run It on Your Own Codebase
 
@@ -168,6 +176,11 @@ graphify-ts eval graphify-out/graph.json --questions benchmark-questions.json
 # If you want a real same-model A/B proof run
 graphify-ts compare "How does auth work?" --exec 'cat {prompt_file} | claude -p' --yes
 
+# Gemini-safe compare runner with structured usage capture
+graphify-ts compare "How does auth work?" \
+  --exec 'cat {prompt_file} | gemini -p --output-format json' \
+  --yes
+
 # Set up your AI agent
 graphify-ts claude install    # writes .mcp.json with MCP server
 graphify-ts cursor install    # writes .cursor/mcp.json
@@ -187,7 +200,7 @@ For an internal team rollout, the most convincing sequence is usually:
 That progression keeps the proof honest:
 
 - `benchmark` and `eval` are local graph-quality measurements
-- `compare` is the model-facing proof
+- `compare` is the model-facing proof, with reported usage when the runner emits structured JSON and labeled estimates otherwise
 - `federate` is the production architecture proof for frontend/backend/shared or microservice splits
 
 ## Capability Coverage Matters

From 29599cdfd92d53205b252e22791996a33cbe9907 Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 22:15:10 +0400
Subject: [PATCH 17/18] docs: fix Gemini compare invocation

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 README.md                | 2 +-
 docs/proof-workflows.md  | 2 +-
 examples/why-graphify.md | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d51fbc1..3b38d19 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ Gemini-safe installed-CLI invocation:
 
 ```bash
 graphify-ts compare "How does auth work?" \
-  --exec 'cat {prompt_file} | gemini -p --output-format json' \
+  --exec 'cat {prompt_file} | gemini -p "" --output-format json' \
   --yes
 ```
 
diff --git a/docs/proof-workflows.md b/docs/proof-workflows.md
index 2d35e3a..754c3d5 100644
--- a/docs/proof-workflows.md
+++ b/docs/proof-workflows.md
@@ -36,7 +36,7 @@ Gemini-safe installed-CLI invocation:
 
 ```bash
 graphify-ts compare "How does auth work?" \
-  --exec 'cat {prompt_file} | gemini -p --output-format json' \
+  --exec 'cat {prompt_file} | gemini -p "" --output-format json' \
   --yes
 ```
 
diff --git a/examples/why-graphify.md b/examples/why-graphify.md
index da88406..4b16d16 100644
--- a/examples/why-graphify.md
+++ b/examples/why-graphify.md
@@ -145,7 +145,7 @@ Gemini-safe installed-CLI invocation:
 
 ```bash
 graphify-ts compare "How does auth work?" \
-  --exec 'cat {prompt_file} | gemini -p --output-format json' \
+  --exec 'cat {prompt_file} | gemini -p "" --output-format json' \
   --yes
 ```
 
@@ -178,7 +178,7 @@ graphify-ts compare "How does auth work?" --exec 'cat {prompt_file} | claude -p'
 
 # Gemini-safe compare runner with structured usage capture
 graphify-ts compare "How does auth work?" \
-  --exec 'cat {prompt_file} | gemini -p --output-format json' \
+  --exec 'cat {prompt_file} | gemini -p "" --output-format json' \
   --yes
 
 # Set up your AI agent

From b281904e09b92b6458153eb747238934010b9e6d Mon Sep 17 00:00:00 2001
From: mohammed naji <mohammed@naji.dev>
Date: Mon, 27 Apr 2026 22:18:51 +0400
Subject: [PATCH 18/18] docs: fix Gemini changelog example

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e90189..657bc5e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ All notable changes to the TypeScript package will be documented in this file.
 ### Improved
 
 - **Retrieval quality**: improved retrieval ranking with relation-aware expansion so connected evidence surfaces more effectively, and strengthened recall/MRR eval guardrails to prevent misleading benchmark results
-- **Gemini compare docs**: documented the stdin-safe Gemini JSON runner (`cat {prompt_file} | gemini -p --output-format json`), clarified that `compare` uses reported Gemini/Claude usage when structured JSON includes it, falls back to labeled local estimates otherwise, and that `benchmark`/`eval` remain offline estimate surfaces
+- **Gemini compare docs**: documented the stdin-safe Gemini JSON runner (`cat {prompt_file} | gemini -p "" --output-format json`), clarified that `compare` uses reported Gemini/Claude usage when structured JSON includes it, falls back to labeled local estimates otherwise, and that `benchmark`/`eval` remain offline estimate surfaces
 
 ## [0.8.7] - 2026-04-27