From b4bfffe7bd96f6fa4b14b5f266286763172344f7 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 12:49:35 +0400 Subject: [PATCH 01/18] test: lock retrieval seed ranking behavior Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/runtime/retrieve.ts | 96 +++++++++++++++++++------ tests/unit/retrieve.test.ts | 137 ++++++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+), 23 deletions(-) diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts index 194089a..ca0083d 100644 --- a/src/runtime/retrieve.ts +++ b/src/runtime/retrieve.ts @@ -190,6 +190,14 @@ function storedCommunityLabelsFromGraph(graph: KnowledgeGraph): Record 0 || score.labelTokenScore > 0) { + return 2 + } + if (score.sourcePathScore > 0 || score.communityScore > 0) { + return 1 + } + return 0 +} + +function compareScoredNodes(graph: KnowledgeGraph, left: ScoredNode, right: ScoredNode): number { + return ( + right.evidenceTier - left.evidenceTier || + right.score - left.score || + graph.degree(right.id) - graph.degree(left.id) + ) +} + +function scoreSeedCandidate( + question: string, + questionTokens: readonly string[], + label: string, + sourceFile: string, + communityLabel: string | null, + tokenWeights: ReadonlyMap, +): SeedScoreBreakdown { + const labelExactScore = normalizeSeedText(question) !== '' && normalizeSeedText(question) === normalizeSeedText(label) ? 2 : 0 + const labelTokenScore = scoreNode(questionTokens, tokenizeLabel(label), tokenWeights) + const sourcePathScore = scoreNode(questionTokens, tokenizeLabel(sourceFile), tokenWeights) * 0.25 + const communityScore = communityLabel + ? Math.min(scoreNode(questionTokens, tokenizeLabel(communityLabel)) * 0.1, 0.2) + : 0 + + return { + labelExactScore, + labelTokenScore, + sourcePathScore, + communityScore, + total: labelExactScore + labelTokenScore + sourcePathScore + communityScore, + } +} + export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions): RetrieveResult { const { question, budget } = options const questionTokens = tokenizeQuestion(question) @@ -217,23 +272,14 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) } } - // Pre-compute community label scores so nodes in matching communities get a boost + // Pre-compute community labels so seed scoring can treat them as secondary evidence. const communities = communitiesFromGraph(graph) const communityLabels: Record = { ...buildCommunityLabels(graph, communities), ...storedCommunityLabelsFromGraph(graph), } - const communityBoost = new Map() - for (const [idStr, label] of Object.entries(communityLabels)) { - const id = Number(idStr) - const communityTokens = tokenizeLabel(label) - const score = scoreNode(questionTokens, communityTokens) - if (score > 0) { - communityBoost.set(id, Math.min(score * 0.1, 0.3)) - } - } - // Step 1+2: Score all nodes with TF-IDF-weighted tokens + community boost + // Step 1+2: Score all nodes with explicit seed evidence weights. const tokenWeights = tokenWeightsForQuestion(graph, questionTokens) const scored: ScoredNode[] = [] for (const [id, attributes] of graph.nodeEntries()) { @@ -248,15 +294,17 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) } const label = String(attributes.label ?? '') - const labelTokens = tokenizeLabel(label) const sourceFile = String(attributes.source_file ?? '') - const sourceTokens = tokenizeLabel(sourceFile) - const labelScore = scoreNode(questionTokens, labelTokens, tokenWeights) - const sourceScore = scoreNode(questionTokens, sourceTokens, tokenWeights) * 0.5 - const comBoost = community !== null ? (communityBoost.get(community) ?? 0) : 0 - const totalScore = labelScore + sourceScore + comBoost - - if (totalScore > 0) { + const score = scoreSeedCandidate( + question, + questionTokens, + label, + sourceFile, + community !== null ? (communityLabels[community] ?? null) : null, + tokenWeights, + ) + + if (score.total > 0) { scored.push({ id, label, @@ -265,13 +313,14 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) nodeKind: String(attributes.node_kind ?? ''), fileType, community, - score: totalScore, - relevanceBand: labelScore + sourceScore > 0 ? 'direct' : 'related', + evidenceTier: evidenceTierForSeedScore(score), + score: score.total, + relevanceBand: score.labelExactScore > 0 || score.labelTokenScore > 0 ? 'direct' : 'related', }) } } - scored.sort((a, b) => b.score - a.score || graph.degree(b.id) - graph.degree(a.id)) + scored.sort((a, b) => compareScoredNodes(graph, a, b)) // Step 3: Multi-hop expansion — take top seeds, expand 2 hops with decaying scores const seedCount = Math.min(scored.length, 10) @@ -333,6 +382,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) nodeKind: String(attributes.node_kind ?? ''), fileType, community, + evidenceTier: 0, score: hopScore, relevanceBand: hopDistances.get(nodeId) === 1 ? 'related' : 'peripheral', }) @@ -352,7 +402,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) } // Re-sort: seeds first by score, then neighbors by degree - scored.sort((a, b) => b.score - a.score || graph.degree(b.id) - graph.degree(a.id)) + scored.sort((a, b) => compareScoredNodes(graph, a, b)) // Step 4+5: Read snippets and assemble within budget const matchedNodes: RetrieveMatchedNode[] = [] diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts index ae2f612..6310104 100644 --- a/tests/unit/retrieve.test.ts +++ b/tests/unit/retrieve.test.ts @@ -138,6 +138,143 @@ describe('retrieve', () => { expect(labels).toContain('authenticateUser') }) + it('keeps direct symbol matches above path-only matches after structural boosts', () => { + const graph = new KnowledgeGraph() + graph.addNode('direct_symbol', { + label: 'LoginController', + source_file: '/src/controllers.ts', + line_number: 1, + node_kind: 'function', + file_type: 'code', + community: 0, + }) + graph.addNode('path_only', { + label: 'RenderPage', + source_file: '/src/login/handler.ts', + line_number: 2, + node_kind: 'function', + file_type: 'code', + community: 0, + }) + graph.addNode('guide_a', { + label: 'LoginHandlerGuideA', + source_file: '/docs/login-a.md', + line_number: 3, + node_kind: 'section', + file_type: 'document', + community: 0, + }) + graph.addNode('guide_b', { + label: 'LoginHandlerGuideB', + source_file: '/docs/login-b.md', + line_number: 4, + node_kind: 'section', + file_type: 'document', + community: 0, + }) + graph.addNode('guide_c', { + label: 'LoginHandlerGuideC', + source_file: '/docs/login-c.md', + line_number: 5, + node_kind: 'section', + file_type: 'document', + community: 0, + }) + graph.addNode('guide_d', { + label: 'LoginHandlerGuideD', + source_file: '/docs/login-d.md', + line_number: 6, + node_kind: 'section', + file_type: 'document', + community: 1, + }) + graph.addEdge('path_only', 'guide_a', { + relation: 'calls', + confidence: 'EXTRACTED', + source_file: '/src/login/handler.ts', + }) + graph.addEdge('path_only', 'guide_d', { + relation: 'calls', + confidence: 'EXTRACTED', + source_file: '/src/login/handler.ts', + }) + + const result = retrieveContext(graph, { question: 'login', budget: 5000, fileType: 'code' }) + + expect(result.matched_nodes.map((node) => node.label).slice(0, 2)).toEqual(['LoginController', 'RenderPage']) + expect(result.matched_nodes.find((node) => node.label === 'LoginController')?.relevance_band).toBe('direct') + expect(result.matched_nodes.find((node) => node.label === 'RenderPage')?.relevance_band).toBe('related') + }) + + it('keeps direct symbol matches above community-only matches after structural boosts', () => { + const graph = new KnowledgeGraph() + graph.addNode('direct_symbol', { + label: 'AuthGateway', + source_file: '/src/auth.ts', + line_number: 1, + node_kind: 'class', + file_type: 'code', + community: 0, + }) + graph.addNode('community_only', { + label: 'SessionCoordinator', + source_file: '/src/session.ts', + line_number: 2, + node_kind: 'class', + file_type: 'code', + community: 0, + }) + graph.addNode('guide_a', { + label: 'AuthGuideA', + source_file: '/docs/auth-a.md', + line_number: 3, + node_kind: 'section', + file_type: 'document', + community: 0, + }) + graph.addNode('guide_b', { + label: 'AuthGuideB', + source_file: '/docs/auth-b.md', + line_number: 4, + node_kind: 'section', + file_type: 'document', + community: 0, + }) + graph.addNode('guide_c', { + label: 'AuthGuideC', + source_file: '/docs/auth-c.md', + line_number: 5, + node_kind: 'section', + file_type: 'document', + community: 0, + }) + graph.addNode('guide_d', { + label: 'AuthGuideD', + source_file: '/docs/auth-d.md', + line_number: 6, + node_kind: 'section', + file_type: 'document', + community: 1, + }) + graph.addEdge('community_only', 'guide_a', { + relation: 'depends_on', + confidence: 'EXTRACTED', + source_file: '/src/session.ts', + }) + graph.addEdge('community_only', 'guide_d', { + relation: 'depends_on', + confidence: 'EXTRACTED', + source_file: '/src/session.ts', + }) + graph.graph.community_labels = { 0: 'Auth' } + + const result = retrieveContext(graph, { question: 'auth', budget: 5000, fileType: 'code' }) + + expect(result.matched_nodes.map((node) => node.label).slice(0, 2)).toEqual(['AuthGateway', 'SessionCoordinator']) + expect(result.matched_nodes.find((node) => node.label === 'AuthGateway')?.relevance_band).toBe('direct') + expect(result.matched_nodes.find((node) => node.label === 'SessionCoordinator')?.relevance_band).toBe('related') + }) + it('includes neighbors of matched nodes', () => { const graph = buildTestGraph() const result = retrieveContext(graph, { question: 'auth', budget: 5000 }) From bdcc7518973ea4bc9cf3bd0494765541ed1bac3b Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 13:34:07 +0400 Subject: [PATCH 02/18] feat: make retrieval expansion relation-aware Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/runtime/retrieve.ts | 76 +++++++++++----- tests/unit/retrieve.test.ts | 169 ++++++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+), 20 deletions(-) diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts index ca0083d..51cdc79 100644 --- a/src/runtime/retrieve.ts +++ b/src/runtime/retrieve.ts @@ -257,6 +257,20 @@ function scoreSeedCandidate( } } +function relationWeight(relation: string): number { + switch (relation) { + case 'calls': + case 'imports_from': + case 'defines': + return 1 + case 'uses': + case 'depends_on': + return 0.7 + default: + return 0.35 + } +} + export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions): RetrieveResult { const { question, budget } = options const questionTokens = tokenizeQuestion(question) @@ -325,36 +339,58 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) // Step 3: Multi-hop expansion — take top seeds, expand 2 hops with decaying scores const seedCount = Math.min(scored.length, 10) const seedIds = new Set(scored.slice(0, seedCount).map((node) => node.id)) - const directSeedIds = scored + const directSeeds = scored .filter((node) => node.relevanceBand === 'direct') - .slice(0, seedCount) - .map((node) => node.id) - const expansionSeedIds = new Set(directSeedIds.length > 0 ? directSeedIds : [...seedIds]) + .slice(0, 4) + const expansionSeedIds = new Set((directSeeds.length > 0 ? directSeeds : scored.slice(0, seedCount)).map((node) => node.id)) const hopScores = new Map() const hopDistances = new Map() + const hopEvidenceTiers = new Map() + const hop1Ids = new Set() - // Hop 1: direct neighbors get 0.5x of best seed score - const bestSeedScore = scored.length > 0 ? scored[0]?.score ?? 0 : 0 - for (const seedId of expansionSeedIds) { - for (const neighborId of graph.neighbors(seedId)) { + // Hop 1: direct neighbors inherit a relation-weighted slice of each strong seed's score. + for (const seed of directSeeds.length > 0 ? directSeeds : scored.slice(0, seedCount)) { + for (const neighborId of graph.neighbors(seed.id)) { if (!expansionSeedIds.has(neighborId)) { - hopScores.set(neighborId, Math.max(hopScores.get(neighborId) ?? 0, bestSeedScore * 0.5)) - hopDistances.set(neighborId, 1) + const relation = String(graph.edgeAttributes(seed.id, neighborId).relation ?? 'related_to') + const hopScore = seed.score * 0.5 * relationWeight(relation) + const hopEvidenceTier = relationWeight(relation) === 1 ? 1 : 0 + const existingHopScore = hopScores.get(neighborId) ?? 0 + const existingHopEvidenceTier = hopEvidenceTiers.get(neighborId) ?? 0 + if (hopScore > existingHopScore || (hopScore === existingHopScore && hopEvidenceTier > existingHopEvidenceTier)) { + hopScores.set(neighborId, hopScore) + hopDistances.set(neighborId, 1) + hopEvidenceTiers.set(neighborId, hopEvidenceTier) + } + hop1Ids.add(neighborId) } } } - // Hop 2: neighbors-of-neighbors get 0.25x (skip if budget is tight) + // Hop 2: neighbors-of-neighbors decay again, but keep this pool small and relation-aware. if (budget >= 2000) { - const hop1Ids = new Set(hopScores.keys()) + const hop2Scores = new Map() for (const hop1Id of hop1Ids) { + const hop1Score = hopScores.get(hop1Id) ?? 0 + if (hop1Score <= 0) continue for (const hop2Id of graph.neighbors(hop1Id)) { if (!seedIds.has(hop2Id) && !hop1Ids.has(hop2Id)) { - hopScores.set(hop2Id, Math.max(hopScores.get(hop2Id) ?? 0, bestSeedScore * 0.25)) - hopDistances.set(hop2Id, 2) + const relation = String(graph.edgeAttributes(hop1Id, hop2Id).relation ?? 'related_to') + const hop2Score = hop1Score * 0.5 * relationWeight(relation) + if (hop2Score > (hop2Scores.get(hop2Id) ?? 0)) { + hop2Scores.set(hop2Id, hop2Score) + } } } } + + const maxSecondHopAdds = budget >= 5000 ? 6 : 3 + for (const [hop2Id, hop2Score] of [...hop2Scores.entries()] + .sort(([leftId, leftScore], [rightId, rightScore]) => rightScore - leftScore || graph.degree(rightId) - graph.degree(leftId)) + .slice(0, maxSecondHopAdds)) { + hopScores.set(hop2Id, Math.max(hopScores.get(hop2Id) ?? 0, hop2Score)) + hopDistances.set(hop2Id, 2) + } } // Add expanded nodes not already scored @@ -382,7 +418,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) nodeKind: String(attributes.node_kind ?? ''), fileType, community, - evidenceTier: 0, + evidenceTier: hopDistances.get(nodeId) === 1 ? (hopEvidenceTiers.get(nodeId) ?? 0) : 0, score: hopScore, relevanceBand: hopDistances.get(nodeId) === 1 ? 'related' : 'peripheral', }) @@ -408,12 +444,12 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) const matchedNodes: RetrieveMatchedNode[] = [] const includedIds = new Set() let tokenCount = 0 + const inclusionOrder = [ + ...scored.filter((node) => (seedIds.has(node.id) || hopScores.has(node.id)) && node.relevanceBand !== 'peripheral'), + ...scored.filter((node) => (seedIds.has(node.id) || hopScores.has(node.id)) && node.relevanceBand === 'peripheral'), + ] - for (const node of scored) { - if (!seedIds.has(node.id) && !hopScores.has(node.id)) { - continue - } - + for (const node of inclusionOrder) { const snippet = readSnippet(node.sourceFile, node.lineNumber) const nodeText = `${node.label} ${node.sourceFile}:${node.lineNumber} ${snippet ?? ''}` const nodeTokens = estimateTokens(nodeText) diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts index 6310104..146a7e0 100644 --- a/tests/unit/retrieve.test.ts +++ b/tests/unit/retrieve.test.ts @@ -119,6 +119,153 @@ describe('retrieve', () => { return graph } + function buildExpansionGraph(): KnowledgeGraph { + const graph = new KnowledgeGraph() + + graph.addNode('auth_user', { + label: 'authenticateUser', + source_file: '/src/auth.ts', + line_number: 10, + node_kind: 'function', + file_type: 'code', + community: 0, + }) + graph.addNode('auth_flow_controller', { + label: 'AuthFlowController', + source_file: '/src/auth/flow-controller.ts', + line_number: 20, + node_kind: 'class', + file_type: 'code', + community: 0, + }) + graph.addNode('auth_guard', { + label: 'AuthGuard', + source_file: '/src/auth/guard.ts', + line_number: 30, + node_kind: 'class', + file_type: 'code', + community: 0, + }) + graph.addNode('auth_policy', { + label: 'AuthPolicy', + source_file: '/src/auth/policy.ts', + line_number: 40, + node_kind: 'class', + file_type: 'code', + community: 0, + }) + + graph.addNode('session_mgr', { + label: 'SessionManager', + source_file: '/src/session.ts', + line_number: 5, + node_kind: 'class', + file_type: 'code', + community: 2, + }) + graph.addNode('session_validator', { + label: 'SessionValidator', + source_file: '/src/session-validator.ts', + line_number: 6, + node_kind: 'class', + file_type: 'code', + community: 2, + }) + graph.addNode('session_router', { + label: 'SessionRouter', + source_file: '/src/session-router.ts', + line_number: 7, + node_kind: 'class', + file_type: 'code', + community: 2, + }) + graph.addNode('session_policy', { + label: 'SessionPolicy', + source_file: '/src/session-policy.ts', + line_number: 8, + node_kind: 'class', + file_type: 'code', + community: 2, + }) + + graph.addNode('billing_store', { + label: 'BillingStore', + source_file: '/src/billing.ts', + line_number: 9, + node_kind: 'class', + file_type: 'code', + community: 1, + }) + graph.addNode('billing_cache', { + label: 'BillingCache', + source_file: '/src/billing-cache.ts', + line_number: 10, + node_kind: 'class', + file_type: 'code', + community: 1, + }) + graph.addNode('invoice_ledger', { + label: 'InvoiceLedger', + source_file: '/src/invoice-ledger.ts', + line_number: 11, + node_kind: 'class', + file_type: 'code', + community: 1, + }) + graph.addNode('tax_rules', { + label: 'TaxRules', + source_file: '/src/tax-rules.ts', + line_number: 12, + node_kind: 'class', + file_type: 'code', + community: 1, + }) + + graph.addEdge('auth_user', 'session_mgr', { relation: 'calls', confidence: 'EXTRACTED', source_file: '/src/auth.ts' }) + graph.addEdge('auth_flow_controller', 'session_validator', { + relation: 'imports_from', + confidence: 'EXTRACTED', + source_file: '/src/auth/flow-controller.ts', + }) + graph.addEdge('auth_guard', 'session_router', { + relation: 'calls', + confidence: 'EXTRACTED', + source_file: '/src/auth/guard.ts', + }) + graph.addEdge('auth_policy', 'session_policy', { + relation: 'defines', + confidence: 'EXTRACTED', + source_file: '/src/auth/policy.ts', + }) + graph.addEdge('auth_guard', 'billing_store', { + relation: 'depends_on', + confidence: 'EXTRACTED', + source_file: '/src/auth/guard.ts', + }) + graph.addEdge('billing_store', 'billing_cache', { + relation: 'depends_on', + confidence: 'EXTRACTED', + source_file: '/src/billing.ts', + }) + graph.addEdge('billing_store', 'invoice_ledger', { + relation: 'uses', + confidence: 'EXTRACTED', + source_file: '/src/billing.ts', + }) + graph.addEdge('billing_store', 'tax_rules', { + relation: 'uses', + confidence: 'EXTRACTED', + source_file: '/src/billing.ts', + }) + graph.graph.community_labels = { + 0: 'Authentication', + 1: 'Billing', + 2: 'Session', + } + + return graph + } + it('returns empty result for no matching tokens', () => { const graph = buildTestGraph() const result = retrieveContext(graph, { question: 'how does the', budget: 5000 }) @@ -304,6 +451,28 @@ describe('retrieve', () => { expect(community0).toBeDefined() }) + it('prefers calls and imports edges over generic second-hop expansion', () => { + const graph = buildExpansionGraph() + + const result = retrieveContext(graph, { question: 'auth', budget: 5000 }) + const labels = result.matched_nodes.map((node) => node.label) + + expect(labels.indexOf('SessionManager')).toBeLessThan(labels.indexOf('BillingStore')) + }) + + it('avoids promoting weak peripheral nodes when budget is tight', () => { + const graph = buildExpansionGraph() + + const result = retrieveContext(graph, { question: 'auth flow', budget: 80 }) + + expect(result.matched_nodes.map((node) => node.label)).toEqual( + expect.arrayContaining(['authenticateUser']), + ) + expect(result.matched_nodes).not.toEqual( + expect.arrayContaining([expect.objectContaining({ label: 'BillingStore' })]), + ) + }) + it('respects community filter', () => { const graph = buildTestGraph() const result = retrieveContext(graph, { question: 'database', budget: 5000, community: 1 }) From 286db3902e288e0b6ce125c053871b8f99a80177 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 13:44:31 +0400 Subject: [PATCH 03/18] test: assert real second-hop retrieval nodes Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unit/retrieve.test.ts | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts index 146a7e0..50542e1 100644 --- a/tests/unit/retrieve.test.ts +++ b/tests/unit/retrieve.test.ts @@ -457,20 +457,28 @@ describe('retrieve', () => { const result = retrieveContext(graph, { question: 'auth', budget: 5000 }) const labels = result.matched_nodes.map((node) => node.label) - expect(labels.indexOf('SessionManager')).toBeLessThan(labels.indexOf('BillingStore')) + expect(labels).toContain('SessionValidator') + expect(labels).toContain('SessionRouter') + expect(labels).toContain('SessionManager') + expect(labels).toContain('BillingCache') + expect(labels).toContain('InvoiceLedger') + expect(labels).toContain('TaxRules') + expect(labels.indexOf('SessionValidator')).toBeLessThan(labels.indexOf('BillingCache')) + expect(labels.indexOf('SessionRouter')).toBeLessThan(labels.indexOf('InvoiceLedger')) + expect(labels.indexOf('SessionManager')).toBeLessThan(labels.indexOf('TaxRules')) + expect(result.matched_nodes.find((node) => node.label === 'BillingCache')?.relevance_band).toBe('peripheral') }) it('avoids promoting weak peripheral nodes when budget is tight', () => { const graph = buildExpansionGraph() const result = retrieveContext(graph, { question: 'auth flow', budget: 80 }) + const labels = result.matched_nodes.map((node) => node.label) - expect(result.matched_nodes.map((node) => node.label)).toEqual( - expect.arrayContaining(['authenticateUser']), - ) - expect(result.matched_nodes).not.toEqual( - expect.arrayContaining([expect.objectContaining({ label: 'BillingStore' })]), - ) + expect(labels).toEqual(expect.arrayContaining(['authenticateUser'])) + expect(labels).not.toContain('BillingCache') + expect(labels).not.toContain('InvoiceLedger') + expect(labels).not.toContain('TaxRules') }) it('respects community filter', () => { From 2672fb14766259aa4c9e67ecd53e1400a21dc6a3 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 14:00:44 +0400 Subject: [PATCH 04/18] test: add retrieval quality guardrails Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/infrastructure/benchmark/quality.ts | 12 ++++++++ src/runtime/retrieve.ts | 40 +++++++++++++++++++++++-- tests/unit/benchmark-quality.test.ts | 23 ++++++++++++++ 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/src/infrastructure/benchmark/quality.ts b/src/infrastructure/benchmark/quality.ts index eb5d4cd..680e4b2 100644 --- a/src/infrastructure/benchmark/quality.ts +++ b/src/infrastructure/benchmark/quality.ts @@ -55,6 +55,18 @@ export const GOLD_QUESTIONS: GoldQuestion[] = [ question: 'how does the retrieve MCP tool find relevant nodes', expected_labels: ['retrievecontext', 'scorenode'], }, + { + question: 'retrieveContext', + expected_labels: ['retrievecontext'], + }, + { + question: 'how does retrieveContext build community labels', + expected_labels: ['retrievecontext', 'buildcommunitylabels'], + }, + { + question: 'scoreNode', + expected_labels: ['scorenode'], + }, { question: 'how does javascript extraction work', expected_labels: ['extractjs', 'extractionnode'], diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts index 51cdc79..64458fd 100644 --- a/src/runtime/retrieve.ts +++ b/src/runtime/retrieve.ts @@ -206,6 +206,8 @@ interface ScoredNode { nodeKind: string fileType: string community: number | null + exactLabelMatch: boolean + sourcePathMatch: boolean evidenceTier: 0 | 1 | 2 score: number relevanceBand: 'direct' | 'related' | 'peripheral' @@ -263,6 +265,8 @@ function relationWeight(relation: string): number { case 'imports_from': case 'defines': return 1 + case 'contains': + return 1.2 case 'uses': case 'depends_on': return 0.7 @@ -271,6 +275,10 @@ function relationWeight(relation: string): number { } } +function isPrimaryExpansionRelation(relation: string): boolean { + return relation === 'calls' || relation === 'imports_from' || relation === 'defines' || relation === 'contains' +} + export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions): RetrieveResult { const { question, budget } = options const questionTokens = tokenizeQuestion(question) @@ -327,6 +335,8 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) nodeKind: String(attributes.node_kind ?? ''), fileType, community, + exactLabelMatch: score.labelExactScore > 0, + sourcePathMatch: score.sourcePathScore > 0, evidenceTier: evidenceTierForSeedScore(score), score: score.total, relevanceBand: score.labelExactScore > 0 || score.labelTokenScore > 0 ? 'direct' : 'related', @@ -338,6 +348,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) // Step 3: Multi-hop expansion — take top seeds, expand 2 hops with decaying scores const seedCount = Math.min(scored.length, 10) + const hasExactSeedMatch = scored.some((node) => node.exactLabelMatch) const seedIds = new Set(scored.slice(0, seedCount).map((node) => node.id)) const directSeeds = scored .filter((node) => node.relevanceBand === 'direct') @@ -354,7 +365,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) if (!expansionSeedIds.has(neighborId)) { const relation = String(graph.edgeAttributes(seed.id, neighborId).relation ?? 'related_to') const hopScore = seed.score * 0.5 * relationWeight(relation) - const hopEvidenceTier = relationWeight(relation) === 1 ? 1 : 0 + const hopEvidenceTier = isPrimaryExpansionRelation(relation) ? 1 : 0 const existingHopScore = hopScores.get(neighborId) ?? 0 const existingHopEvidenceTier = hopEvidenceTiers.get(neighborId) ?? 0 if (hopScore > existingHopScore || (hopScore === existingHopScore && hopEvidenceTier > existingHopEvidenceTier)) { @@ -367,8 +378,31 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) } } + for (const node of scored) { + const hopScore = hopScores.get(node.id) + if (!hopScore) { + continue + } + + node.score += hopScore + const hopEvidenceTier = hopEvidenceTiers.get(node.id) ?? 0 + if (node.sourcePathMatch && hopEvidenceTier > 0) { + node.evidenceTier = 2 + node.relevanceBand = 'direct' + node.score += 0.5 + continue + } + + if (hopEvidenceTier > node.evidenceTier) { + node.evidenceTier = hopEvidenceTier + if (node.relevanceBand === 'peripheral') { + node.relevanceBand = 'related' + } + } + } + // Hop 2: neighbors-of-neighbors decay again, but keep this pool small and relation-aware. - if (budget >= 2000) { + if (budget >= 2000 && !hasExactSeedMatch) { const hop2Scores = new Map() for (const hop1Id of hop1Ids) { const hop1Score = hopScores.get(hop1Id) ?? 0 @@ -418,6 +452,8 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) nodeKind: String(attributes.node_kind ?? ''), fileType, community, + exactLabelMatch: false, + sourcePathMatch: false, evidenceTier: hopDistances.get(nodeId) === 1 ? (hopEvidenceTiers.get(nodeId) ?? 0) : 0, score: hopScore, relevanceBand: hopDistances.get(nodeId) === 1 ? 'related' : 'peripheral', diff --git a/tests/unit/benchmark-quality.test.ts b/tests/unit/benchmark-quality.test.ts index e6e34ad..258bcf3 100644 --- a/tests/unit/benchmark-quality.test.ts +++ b/tests/unit/benchmark-quality.test.ts @@ -63,6 +63,29 @@ describe('retrieval quality benchmark', () => { expect(report.questions[0]!.missing_labels).toEqual([]) }) + it('raises reciprocal rank when the expected direct node appears before supporting context', () => { + const graph = buildTestGraph() + const questions: GoldQuestion[] = [ + { question: 'how does authentication work', expected_labels: ['loginhandler'] }, + ] + + const report = evaluateRetrievalQuality(graph, questions, 3000) + + expect(report.mrr).toBe(1) + }) + + it('keeps recall while reducing unnecessary returned labels for narrow symbol queries', () => { + const graph = buildTestGraph() + const report = evaluateRetrievalQuality( + graph, + [{ question: 'login handler', expected_labels: ['loginhandler'] }], + 3000, + ) + + expect(report.questions[0]?.recall).toBe(1) + expect(report.questions[0]?.returned_labels.length).toBeLessThanOrEqual(3) + }) + it('reports zero recall when no expected labels match', () => { const graph = buildTestGraph() const questions: GoldQuestion[] = [{ question: 'quantum entanglement physics', expected_labels: ['quantumprocessor'] }] From 0ba24d0ab86fe16adf3961d6523d5b547fe939fa Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 14:20:44 +0400 Subject: [PATCH 05/18] feat: improve retrieval ranking quality Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d83967..ccc05a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to the TypeScript package will be documented in this file. ## [Unreleased] +### Improved + +- **Retrieval quality**: improved retrieval ranking with relation-aware expansion so connected evidence surfaces more effectively, and strengthened recall/MRR eval guardrails to prevent misleading benchmark results + ## [0.8.7] - 2026-04-27 ### Changed From 8b7492bdac97178c7097c9ea6cfbb0a5a7699bc2 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 14:42:45 +0400 Subject: [PATCH 06/18] Fix directed retrieval expansion Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/runtime/retrieve.ts | 20 ++++++++++++++++---- tests/unit/retrieve.test.ts | 30 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts index 64458fd..ff42184 100644 --- a/src/runtime/retrieve.ts +++ b/src/runtime/retrieve.ts @@ -275,6 +275,18 @@ function relationWeight(relation: string): number { } } +function relationBetweenNodes(graph: KnowledgeGraph, source: string, target: string): string { + try { + return String(graph.edgeAttributes(source, target).relation ?? 'related_to') + } catch { + try { + return String(graph.edgeAttributes(target, source).relation ?? 'related_to') + } catch { + return 'related_to' + } + } +} + function isPrimaryExpansionRelation(relation: string): boolean { return relation === 'calls' || relation === 'imports_from' || relation === 'defines' || relation === 'contains' } @@ -361,9 +373,9 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) // Hop 1: direct neighbors inherit a relation-weighted slice of each strong seed's score. for (const seed of directSeeds.length > 0 ? directSeeds : scored.slice(0, seedCount)) { - for (const neighborId of graph.neighbors(seed.id)) { + for (const neighborId of graph.incidentNeighbors(seed.id)) { if (!expansionSeedIds.has(neighborId)) { - const relation = String(graph.edgeAttributes(seed.id, neighborId).relation ?? 'related_to') + const relation = relationBetweenNodes(graph, seed.id, neighborId) const hopScore = seed.score * 0.5 * relationWeight(relation) const hopEvidenceTier = isPrimaryExpansionRelation(relation) ? 1 : 0 const existingHopScore = hopScores.get(neighborId) ?? 0 @@ -407,9 +419,9 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) for (const hop1Id of hop1Ids) { const hop1Score = hopScores.get(hop1Id) ?? 0 if (hop1Score <= 0) continue - for (const hop2Id of graph.neighbors(hop1Id)) { + for (const hop2Id of graph.incidentNeighbors(hop1Id)) { if (!seedIds.has(hop2Id) && !hop1Ids.has(hop2Id)) { - const relation = String(graph.edgeAttributes(hop1Id, hop2Id).relation ?? 'related_to') + const relation = relationBetweenNodes(graph, hop1Id, hop2Id) const hop2Score = hop1Score * 0.5 * relationWeight(relation) if (hop2Score > (hop2Scores.get(hop2Id) ?? 0)) { hop2Scores.set(hop2Id, hop2Score) diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts index 50542e1..a9cc965 100644 --- a/tests/unit/retrieve.test.ts +++ b/tests/unit/retrieve.test.ts @@ -432,6 +432,36 @@ describe('retrieve', () => { expect(labels).toContain('SessionManager') }) + it('includes predecessors of matched nodes in directed graphs', () => { + const graph = new KnowledgeGraph({ directed: true }) + graph.addNode('caller', { + label: 'CallerService', + source_file: '/src/caller.ts', + line_number: 1, + node_kind: 'function', + file_type: 'code', + }) + graph.addNode('target', { + label: 'TargetHandler', + source_file: '/src/target.ts', + line_number: 2, + node_kind: 'function', + file_type: 'code', + }) + graph.addEdge('caller', 'target', { + relation: 'calls', + confidence: 'EXTRACTED', + source_file: '/src/caller.ts', + }) + + const result = retrieveContext(graph, { question: 'target', budget: 5000 }) + const labels = result.matched_nodes.map((node) => node.label) + + expect(labels).toContain('TargetHandler') + expect(labels).toContain('CallerService') + expect(result.matched_nodes.find((node) => node.label === 'CallerService')?.relevance_band).toBe('related') + }) + it('includes relationships between matched nodes', () => { const graph = buildTestGraph() const result = retrieveContext(graph, { question: 'auth', budget: 5000 }) From 74f6aedf33618d50b4d3ab148b954b73387bc724 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 20:28:51 +0400 Subject: [PATCH 07/18] chore: port compare usage baseline Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/infrastructure/compare.ts | 192 ++++++++++++++++++++++++++++++++-- tests/unit/compare.test.ts | 125 ++++++++++++++++++++++ 2 files changed, 311 insertions(+), 6 deletions(-) diff --git a/src/infrastructure/compare.ts b/src/infrastructure/compare.ts index 58a5f33..fcd2ec3 100644 --- a/src/infrastructure/compare.ts +++ b/src/infrastructure/compare.ts @@ -15,6 +15,7 @@ export type CompareBaselineMode = 'full' | 'bounded' export type CompareRunMode = 'baseline' | 'graphify' export type CompareRunStatus = 'not_run' | 'succeeded' | 'failed' | 'context_overflow' export type CompareFailureReason = 'prompt_too_long' | 'runner_error' | 'exec_error' +export type ComparePromptTokenSource = 'estimated_cl100k_base' | 'claude_reported_input' export interface ComparePromptPack { kind: 'baseline' | 'graphify' @@ -60,6 +61,17 @@ export interface ComparePromptTokenEstimator { exact: boolean } +export interface ComparePromptUsage { + provider: 'claude' + source: 'structured_stdout' + input_tokens: number + output_tokens: number + cache_creation_input_tokens: number + cache_read_input_tokens: number + input_total_tokens: number + total_tokens: number +} + export interface ComparePromptReport { question: string graph_path: string @@ -68,10 +80,21 @@ export interface ComparePromptReport { baseline_prompt_tokens: number graphify_prompt_tokens: number reduction_ratio: number + baseline_total_tokens: number | null + graphify_total_tokens: number | null + total_reduction_ratio: number | null baseline_prompt_tokens_estimated: number graphify_prompt_tokens_estimated: number reduction_ratio_estimated: number prompt_token_estimator: ComparePromptTokenEstimator + prompt_token_source: { + baseline: ComparePromptTokenSource + graphify: ComparePromptTokenSource + } + usage: { + baseline: ComparePromptUsage | null + graphify: ComparePromptUsage | null + } started_at: string completed_at: string elapsed_ms: { @@ -149,6 +172,11 @@ export interface ExecuteCompareRunsDependencies { now?: () => Date } +interface ParsedCompareRunnerOutput { + answerText: string + usage: ComparePromptUsage | null +} + const DEFAULT_RETRIEVAL_BUDGET = 3_000 const DEFAULT_BOUNDED_BASELINE_TOKENS = 4_000 const EXEC_TEMPLATE_PLACEHOLDER_PATTERN = /\{[a-z_][a-z0-9_]*\}/gi @@ -179,6 +207,80 @@ function summarizeExecTemplate(execTemplate: string): CompareExecCommandSummary } } +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null +} + +function parseNonNegativeNumber(value: unknown): number | null { + return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : null +} + +function parseStructuredCompareAnswer(payload: Record): string | null { + if (typeof payload.result === 'string') { + return payload.result + } + if (typeof payload.completion === 'string') { + return payload.completion + } + return null +} + +function parseClaudeStructuredUsage(payload: Record): ComparePromptUsage | null { + if (!isRecord(payload.usage)) { + return null + } + + const inputTokens = parseNonNegativeNumber(payload.usage.input_tokens) + const outputTokens = parseNonNegativeNumber(payload.usage.output_tokens) + if (inputTokens === null || outputTokens === null) { + return null + } + + const cacheCreationInputTokens = parseNonNegativeNumber(payload.usage.cache_creation_input_tokens) ?? 0 + const cacheReadInputTokens = parseNonNegativeNumber(payload.usage.cache_read_input_tokens) ?? 0 + const inputTotalTokens = inputTokens + cacheCreationInputTokens + cacheReadInputTokens + + return { + provider: 'claude', + source: 'structured_stdout', + input_tokens: inputTokens, + output_tokens: outputTokens, + cache_creation_input_tokens: cacheCreationInputTokens, + cache_read_input_tokens: cacheReadInputTokens, + input_total_tokens: inputTotalTokens, + total_tokens: inputTotalTokens + outputTokens, + } +} + +function parseStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput | null { + const trimmed = stdout.trim() + if (!trimmed.startsWith('{') || !trimmed.endsWith('}')) { + return null + } + + let payload: unknown + try { + payload = JSON.parse(trimmed) + } catch { + return null + } + + if (!isRecord(payload)) { + return null + } + + const answerText = parseStructuredCompareAnswer(payload) + const usage = parseClaudeStructuredUsage(payload) + if (answerText === null && usage === null) { + return null + } + + return { + answerText: answerText ?? stdout, + usage, + } +} + function validateCompareExecTemplate(template: string): void { if (PROMPT_FILE_COMMAND_SUBSTITUTION_PATTERNS.some((pattern) => pattern.test(template))) { throw new Error( @@ -413,6 +515,33 @@ function computeReductionRatio(baselinePromptTokens: number, graphifyPromptToken return Number((baselinePromptTokens / graphifyPromptTokens).toFixed(1)) } +function formatTokenComparison(baselineTokens: number, graphifyTokens: number): string { + if (baselineTokens <= 0 || graphifyTokens <= 0) { + return 'n/a' + } + if (baselineTokens === graphifyTokens) { + return 'same size' + } + if (baselineTokens > graphifyTokens) { + return `${computeReductionRatio(baselineTokens, graphifyTokens)}x smaller` + } + return `${Number((graphifyTokens / baselineTokens).toFixed(1))}x larger` +} + +function syncComparePromptMetrics(report: ComparePromptReport): void { + report.baseline_prompt_tokens = report.usage.baseline?.input_total_tokens ?? report.baseline_prompt_tokens_estimated + report.graphify_prompt_tokens = report.usage.graphify?.input_total_tokens ?? report.graphify_prompt_tokens_estimated + report.reduction_ratio = computeReductionRatio(report.baseline_prompt_tokens, report.graphify_prompt_tokens) + report.baseline_total_tokens = report.usage.baseline?.total_tokens ?? null + report.graphify_total_tokens = report.usage.graphify?.total_tokens ?? null + report.total_reduction_ratio = + report.baseline_total_tokens !== null && report.graphify_total_tokens !== null + ? computeReductionRatio(report.baseline_total_tokens, report.graphify_total_tokens) + : null + report.prompt_token_source.baseline = report.usage.baseline === null ? 'estimated_cl100k_base' : 'claude_reported_input' + report.prompt_token_source.graphify = report.usage.graphify === null ? 'estimated_cl100k_base' : 'claude_reported_input' +} + function portablePath(path: string): string { return relative(process.cwd(), path) || '.' } @@ -756,10 +885,21 @@ export function generateCompareArtifacts(input: GenerateCompareArtifactsInput): baseline_prompt_tokens: baselinePromptTokens, graphify_prompt_tokens: graphifyPromptTokens, reduction_ratio: computeReductionRatio(baselinePromptTokens, graphifyPromptTokens), + baseline_total_tokens: null, + graphify_total_tokens: null, + total_reduction_ratio: null, baseline_prompt_tokens_estimated: baselinePromptTokens, graphify_prompt_tokens_estimated: graphifyPromptTokens, reduction_ratio_estimated: computeReductionRatio(baselinePromptTokens, graphifyPromptTokens), prompt_token_estimator: QUERY_TOKEN_ESTIMATOR, + prompt_token_source: { + baseline: 'estimated_cl100k_base', + graphify: 'estimated_cl100k_base', + }, + usage: { + baseline: null, + graphify: null, + }, started_at: now.toISOString(), completed_at: now.toISOString(), elapsed_ms: { @@ -790,6 +930,7 @@ export function generateCompareArtifacts(input: GenerateCompareArtifactsInput): paths, } + syncComparePromptMetrics(report) writeCompareReport(report) return report }) @@ -841,9 +982,11 @@ export async function executeCompareRuns( question: report.question, command, }) - ensureCompareAnswerFile(execution.outputFile, executionResult.stdout) + const parsedOutput = parseStructuredCompareRunnerOutput(executionResult.stdout) + ensureCompareAnswerFile(execution.outputFile, parsedOutput?.answerText ?? executionResult.stdout) const contextOverflowEvidence = executionResult.exitCode === 0 ? null : extractContextOverflowEvidence(executionResult.stdout, executionResult.stderr) + report.usage[execution.mode] = executionResult.exitCode === 0 ? parsedOutput?.usage ?? null : null report.status[execution.mode] = executionResult.exitCode === 0 ? 'succeeded' : contextOverflowEvidence !== null ? 'context_overflow' : 'failed' report.elapsed_ms[execution.mode] = executionResult.elapsedMs @@ -854,6 +997,7 @@ export async function executeCompareRuns( report.evidence[execution.mode] = contextOverflowEvidence } catch (error) { ensureCompareAnswerFile(execution.outputFile, '') + report.usage[execution.mode] = null const errorMessage = error instanceof Error ? error.message : String(error) const contextOverflowEvidence = extractContextOverflowEvidence(errorMessage) report.status[execution.mode] = contextOverflowEvidence !== null ? 'context_overflow' : 'failed' @@ -864,6 +1008,7 @@ export async function executeCompareRuns( report.evidence[execution.mode] = contextOverflowEvidence } + syncComparePromptMetrics(report) report.completed_at = now().toISOString() writeCompareReport(report) } @@ -876,6 +1021,18 @@ function sumPromptTokens(reports: readonly ComparePromptReport[], mode: CompareR return reports.reduce((total, report) => total + (mode === 'baseline' ? report.baseline_prompt_tokens : report.graphify_prompt_tokens), 0) } +function sumTotalTokens(reports: readonly ComparePromptReport[], mode: CompareRunMode): number | null { + let total = 0 + for (const report of reports) { + const value = mode === 'baseline' ? report.baseline_total_tokens : report.graphify_total_tokens + if (value === null) { + return null + } + total += value + } + return total +} + function countPromptRuns(reports: readonly ComparePromptReport[], status: Exclude): number { return reports.reduce((total, report) => { const baseline = report.status.baseline === status ? 1 : 0 @@ -884,22 +1041,45 @@ function countPromptRuns(reports: readonly ComparePromptReport[], status: Exclud }, 0) } +function countPromptUsageRuns(reports: readonly ComparePromptReport[]): number { + return reports.reduce((total, report) => total + (report.usage.baseline === null ? 0 : 1) + (report.usage.graphify === null ? 0 : 1), 0) +} + export function formatCompareSummary(result: GenerateCompareArtifactsResult): string { const baselineTokens = sumPromptTokens(result.reports, 'baseline') const graphifyTokens = sumPromptTokens(result.reports, 'graphify') - const reductionRatio = computeReductionRatio(baselineTokens, graphifyTokens) + const baselineTotalTokens = sumTotalTokens(result.reports, 'baseline') + const graphifyTotalTokens = sumTotalTokens(result.reports, 'graphify') + const totalReductionRatio = + baselineTotalTokens !== null && graphifyTotalTokens !== null ? computeReductionRatio(baselineTotalTokens, graphifyTotalTokens) : null const failedRuns = countPromptRuns(result.reports, 'failed') const contextOverflowRuns = countPromptRuns(result.reports, 'context_overflow') const succeededRuns = countPromptRuns(result.reports, 'succeeded') - - return [ + const usageRuns = countPromptUsageRuns(result.reports) + const totalRuns = result.reports.length * 2 + const promptTokenLabel = + usageRuns === totalRuns + ? 'Input tokens (Claude reported)' + : usageRuns > 0 + ? `Input tokens (Claude reported where available; ${QUERY_TOKEN_ESTIMATOR.model} estimate fallback)` + : `Prompt tokens (estimated ${QUERY_TOKEN_ESTIMATOR.model})` + + const lines = [ `[graphify compare] completed ${result.reports.length} question(s)`, `- Output: ${result.output_root}`, - `- Prompt tokens (estimated ${QUERY_TOKEN_ESTIMATOR.model}): baseline ${baselineTokens} · graphify ${graphifyTokens} · ${reductionRatio}x smaller`, + `- ${promptTokenLabel}: baseline ${baselineTokens} · graphify ${graphifyTokens} · ${formatTokenComparison(baselineTokens, graphifyTokens)}`, `- Prompt runs: ${succeededRuns} succeeded${contextOverflowRuns > 0 ? ` · ${contextOverflowRuns} context overflow` : ''}${ failedRuns > 0 ? ` · ${failedRuns} failed` : '' }`, - ].join('\n') + ] + + if (baselineTotalTokens !== null && graphifyTotalTokens !== null && totalReductionRatio !== null) { + lines.splice(3, 0, `- Total tokens (Claude reported): baseline ${baselineTotalTokens} · graphify ${graphifyTotalTokens} · ${formatTokenComparison(baselineTotalTokens, graphifyTotalTokens)}`) + } else if (usageRuns > 0 && usageRuns < totalRuns) { + lines.splice(3, 0, `- Usage capture: Claude reported usage for ${usageRuns}/${totalRuns} prompt runs; remaining runs used local estimate fallback`) + } + + return lines.join('\n') } export async function runCompareCommand( diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts index f83e079..a8e9e2d 100644 --- a/tests/unit/compare.test.ts +++ b/tests/unit/compare.test.ts @@ -611,6 +611,131 @@ describe('compare runtime', () => { ) }) + it('captures Claude-reported usage from structured runner output and saves plain answers', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async (execution) => ({ + exitCode: 0, + stdout: JSON.stringify({ + type: 'result', + subtype: 'success', + result: `${execution.mode} answer\n`, + usage: + execution.mode === 'baseline' + ? { + input_tokens: 1200, + output_tokens: 90, + cache_creation_input_tokens: 100, + cache_read_input_tokens: 20, + } + : { + input_tokens: 400, + output_tokens: 70, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 10, + }, + }), + stderr: '', + elapsedMs: execution.mode === 'baseline' ? 11 : 17, + }), + }, + ) + + const report = result.reports[0]! + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('baseline answer\n') + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n') + expect(report.baseline_prompt_tokens).toBe(1320) + expect(report.graphify_prompt_tokens).toBe(410) + expect(report.prompt_token_source).toEqual({ + baseline: 'claude_reported_input', + graphify: 'claude_reported_input', + }) + expect(report.usage).toEqual({ + baseline: { + provider: 'claude', + source: 'structured_stdout', + input_tokens: 1200, + output_tokens: 90, + cache_creation_input_tokens: 100, + cache_read_input_tokens: 20, + input_total_tokens: 1320, + total_tokens: 1410, + }, + graphify: { + provider: 'claude', + source: 'structured_stdout', + input_tokens: 400, + output_tokens: 70, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 10, + input_total_tokens: 410, + total_tokens: 480, + }, + }) + expect(report.baseline_total_tokens).toBe(1410) + expect(report.graphify_total_tokens).toBe(480) + expect(formatCompareSummary(result)).toContain('Input tokens (Claude reported): baseline 1320 · graphify 410') + expect(formatCompareSummary(result)).toContain('Total tokens (Claude reported): baseline 1410 · graphify 480') + }) + + it('reports when graphify uses more Claude-reported tokens than the baseline', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async (execution) => ({ + exitCode: 0, + stdout: JSON.stringify({ + type: 'result', + subtype: 'success', + result: `${execution.mode} answer\n`, + usage: + execution.mode === 'baseline' + ? { + input_tokens: 300, + output_tokens: 50, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + } + : { + input_tokens: 500, + output_tokens: 80, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + }, + }), + stderr: '', + elapsedMs: 1, + }), + }, + ) + + expect(formatCompareSummary(result)).toContain('Input tokens (Claude reported): baseline 300 · graphify 500 · 1.7x larger') + expect(formatCompareSummary(result)).toContain('Total tokens (Claude reported): baseline 350 · graphify 580 · 1.7x larger') + }) + it('preserves partial compare results when one side fails', async () => { const graph = makeGraph() writeProjectFiles() From b0312e1a559f4c9eba938348b90724311aede67f Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 20:38:22 +0400 Subject: [PATCH 08/18] fix: avoid JSON answer artifact fallback Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/infrastructure/compare.ts | 9 ++-- tests/unit/compare.test.ts | 77 +++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/infrastructure/compare.ts b/src/infrastructure/compare.ts index fcd2ec3..58f12cf 100644 --- a/src/infrastructure/compare.ts +++ b/src/infrastructure/compare.ts @@ -173,7 +173,7 @@ export interface ExecuteCompareRunsDependencies { } interface ParsedCompareRunnerOutput { - answerText: string + answerText: string | null usage: ComparePromptUsage | null } @@ -276,7 +276,7 @@ function parseStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunner } return { - answerText: answerText ?? stdout, + answerText, usage, } } @@ -983,7 +983,10 @@ export async function executeCompareRuns( command, }) const parsedOutput = parseStructuredCompareRunnerOutput(executionResult.stdout) - ensureCompareAnswerFile(execution.outputFile, parsedOutput?.answerText ?? executionResult.stdout) + ensureCompareAnswerFile( + execution.outputFile, + parsedOutput === null ? executionResult.stdout : parsedOutput.answerText ?? '', + ) const contextOverflowEvidence = executionResult.exitCode === 0 ? null : extractContextOverflowEvidence(executionResult.stdout, executionResult.stderr) report.usage[execution.mode] = executionResult.exitCode === 0 ? parsedOutput?.usage ?? null : null diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts index a8e9e2d..cd4514c 100644 --- a/tests/unit/compare.test.ts +++ b/tests/unit/compare.test.ts @@ -690,6 +690,83 @@ describe('compare runtime', () => { expect(formatCompareSummary(result)).toContain('Total tokens (Claude reported): baseline 1410 · graphify 480') }) + it('does not write structured stdout JSON into answer artifacts when usage is present without answer text', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async () => ({ + exitCode: 0, + stdout: JSON.stringify({ + type: 'result', + subtype: 'success', + usage: { + input_tokens: 1200, + output_tokens: 90, + cache_creation_input_tokens: 100, + cache_read_input_tokens: 20, + }, + }), + stderr: '', + elapsedMs: 11, + }), + }, + ) + + const report = result.reports[0]! + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('') + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('') + expect(report.usage.baseline?.total_tokens).toBe(1410) + expect(report.usage.graphify?.total_tokens).toBe(1410) + }) + + it('falls back to raw stdout for unrecognized structured JSON output', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const stdout = JSON.stringify({ + type: 'result', + subtype: 'success', + message: 'runner emitted raw JSON without parsed answer metadata', + }) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async () => ({ + exitCode: 0, + stdout, + stderr: '', + elapsedMs: 11, + }), + }, + ) + + const report = result.reports[0]! + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe(stdout) + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe(stdout) + expect(report.usage.baseline).toBeNull() + expect(report.usage.graphify).toBeNull() + }) + it('reports when graphify uses more Claude-reported tokens than the baseline', async () => { const graph = makeGraph() writeProjectFiles() From a3f703b53c1a8ce42545602b676b8d854f82b4b3 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 20:49:39 +0400 Subject: [PATCH 09/18] test: cover Gemini compare usage capture Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unit/compare.test.ts | 128 +++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts index cd4514c..8aad476 100644 --- a/tests/unit/compare.test.ts +++ b/tests/unit/compare.test.ts @@ -767,6 +767,134 @@ describe('compare runtime', () => { expect(report.usage.graphify).toBeNull() }) + it('captures Gemini-reported usage from structured runner output and saves plain answers', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async (execution) => ({ + exitCode: 0, + stdout: JSON.stringify({ + candidates: [ + { + content: { + parts: [{ text: `${execution.mode} answer\n` }], + }, + }, + ], + usageMetadata: { + promptTokenCount: 400, + candidatesTokenCount: 80, + totalTokenCount: 480, + }, + }), + stderr: '', + elapsedMs: execution.mode === 'baseline' ? 11 : 17, + }), + }, + ) + + const report = result.reports[0]! + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('baseline answer\n') + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n') + expect(report.usage.baseline).toEqual( + expect.objectContaining({ + provider: 'gemini', + input_tokens: 400, + output_tokens: 80, + total_tokens: 480, + }), + ) + expect(report.usage.graphify).toEqual( + expect.objectContaining({ + provider: 'gemini', + input_tokens: 400, + output_tokens: 80, + total_tokens: 480, + }), + ) + + const savedReport = JSON.parse(readFileSync(report.paths.report, 'utf8')) as { + usage: { + baseline: Record | null + graphify: Record | null + } + } + expect(savedReport.usage.baseline).toEqual( + expect.objectContaining({ + provider: 'gemini', + input_tokens: 400, + output_tokens: 80, + total_tokens: 480, + }), + ) + expect(savedReport.usage.graphify).toEqual( + expect.objectContaining({ + provider: 'gemini', + input_tokens: 400, + output_tokens: 80, + total_tokens: 480, + }), + ) + }) + + it('promotes Gemini-reported input and total tokens into compare summaries', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async (execution) => ({ + exitCode: 0, + stdout: JSON.stringify({ + candidates: [ + { + content: { + parts: [{ text: `${execution.mode} answer\n` }], + }, + }, + ], + usageMetadata: { + promptTokenCount: 400, + candidatesTokenCount: 80, + totalTokenCount: 480, + }, + }), + stderr: '', + elapsedMs: execution.mode === 'baseline' ? 11 : 17, + }), + }, + ) + + const report = result.reports[0]! + expect(report.baseline_prompt_tokens).toBe(400) + expect(report.graphify_prompt_tokens).toBe(400) + expect(report.baseline_total_tokens).toBe(480) + expect(report.graphify_total_tokens).toBe(480) + expect(formatCompareSummary(result)).toContain('Input tokens (Gemini reported): baseline 400 · graphify 400') + expect(formatCompareSummary(result)).toContain('Total tokens (Gemini reported): baseline 480 · graphify 480') + expect(formatCompareSummary(result)).toContain('reported') + }) + it('reports when graphify uses more Claude-reported tokens than the baseline', async () => { const graph = makeGraph() writeProjectFiles() From 78f1c01a7137dc586dd6a2ccd6a1bbdfdfe3c66b Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 20:58:42 +0400 Subject: [PATCH 10/18] test: strengthen Gemini compare regressions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unit/compare.test.ts | 66 +++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts index 8aad476..82386e9 100644 --- a/tests/unit/compare.test.ts +++ b/tests/unit/compare.test.ts @@ -792,11 +792,18 @@ describe('compare runtime', () => { }, }, ], - usageMetadata: { - promptTokenCount: 400, - candidatesTokenCount: 80, - totalTokenCount: 480, - }, + usageMetadata: + execution.mode === 'baseline' + ? { + promptTokenCount: 1200, + candidatesTokenCount: 90, + totalTokenCount: 1290, + } + : { + promptTokenCount: 400, + candidatesTokenCount: 70, + totalTokenCount: 470, + }, }), stderr: '', elapsedMs: execution.mode === 'baseline' ? 11 : 17, @@ -810,17 +817,17 @@ describe('compare runtime', () => { expect(report.usage.baseline).toEqual( expect.objectContaining({ provider: 'gemini', - input_tokens: 400, - output_tokens: 80, - total_tokens: 480, + input_tokens: 1200, + output_tokens: 90, + total_tokens: 1290, }), ) expect(report.usage.graphify).toEqual( expect.objectContaining({ provider: 'gemini', input_tokens: 400, - output_tokens: 80, - total_tokens: 480, + output_tokens: 70, + total_tokens: 470, }), ) @@ -833,17 +840,17 @@ describe('compare runtime', () => { expect(savedReport.usage.baseline).toEqual( expect.objectContaining({ provider: 'gemini', - input_tokens: 400, - output_tokens: 80, - total_tokens: 480, + input_tokens: 1200, + output_tokens: 90, + total_tokens: 1290, }), ) expect(savedReport.usage.graphify).toEqual( expect.objectContaining({ provider: 'gemini', input_tokens: 400, - output_tokens: 80, - total_tokens: 480, + output_tokens: 70, + total_tokens: 470, }), ) }) @@ -873,11 +880,18 @@ describe('compare runtime', () => { }, }, ], - usageMetadata: { - promptTokenCount: 400, - candidatesTokenCount: 80, - totalTokenCount: 480, - }, + usageMetadata: + execution.mode === 'baseline' + ? { + promptTokenCount: 1200, + candidatesTokenCount: 90, + totalTokenCount: 1290, + } + : { + promptTokenCount: 400, + candidatesTokenCount: 70, + totalTokenCount: 470, + }, }), stderr: '', elapsedMs: execution.mode === 'baseline' ? 11 : 17, @@ -886,13 +900,13 @@ describe('compare runtime', () => { ) const report = result.reports[0]! - expect(report.baseline_prompt_tokens).toBe(400) + expect(report.baseline_prompt_tokens).toBe(1200) expect(report.graphify_prompt_tokens).toBe(400) - expect(report.baseline_total_tokens).toBe(480) - expect(report.graphify_total_tokens).toBe(480) - expect(formatCompareSummary(result)).toContain('Input tokens (Gemini reported): baseline 400 · graphify 400') - expect(formatCompareSummary(result)).toContain('Total tokens (Gemini reported): baseline 480 · graphify 480') - expect(formatCompareSummary(result)).toContain('reported') + expect(report.baseline_total_tokens).toBe(1290) + expect(report.graphify_total_tokens).toBe(470) + const summary = formatCompareSummary(result) + expect(summary).toContain('Input tokens (Gemini reported): baseline 1200 · graphify 400') + expect(summary).toContain('Total tokens (Gemini reported): baseline 1290 · graphify 470') }) it('reports when graphify uses more Claude-reported tokens than the baseline', async () => { From 112bfe724a30193f4eebb8b0585f0ccee7aa4636 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 21:14:03 +0400 Subject: [PATCH 11/18] feat: capture Gemini compare usage Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/infrastructure/compare.ts | 162 +++++++++++++++++++++++++++++----- 1 file changed, 141 insertions(+), 21 deletions(-) diff --git a/src/infrastructure/compare.ts b/src/infrastructure/compare.ts index 58f12cf..61a45d9 100644 --- a/src/infrastructure/compare.ts +++ b/src/infrastructure/compare.ts @@ -15,7 +15,7 @@ export type CompareBaselineMode = 'full' | 'bounded' export type CompareRunMode = 'baseline' | 'graphify' export type CompareRunStatus = 'not_run' | 'succeeded' | 'failed' | 'context_overflow' export type CompareFailureReason = 'prompt_too_long' | 'runner_error' | 'exec_error' -export type ComparePromptTokenSource = 'estimated_cl100k_base' | 'claude_reported_input' +export type ComparePromptTokenSource = 'estimated_cl100k_base' | 'claude_reported_input' | 'gemini_reported_input' export interface ComparePromptPack { kind: 'baseline' | 'graphify' @@ -62,7 +62,7 @@ export interface ComparePromptTokenEstimator { } export interface ComparePromptUsage { - provider: 'claude' + provider: 'claude' | 'gemini' source: 'structured_stdout' input_tokens: number output_tokens: number @@ -177,6 +177,8 @@ interface ParsedCompareRunnerOutput { usage: ComparePromptUsage | null } +type CompareRunnerOutputParser = (stdout: string) => ParsedCompareRunnerOutput | null + const DEFAULT_RETRIEVAL_BUDGET = 3_000 const DEFAULT_BOUNDED_BASELINE_TOKENS = 4_000 const EXEC_TEMPLATE_PLACEHOLDER_PATTERN = /\{[a-z_][a-z0-9_]*\}/gi @@ -225,6 +227,22 @@ function parseStructuredCompareAnswer(payload: Record): string return null } +function parseJsonRecord(stdout: string): Record | null { + const trimmed = stdout.trim() + if (!trimmed.startsWith('{') || !trimmed.endsWith('}')) { + return null + } + + let payload: unknown + try { + payload = JSON.parse(trimmed) + } catch { + return null + } + + return isRecord(payload) ? payload : null +} + function parseClaudeStructuredUsage(payload: Record): ComparePromptUsage | null { if (!isRecord(payload.usage)) { return null @@ -252,25 +270,75 @@ function parseClaudeStructuredUsage(payload: Record): ComparePr } } -function parseStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput | null { - const trimmed = stdout.trim() - if (!trimmed.startsWith('{') || !trimmed.endsWith('}')) { +function parseClaudeStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput | null { + const payload = parseJsonRecord(stdout) + if (payload === null) { return null } - let payload: unknown - try { - payload = JSON.parse(trimmed) - } catch { + const answerText = parseStructuredCompareAnswer(payload) + const usage = parseClaudeStructuredUsage(payload) + if (answerText === null && usage === null) { return null } - if (!isRecord(payload)) { + return { + answerText, + usage, + } +} + +function parseGeminiStructuredAnswer(payload: Record): string | null { + if (!Array.isArray(payload.candidates) || payload.candidates.length === 0) { return null } - const answerText = parseStructuredCompareAnswer(payload) - const usage = parseClaudeStructuredUsage(payload) + const firstCandidate = payload.candidates[0] + if (!isRecord(firstCandidate) || !isRecord(firstCandidate.content) || !Array.isArray(firstCandidate.content.parts)) { + return null + } + + for (const part of firstCandidate.content.parts) { + if (isRecord(part) && typeof part.text === 'string') { + return part.text + } + } + + return null +} + +function parseGeminiStructuredUsage(payload: Record): ComparePromptUsage | null { + if (!isRecord(payload.usageMetadata)) { + return null + } + + const inputTokens = parseNonNegativeNumber(payload.usageMetadata.promptTokenCount) + const outputTokens = parseNonNegativeNumber(payload.usageMetadata.candidatesTokenCount) + const totalTokens = parseNonNegativeNumber(payload.usageMetadata.totalTokenCount) + if (inputTokens === null || outputTokens === null || totalTokens === null) { + return null + } + + return { + provider: 'gemini', + source: 'structured_stdout', + input_tokens: inputTokens, + output_tokens: outputTokens, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + input_total_tokens: inputTokens, + total_tokens: totalTokens, + } +} + +function parseGeminiStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput | null { + const payload = parseJsonRecord(stdout) + if (payload === null) { + return null + } + + const answerText = parseGeminiStructuredAnswer(payload) + const usage = parseGeminiStructuredUsage(payload) if (answerText === null && usage === null) { return null } @@ -281,6 +349,29 @@ function parseStructuredCompareRunnerOutput(stdout: string): ParsedCompareRunner } } +const COMPARE_RUNNER_OUTPUT_PARSERS: readonly CompareRunnerOutputParser[] = [ + parseClaudeStructuredCompareRunnerOutput, + parseGeminiStructuredCompareRunnerOutput, +] + +function parsePlainTextCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput { + return { + answerText: stdout, + usage: null, + } +} + +function parseCompareRunnerOutput(stdout: string): ParsedCompareRunnerOutput { + for (const parser of COMPARE_RUNNER_OUTPUT_PARSERS) { + const parsedOutput = parser(stdout) + if (parsedOutput !== null) { + return parsedOutput + } + } + + return parsePlainTextCompareRunnerOutput(stdout) +} + function validateCompareExecTemplate(template: string): void { if (PROMPT_FILE_COMMAND_SUBSTITUTION_PATTERNS.some((pattern) => pattern.test(template))) { throw new Error( @@ -538,8 +629,16 @@ function syncComparePromptMetrics(report: ComparePromptReport): void { report.baseline_total_tokens !== null && report.graphify_total_tokens !== null ? computeReductionRatio(report.baseline_total_tokens, report.graphify_total_tokens) : null - report.prompt_token_source.baseline = report.usage.baseline === null ? 'estimated_cl100k_base' : 'claude_reported_input' - report.prompt_token_source.graphify = report.usage.graphify === null ? 'estimated_cl100k_base' : 'claude_reported_input' + report.prompt_token_source.baseline = comparePromptTokenSource(report.usage.baseline) + report.prompt_token_source.graphify = comparePromptTokenSource(report.usage.graphify) +} + +function comparePromptTokenSource(usage: ComparePromptUsage | null): ComparePromptTokenSource { + if (usage === null) { + return 'estimated_cl100k_base' + } + + return usage.provider === 'claude' ? 'claude_reported_input' : 'gemini_reported_input' } function portablePath(path: string): string { @@ -982,14 +1081,14 @@ export async function executeCompareRuns( question: report.question, command, }) - const parsedOutput = parseStructuredCompareRunnerOutput(executionResult.stdout) + const parsedOutput = parseCompareRunnerOutput(executionResult.stdout) ensureCompareAnswerFile( execution.outputFile, - parsedOutput === null ? executionResult.stdout : parsedOutput.answerText ?? '', + parsedOutput.answerText ?? '', ) const contextOverflowEvidence = executionResult.exitCode === 0 ? null : extractContextOverflowEvidence(executionResult.stdout, executionResult.stderr) - report.usage[execution.mode] = executionResult.exitCode === 0 ? parsedOutput?.usage ?? null : null + report.usage[execution.mode] = executionResult.exitCode === 0 ? parsedOutput.usage : null report.status[execution.mode] = executionResult.exitCode === 0 ? 'succeeded' : contextOverflowEvidence !== null ? 'context_overflow' : 'failed' report.elapsed_ms[execution.mode] = executionResult.elapsedMs @@ -1048,6 +1147,26 @@ function countPromptUsageRuns(reports: readonly ComparePromptReport[]): number { return reports.reduce((total, report) => total + (report.usage.baseline === null ? 0 : 1) + (report.usage.graphify === null ? 0 : 1), 0) } +function usageProviderSummaryLabel(reports: readonly ComparePromptReport[]): string { + const providers = new Set() + + for (const report of reports) { + if (report.usage.baseline !== null) { + providers.add(report.usage.baseline.provider) + } + if (report.usage.graphify !== null) { + providers.add(report.usage.graphify.provider) + } + } + + if (providers.size !== 1) { + return 'Runner' + } + + const [provider] = providers + return provider === 'gemini' ? 'Gemini' : 'Claude' +} + export function formatCompareSummary(result: GenerateCompareArtifactsResult): string { const baselineTokens = sumPromptTokens(result.reports, 'baseline') const graphifyTokens = sumPromptTokens(result.reports, 'graphify') @@ -1060,11 +1179,12 @@ export function formatCompareSummary(result: GenerateCompareArtifactsResult): st const succeededRuns = countPromptRuns(result.reports, 'succeeded') const usageRuns = countPromptUsageRuns(result.reports) const totalRuns = result.reports.length * 2 + const usageProviderLabel = usageProviderSummaryLabel(result.reports) const promptTokenLabel = usageRuns === totalRuns - ? 'Input tokens (Claude reported)' + ? `Input tokens (${usageProviderLabel} reported)` : usageRuns > 0 - ? `Input tokens (Claude reported where available; ${QUERY_TOKEN_ESTIMATOR.model} estimate fallback)` + ? `Input tokens (${usageProviderLabel} reported where available; ${QUERY_TOKEN_ESTIMATOR.model} estimate fallback)` : `Prompt tokens (estimated ${QUERY_TOKEN_ESTIMATOR.model})` const lines = [ @@ -1077,9 +1197,9 @@ export function formatCompareSummary(result: GenerateCompareArtifactsResult): st ] if (baselineTotalTokens !== null && graphifyTotalTokens !== null && totalReductionRatio !== null) { - lines.splice(3, 0, `- Total tokens (Claude reported): baseline ${baselineTotalTokens} · graphify ${graphifyTotalTokens} · ${formatTokenComparison(baselineTotalTokens, graphifyTotalTokens)}`) + lines.splice(3, 0, `- Total tokens (${usageProviderLabel} reported): baseline ${baselineTotalTokens} · graphify ${graphifyTotalTokens} · ${formatTokenComparison(baselineTotalTokens, graphifyTotalTokens)}`) } else if (usageRuns > 0 && usageRuns < totalRuns) { - lines.splice(3, 0, `- Usage capture: Claude reported usage for ${usageRuns}/${totalRuns} prompt runs; remaining runs used local estimate fallback`) + lines.splice(3, 0, `- Usage capture: ${usageProviderLabel} reported usage for ${usageRuns}/${totalRuns} prompt runs; remaining runs used local estimate fallback`) } return lines.join('\n') From 60cac7631165534947bace44a34b4e50b870c8b8 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 21:27:30 +0400 Subject: [PATCH 12/18] fix: concatenate Gemini compare answer parts Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/infrastructure/compare.ts | 5 ++-- tests/unit/compare.test.ts | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/infrastructure/compare.ts b/src/infrastructure/compare.ts index 61a45d9..5c61863 100644 --- a/src/infrastructure/compare.ts +++ b/src/infrastructure/compare.ts @@ -298,13 +298,14 @@ function parseGeminiStructuredAnswer(payload: Record): string | return null } + let answerText = '' for (const part of firstCandidate.content.parts) { if (isRecord(part) && typeof part.text === 'string') { - return part.text + answerText += part.text } } - return null + return answerText.length > 0 ? answerText : null } function parseGeminiStructuredUsage(payload: Record): ComparePromptUsage | null { diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts index 82386e9..2cceb5d 100644 --- a/tests/unit/compare.test.ts +++ b/tests/unit/compare.test.ts @@ -855,6 +855,53 @@ describe('compare runtime', () => { ) }) + it('concatenates Gemini text parts from the first candidate into answer artifacts', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async (execution) => ({ + exitCode: 0, + stdout: JSON.stringify({ + candidates: [ + { + content: { + parts: [{ text: `${execution.mode} ` }, { inlineData: { mimeType: 'text/plain' } }, { text: 'answer' }, { text: '\n' }], + }, + }, + { + content: { + parts: [{ text: 'ignored candidate answer\n' }], + }, + }, + ], + usageMetadata: { + promptTokenCount: 1200, + candidatesTokenCount: 90, + totalTokenCount: 1290, + }, + }), + stderr: '', + elapsedMs: execution.mode === 'baseline' ? 11 : 17, + }), + }, + ) + + const report = result.reports[0]! + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('baseline answer\n') + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n') + }) + it('promotes Gemini-reported input and total tokens into compare summaries', async () => { const graph = makeGraph() writeProjectFiles() From 75630eeb39919e5800b13409a9da68158178c7cf Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 21:35:42 +0400 Subject: [PATCH 13/18] test: lock Gemini compare fallback behavior Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unit/compare.test.ts | 75 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts index 2cceb5d..6674217 100644 --- a/tests/unit/compare.test.ts +++ b/tests/unit/compare.test.ts @@ -855,6 +855,50 @@ describe('compare runtime', () => { ) }) + it('saves Gemini answers when structured usage metadata is missing and keeps estimate summaries', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async (execution) => ({ + exitCode: 0, + stdout: JSON.stringify({ + candidates: [ + { + content: { + parts: [{ text: `${execution.mode} answer\n` }], + }, + }, + ], + }), + stderr: '', + elapsedMs: execution.mode === 'baseline' ? 11 : 17, + }), + }, + ) + + const report = result.reports[0]! + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('baseline answer\n') + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n') + expect(report.usage.baseline).toBeNull() + expect(report.usage.graphify).toBeNull() + expect(report.prompt_token_source).toEqual({ + baseline: 'estimated_cl100k_base', + graphify: 'estimated_cl100k_base', + }) + expect(formatCompareSummary(result)).toContain('estimate') + }) + it('concatenates Gemini text parts from the first candidate into answer artifacts', async () => { const graph = makeGraph() writeProjectFiles() @@ -902,6 +946,37 @@ describe('compare runtime', () => { expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('graphify answer\n') }) + it('preserves malformed Gemini JSON stdout as the answer artifact without capturing usage', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async () => ({ + exitCode: 0, + stdout: '{not valid json', + stderr: '', + elapsedMs: 11, + }), + }, + ) + + const report = result.reports[0]! + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toContain('{not valid json') + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toContain('{not valid json') + expect(report.usage.baseline).toBeNull() + expect(report.usage.graphify).toBeNull() + }) + it('promotes Gemini-reported input and total tokens into compare summaries', async () => { const graph = makeGraph() writeProjectFiles() From 90f0c7acc2409df510943dc11ee5de40b283d3fc Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 21:38:31 +0400 Subject: [PATCH 14/18] test: tighten Gemini fallback assertions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unit/compare.test.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts index 6674217..43a738e 100644 --- a/tests/unit/compare.test.ts +++ b/tests/unit/compare.test.ts @@ -950,6 +950,7 @@ describe('compare runtime', () => { const graph = makeGraph() writeProjectFiles() const graphPath = writeGraphFixture(graph) + const rawStdout = '{not valid json' const result = await executeCompareRuns( { @@ -963,7 +964,7 @@ describe('compare runtime', () => { { runner: async () => ({ exitCode: 0, - stdout: '{not valid json', + stdout: rawStdout, stderr: '', elapsedMs: 11, }), @@ -971,8 +972,8 @@ describe('compare runtime', () => { ) const report = result.reports[0]! - expect(readFileSync(report.answer_paths.baseline, 'utf8')).toContain('{not valid json') - expect(readFileSync(report.answer_paths.graphify, 'utf8')).toContain('{not valid json') + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe(rawStdout) + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe(rawStdout) expect(report.usage.baseline).toBeNull() expect(report.usage.graphify).toBeNull() }) From f5afba88a254eb35ca189d5aca39eb400f781e0a Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 22:02:34 +0400 Subject: [PATCH 15/18] test: cover Gemini usage-only artifacts Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unit/compare.test.ts | 88 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tests/unit/compare.test.ts b/tests/unit/compare.test.ts index 43a738e..bee03b2 100644 --- a/tests/unit/compare.test.ts +++ b/tests/unit/compare.test.ts @@ -855,6 +855,94 @@ describe('compare runtime', () => { ) }) + it('does not write Gemini structured stdout JSON into answer artifacts when usage metadata is present without answer text', async () => { + const graph = makeGraph() + writeProjectFiles() + const graphPath = writeGraphFixture(graph) + + const result = await executeCompareRuns( + { + graphPath, + question: 'how does login create a session', + outputDir: COMPARE_OUTPUT_ROOT, + execTemplate: 'runner --prompt {prompt_file} --mode {mode} --out {output_file}', + baselineMode: 'full', + now: new Date('2026-04-24T19:30:00.000Z'), + }, + { + runner: async (execution) => ({ + exitCode: 0, + stdout: JSON.stringify({ + candidates: [ + { + content: { + parts: [{ inlineData: { mimeType: 'text/plain' } }], + }, + }, + ], + usageMetadata: + execution.mode === 'baseline' + ? { + promptTokenCount: 1200, + candidatesTokenCount: 90, + totalTokenCount: 1290, + } + : { + promptTokenCount: 400, + candidatesTokenCount: 70, + totalTokenCount: 470, + }, + }), + stderr: '', + elapsedMs: execution.mode === 'baseline' ? 11 : 17, + }), + }, + ) + + const report = result.reports[0]! + expect(readFileSync(report.answer_paths.baseline, 'utf8')).toBe('') + expect(readFileSync(report.answer_paths.graphify, 'utf8')).toBe('') + expect(report.usage.baseline).toEqual( + expect.objectContaining({ + provider: 'gemini', + input_tokens: 1200, + output_tokens: 90, + total_tokens: 1290, + }), + ) + expect(report.usage.graphify).toEqual( + expect.objectContaining({ + provider: 'gemini', + input_tokens: 400, + output_tokens: 70, + total_tokens: 470, + }), + ) + + const savedReport = JSON.parse(readFileSync(report.paths.report, 'utf8')) as { + usage: { + baseline: Record | null + graphify: Record | null + } + } + expect(savedReport.usage.baseline).toEqual( + expect.objectContaining({ + provider: 'gemini', + input_tokens: 1200, + output_tokens: 90, + total_tokens: 1290, + }), + ) + expect(savedReport.usage.graphify).toEqual( + expect.objectContaining({ + provider: 'gemini', + input_tokens: 400, + output_tokens: 70, + total_tokens: 470, + }), + ) + }) + it('saves Gemini answers when structured usage metadata is missing and keeps estimate summaries', async () => { const graph = makeGraph() writeProjectFiles() From 686e57150f35d8fa0b96c370fb1c8a217edf588f Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 22:09:42 +0400 Subject: [PATCH 16/18] feat: support Gemini compare usage capture Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG.md | 1 + README.md | 13 +++++++++++-- docs/proof-workflows.md | 12 ++++++++++-- examples/why-graphify.md | 19 ++++++++++++++++--- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ccc05a9..7e90189 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ All notable changes to the TypeScript package will be documented in this file. ### Improved - **Retrieval quality**: improved retrieval ranking with relation-aware expansion so connected evidence surfaces more effectively, and strengthened recall/MRR eval guardrails to prevent misleading benchmark results +- **Gemini compare docs**: documented the stdin-safe Gemini JSON runner (`cat {prompt_file} | gemini -p --output-format json`), clarified that `compare` uses reported Gemini/Claude usage when structured JSON includes it, falls back to labeled local estimates otherwise, and that `benchmark`/`eval` remain offline estimate surfaces ## [0.8.7] - 2026-04-27 diff --git a/README.md b/README.md index 442c67c..d51fbc1 100644 --- a/README.md +++ b/README.md @@ -79,16 +79,25 @@ node dist/src/cli/bin.js compare "How does login create a session?" \ --yes ``` +Gemini-safe installed-CLI invocation: + +```bash +graphify-ts compare "How does auth work?" \ + --exec 'cat {prompt_file} | gemini -p --output-format json' \ + --yes +``` + What `compare` does: - Prints a warning before execution because it may consume paid model tokens. Use `--yes` for non-interactive runs and CI. - Expands runner placeholders: `{prompt_file}`, `{question}`, `{mode}`, and `{output_file}`. - For large prompts, pass `{prompt_file}` through stdin or file redirection. Avoid shell command substitution around `{prompt_file}` (for example `$(cat {prompt_file})`), which can hit OS argument-length limits. - Writes a proof bundle under `graphify-out/compare//` with `baseline-prompt.txt`, `graphify-prompt.txt`, `baseline-answer.txt`, `graphify-answer.txt`, and `report.json`. -- Reports prompt-token counts as local `cl100k_base` estimates, not provider billing tokens. +- Promotes provider-reported usage into `report.json` and the terminal summary when the runner emits structured JSON with usage (for Gemini, `usageMetadata` from `--output-format json`; for Claude, structured JSON with `usage`). +- Falls back to labeled local `cl100k_base` prompt estimates when the runner only returns answer text or malformed JSON, so the token source stays explicit. - Preserves partial artifacts when one side fails, and classifies prompt-size failures such as `Prompt is too long` as `context_overflow` evidence in `report.json`. -Use `compare` when you want a showcase or a customer-proof run. Use `benchmark` and `eval` when you want repeatable local measurements without calling a model. +Use `compare` when you want a showcase or a customer-proof run. Use `benchmark` and `eval` when you want repeatable local measurements without calling a model; they remain offline estimate surfaces rather than provider-reported usage surfaces. ## Graph time travel (ref-to-ref graph compare) diff --git a/docs/proof-workflows.md b/docs/proof-workflows.md index c2f2360..2d35e3a 100644 --- a/docs/proof-workflows.md +++ b/docs/proof-workflows.md @@ -32,6 +32,14 @@ node dist/src/cli/bin.js compare "How does login create a session?" \ --yes ``` +Gemini-safe installed-CLI invocation: + +```bash +graphify-ts compare "How does auth work?" \ + --exec 'cat {prompt_file} | gemini -p --output-format json' \ + --yes +``` + What gets saved under `graphify-out/compare//`: - `baseline-prompt.txt` @@ -40,7 +48,7 @@ What gets saved under `graphify-out/compare//`: - `graphify-answer.txt` - `report.json` -Use this when you need customer-proof or your own apples-to-apples answer comparison. It can spend paid model tokens, so it is intentionally separate from the local benchmark/eval path. +When Gemini emits structured JSON with `usageMetadata`, `compare` captures real reported input and total tokens in `report.json` and the terminal summary. If the runner only returns answer text or malformed JSON, `compare` falls back to labeled local `cl100k_base` prompt estimates instead. Use this when you need customer-proof or your own apples-to-apples answer comparison. It can spend paid model tokens, so it is intentionally separate from the local benchmark/eval path. `benchmark` and `eval` remain offline estimate surfaces. ## 3. Production and multi-repo proof @@ -78,7 +86,7 @@ What this proves that a single-repo demo cannot: |---|---| | "Does the graph improve retrieval quality on a labeled set?" | `eval` | | "Does the graph reduce prompt size while keeping expected evidence?" | `benchmark` | -| "Will my actual model answer better with graphify than with a naive baseline?" | `compare` | +| "Will my actual model answer better with graphify than with a naive baseline, and optionally capture provider-reported usage?" | `compare` | | "Can this work across frontend/backend/shared repos?" | `federate` + `serve --stdio` | For the narrative production benchmark and the GoValidate numbers, see [`examples/why-graphify.md`](../examples/why-graphify.md). For exact support coverage by language and file type, see [`language-capability-matrix.md`](./language-capability-matrix.md). diff --git a/examples/why-graphify.md b/examples/why-graphify.md index d7b5b19..da88406 100644 --- a/examples/why-graphify.md +++ b/examples/why-graphify.md @@ -141,14 +141,22 @@ node dist/src/cli/bin.js compare "How does login create a session?" \ --yes ``` +Gemini-safe installed-CLI invocation: + +```bash +graphify-ts compare "How does auth work?" \ + --exec 'cat {prompt_file} | gemini -p --output-format json' \ + --yes +``` + What this gives you: - one baseline prompt and one graphify prompt for the same question - two real model answers from your own terminal runner - a saved proof bundle in `graphify-out/compare//` -- prompt-token counts and run statuses in `report.json` +- prompt-token counts, usage-source labels, and run statuses in `report.json` -Important: `compare` may spend paid model tokens. It prints a warning before execution and requires `--yes` in non-interactive runs. For large prompts, use stdin or file redirection with `{prompt_file}`; avoid shell command substitution around `{prompt_file}` (for example `$(cat {prompt_file})`) because shell argument expansion can fail on full-repo baselines. +Important: `compare` may spend paid model tokens. It prints a warning before execution and requires `--yes` in non-interactive runs. For large prompts, use stdin or file redirection with `{prompt_file}`; avoid shell command substitution around `{prompt_file}` (for example `$(cat {prompt_file})`) because shell argument expansion can fail on full-repo baselines. If Gemini emits structured JSON with `usageMetadata`, `compare` records real reported input and total tokens. If the runner only returns answer text or malformed JSON, `compare` falls back to labeled local `cl100k_base` prompt estimates instead. `benchmark` and `eval` stay offline estimate surfaces. ## Run It on Your Own Codebase @@ -168,6 +176,11 @@ graphify-ts eval graphify-out/graph.json --questions benchmark-questions.json # If you want a real same-model A/B proof run graphify-ts compare "How does auth work?" --exec 'cat {prompt_file} | claude -p' --yes +# Gemini-safe compare runner with structured usage capture +graphify-ts compare "How does auth work?" \ + --exec 'cat {prompt_file} | gemini -p --output-format json' \ + --yes + # Set up your AI agent graphify-ts claude install # writes .mcp.json with MCP server graphify-ts cursor install # writes .cursor/mcp.json @@ -187,7 +200,7 @@ For an internal team rollout, the most convincing sequence is usually: That progression keeps the proof honest: - `benchmark` and `eval` are local graph-quality measurements -- `compare` is the model-facing proof +- `compare` is the model-facing proof, with reported usage when the runner emits structured JSON and labeled estimates otherwise - `federate` is the production architecture proof for frontend/backend/shared or microservice splits ## Capability Coverage Matters From 29599cdfd92d53205b252e22791996a33cbe9907 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 22:15:10 +0400 Subject: [PATCH 17/18] docs: fix Gemini compare invocation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 2 +- docs/proof-workflows.md | 2 +- examples/why-graphify.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d51fbc1..3b38d19 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ Gemini-safe installed-CLI invocation: ```bash graphify-ts compare "How does auth work?" \ - --exec 'cat {prompt_file} | gemini -p --output-format json' \ + --exec 'cat {prompt_file} | gemini -p "" --output-format json' \ --yes ``` diff --git a/docs/proof-workflows.md b/docs/proof-workflows.md index 2d35e3a..754c3d5 100644 --- a/docs/proof-workflows.md +++ b/docs/proof-workflows.md @@ -36,7 +36,7 @@ Gemini-safe installed-CLI invocation: ```bash graphify-ts compare "How does auth work?" \ - --exec 'cat {prompt_file} | gemini -p --output-format json' \ + --exec 'cat {prompt_file} | gemini -p "" --output-format json' \ --yes ``` diff --git a/examples/why-graphify.md b/examples/why-graphify.md index da88406..4b16d16 100644 --- a/examples/why-graphify.md +++ b/examples/why-graphify.md @@ -145,7 +145,7 @@ Gemini-safe installed-CLI invocation: ```bash graphify-ts compare "How does auth work?" \ - --exec 'cat {prompt_file} | gemini -p --output-format json' \ + --exec 'cat {prompt_file} | gemini -p "" --output-format json' \ --yes ``` @@ -178,7 +178,7 @@ graphify-ts compare "How does auth work?" --exec 'cat {prompt_file} | claude -p' # Gemini-safe compare runner with structured usage capture graphify-ts compare "How does auth work?" \ - --exec 'cat {prompt_file} | gemini -p --output-format json' \ + --exec 'cat {prompt_file} | gemini -p "" --output-format json' \ --yes # Set up your AI agent From b281904e09b92b6458153eb747238934010b9e6d Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Mon, 27 Apr 2026 22:18:51 +0400 Subject: [PATCH 18/18] docs: fix Gemini changelog example Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e90189..657bc5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ All notable changes to the TypeScript package will be documented in this file. ### Improved - **Retrieval quality**: improved retrieval ranking with relation-aware expansion so connected evidence surfaces more effectively, and strengthened recall/MRR eval guardrails to prevent misleading benchmark results -- **Gemini compare docs**: documented the stdin-safe Gemini JSON runner (`cat {prompt_file} | gemini -p --output-format json`), clarified that `compare` uses reported Gemini/Claude usage when structured JSON includes it, falls back to labeled local estimates otherwise, and that `benchmark`/`eval` remain offline estimate surfaces +- **Gemini compare docs**: documented the stdin-safe Gemini JSON runner (`cat {prompt_file} | gemini -p "" --output-format json`), clarified that `compare` uses reported Gemini/Claude usage when structured JSON includes it, falls back to labeled local estimates otherwise, and that `benchmark`/`eval` remain offline estimate surfaces ## [0.8.7] - 2026-04-27