@@ -6,11 +6,14 @@ use crate::truncate::approx_token_count;
66use crate :: truncate:: approx_tokens_from_byte_count_i64;
77use crate :: truncate:: truncate_function_output_items_with_policy;
88use crate :: truncate:: truncate_text;
9+ use base64:: Engine ;
10+ use base64:: engine:: general_purpose:: STANDARD as BASE64_STANDARD ;
911use codex_protocol:: models:: BaseInstructions ;
1012use codex_protocol:: models:: ContentItem ;
1113use codex_protocol:: models:: FunctionCallOutputBody ;
1214use codex_protocol:: models:: FunctionCallOutputContentItem ;
1315use codex_protocol:: models:: FunctionCallOutputPayload ;
16+ use codex_protocol:: models:: ImageDetail ;
1417use codex_protocol:: models:: ResponseItem ;
1518use codex_protocol:: openai_models:: InputModality ;
1619use codex_protocol:: protocol:: TokenUsage ;
@@ -428,7 +431,15 @@ fn estimate_item_token_count(item: &ResponseItem) -> i64 {
428431///
429432/// The estimator later converts bytes to tokens using a 4-bytes/token heuristic
430433/// with ceiling division, so 7,373 bytes maps to approximately 1,844 tokens.
431- const IMAGE_BYTES_ESTIMATE : i64 = 7373 ;
434+ const RESIZED_IMAGE_BYTES_ESTIMATE : i64 = 7373 ;
435+ // See https://developers.openai.com/api/docs/guides/images-vision#calculating-costs.
436+ // Use the documented GPT-5 high-detail calculation only for `detail: "original"`;
437+ // all other image inputs continue to use `RESIZED_IMAGE_BYTES_ESTIMATE`.
// Largest side length allowed before tiling: an image wider or taller than
// this is first shrunk (aspect ratio preserved) until both sides fit.
438+ const ORIGINAL_IMAGE_MAX_DIM : u32 = 2048 ;
// After the fit step, the image is rescaled so its shortest side has this length.
439+ const ORIGINAL_IMAGE_TARGET_SHORT_SIDE : u32 = 768 ;
// Edge length of the square tiles counted over the rescaled image.
440+ const ORIGINAL_IMAGE_TILE_SIZE : u32 = 512 ;
// Flat token cost added once per image.
441+ const ORIGINAL_IMAGE_BASE_TOKENS : i64 = 70 ;
// Token cost added for every tile.
442+ const ORIGINAL_IMAGE_TILE_TOKENS : i64 = 140 ;
432443
433444pub ( crate ) fn estimate_response_item_model_visible_bytes ( item : & ResponseItem ) -> i64 {
434445 match item {
@@ -444,15 +455,15 @@ pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) ->
444455 let raw = serde_json:: to_string ( item)
445456 . map ( |serialized| i64:: try_from ( serialized. len ( ) ) . unwrap_or ( i64:: MAX ) )
446457 . unwrap_or_default ( ) ;
447- let ( payload_bytes, image_count ) = image_data_url_estimate_adjustment ( item) ;
448- if payload_bytes == 0 || image_count == 0 {
458+ let ( payload_bytes, replacement_bytes ) = image_data_url_estimate_adjustment ( item) ;
459+ if payload_bytes == 0 || replacement_bytes == 0 {
449460 raw
450461 } else {
451- // Replace raw base64 payload bytes with a fixed per-image cost .
452- // We intentionally preserve the data URL prefix and JSON wrapper
453- // bytes already included in `raw`.
462+ // Replace raw base64 payload bytes with a per-image estimate .
463+ // We intentionally preserve the data URL prefix and JSON
464+ // wrapper bytes already included in `raw`.
454465 raw. saturating_sub ( payload_bytes)
455- . saturating_add ( image_count . saturating_mul ( IMAGE_BYTES_ESTIMATE ) )
466+ . saturating_add ( replacement_bytes )
456467 }
457468 }
458469 }
@@ -463,7 +474,7 @@ pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) ->
463474///
464475/// We only discount payloads for `data:image/...;base64,...` URLs (case
465476/// insensitive markers) and leave everything else at raw serialized size.
466- fn base64_data_url_payload_len ( url : & str ) -> Option < usize > {
477+ fn parse_base64_image_data_url ( url : & str ) -> Option < & str > {
467478 if !url
468479 . get ( .."data:" . len ( ) )
469480 . is_some_and ( |prefix| prefix. eq_ignore_ascii_case ( "data:" ) )
@@ -489,47 +500,94 @@ fn base64_data_url_payload_len(url: &str) -> Option<usize> {
489500 if !has_base64_marker {
490501 return None ;
491502 }
492- Some ( payload. len ( ) )
503+ Some ( payload)
504+ }
505+
506+ fn base64_data_url_payload_len ( url : & str ) -> Option < usize > {
507+ parse_base64_image_data_url ( url) . map ( str:: len)
508+ }
509+
510+ fn estimate_original_image_bytes ( image_url : & str ) -> Option < i64 > {
511+ let payload = parse_base64_image_data_url ( image_url) ?;
512+ let bytes = BASE64_STANDARD . decode ( payload) . ok ( ) ?;
513+ let dynamic = image:: load_from_memory ( & bytes) . ok ( ) ?;
514+ let mut width = dynamic. width ( ) ;
515+ let mut height = dynamic. height ( ) ;
516+
517+ if width > ORIGINAL_IMAGE_MAX_DIM || height > ORIGINAL_IMAGE_MAX_DIM {
518+ let scale = f64:: min (
519+ f64:: from ( ORIGINAL_IMAGE_MAX_DIM ) / f64:: from ( width) ,
520+ f64:: from ( ORIGINAL_IMAGE_MAX_DIM ) / f64:: from ( height) ,
521+ ) ;
522+ width = ( f64:: from ( width) * scale) . round ( ) . max ( 1.0 ) as u32 ;
523+ height = ( f64:: from ( height) * scale) . round ( ) . max ( 1.0 ) as u32 ;
524+ }
525+
526+ let shortest_side = width. min ( height) ;
527+ if shortest_side > 0 {
528+ let scale = f64:: from ( ORIGINAL_IMAGE_TARGET_SHORT_SIDE ) / f64:: from ( shortest_side) ;
529+ width = ( f64:: from ( width) * scale) . round ( ) . max ( 1.0 ) as u32 ;
530+ height = ( f64:: from ( height) * scale) . round ( ) . max ( 1.0 ) as u32 ;
531+ }
532+
533+ let tiles_wide = i64:: from (
534+ width. saturating_add ( ORIGINAL_IMAGE_TILE_SIZE . saturating_sub ( 1 ) ) / ORIGINAL_IMAGE_TILE_SIZE ,
535+ ) ;
536+ let tiles_high = i64:: from (
537+ height. saturating_add ( ORIGINAL_IMAGE_TILE_SIZE . saturating_sub ( 1 ) )
538+ / ORIGINAL_IMAGE_TILE_SIZE ,
539+ ) ;
540+ let tile_count = tiles_wide. saturating_mul ( tiles_high) ;
541+ let tokens = ORIGINAL_IMAGE_BASE_TOKENS
542+ . saturating_add ( tile_count. saturating_mul ( ORIGINAL_IMAGE_TILE_TOKENS ) ) ;
543+ Some ( tokens. saturating_mul ( 4 ) )
493544}
494545
495546/// Scans one response item for discount-eligible inline image data URLs and
496547/// returns:
497548/// - total base64 payload bytes to subtract from raw serialized size
498- /// - count of qualifying images to replace with `IMAGE_BYTES_ESTIMATE`
549+ /// - total replacement byte estimate for those images
499550fn image_data_url_estimate_adjustment ( item : & ResponseItem ) -> ( i64 , i64 ) {
500551 let mut payload_bytes = 0i64 ;
501- let mut image_count = 0i64 ;
552+ let mut replacement_bytes = 0i64 ;
502553
503- let mut accumulate = |image_url : & str | {
554+ let mut accumulate = |image_url : & str , detail : Option < ImageDetail > | {
504555 if let Some ( payload_len) = base64_data_url_payload_len ( image_url) {
505556 payload_bytes =
506557 payload_bytes. saturating_add ( i64:: try_from ( payload_len) . unwrap_or ( i64:: MAX ) ) ;
507- image_count = image_count. saturating_add ( 1 ) ;
558+ replacement_bytes = replacement_bytes. saturating_add ( match detail {
559+ Some ( ImageDetail :: Original ) => {
560+ estimate_original_image_bytes ( image_url) . unwrap_or ( RESIZED_IMAGE_BYTES_ESTIMATE )
561+ }
562+ _ => RESIZED_IMAGE_BYTES_ESTIMATE ,
563+ } ) ;
508564 }
509565 } ;
510566
511567 match item {
512568 ResponseItem :: Message { content, .. } => {
513569 for content_item in content {
514570 if let ContentItem :: InputImage { image_url } = content_item {
515- accumulate ( image_url) ;
571+ accumulate ( image_url, None ) ;
516572 }
517573 }
518574 }
519575 ResponseItem :: FunctionCallOutput { output, .. }
520576 | ResponseItem :: CustomToolCallOutput { output, .. } => {
521577 if let FunctionCallOutputBody :: ContentItems ( items) = & output. body {
522578 for content_item in items {
523- if let FunctionCallOutputContentItem :: InputImage { image_url } = content_item {
524- accumulate ( image_url) ;
579+ if let FunctionCallOutputContentItem :: InputImage { image_url, detail } =
580+ content_item
581+ {
582+ accumulate ( image_url, * detail) ;
525583 }
526584 }
527585 }
528586 }
529587 _ => { }
530588 }
531589
532- ( payload_bytes, image_count )
590+ ( payload_bytes, replacement_bytes )
533591}
534592
535593fn is_model_generated_item ( item : & ResponseItem ) -> bool {
0 commit comments