Skip to content

Commit ab3e4c2

Browse files
committed
Add original-resolution view_image support
git-stack-id: fjord/original_image_res---4hw7wu3447irye git-stack-title: Add original-resolution view_image support
1 parent 3b5996f commit ab3e4c2

File tree

15 files changed

+581
-48
lines changed

15 files changed

+581
-48
lines changed

codex-rs/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

codex-rs/app-server/tests/suite/v2/dynamic_tools.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,10 @@ async fn dynamic_tool_call_round_trip_sends_content_items_to_model() -> Result<(
399399
FunctionCallOutputContentItem::InputText { text }
400400
}
401401
DynamicToolCallOutputContentItem::InputImage { image_url } => {
402-
FunctionCallOutputContentItem::InputImage { image_url }
402+
FunctionCallOutputContentItem::InputImage {
403+
image_url,
404+
detail: None,
405+
}
403406
}
404407
})
405408
.collect::<Vec<FunctionCallOutputContentItem>>();

codex-rs/core/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ codex-protocol = { workspace = true }
4646
codex-rmcp-client = { workspace = true }
4747
codex-state = { workspace = true }
4848
codex-utils-absolute-path = { workspace = true }
49+
codex-utils-image = { workspace = true }
4950
codex-utils-home-dir = { workspace = true }
5051
codex-utils-pty = { workspace = true }
5152
codex-utils-readiness = { workspace = true }
@@ -62,6 +63,7 @@ eventsource-stream = { workspace = true }
6263
futures = { workspace = true }
6364
http = { workspace = true }
6465
iana-time-zone = { workspace = true }
66+
image = { workspace = true, features = ["jpeg", "png"] }
6567
indexmap = { workspace = true }
6668
keyring = { workspace = true, features = ["crypto-rust"] }
6769
libc = { workspace = true }
@@ -86,7 +88,6 @@ sha2 = { workspace = true }
8688
shlex = { workspace = true }
8789
similar = { workspace = true }
8890
tempfile = { workspace = true }
89-
test-case = "3.3.1"
9091
test-log = { workspace = true }
9192
thiserror = { workspace = true }
9293
time = { workspace = true, features = [
@@ -155,11 +156,11 @@ codex-test-macros = { workspace = true }
155156
codex-utils-cargo-bin = { workspace = true }
156157
core_test_support = { workspace = true }
157158
ctor = { workspace = true }
158-
image = { workspace = true, features = ["jpeg", "png"] }
159159
insta = { workspace = true }
160160
maplit = { workspace = true }
161161
predicates = { workspace = true }
162162
pretty_assertions = { workspace = true }
163+
test-case = "3.3.1"
163164
opentelemetry_sdk = { workspace = true, features = [
164165
"experimental_metrics_custom_reader",
165166
"metrics",

codex-rs/core/config.schema.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,9 @@
430430
"use_linux_sandbox_bwrap": {
431431
"type": "boolean"
432432
},
433+
"view_image_original_resolution": {
434+
"type": "boolean"
435+
},
433436
"voice_transcription": {
434437
"type": "boolean"
435438
},
@@ -1761,6 +1764,9 @@
17611764
"use_linux_sandbox_bwrap": {
17621765
"type": "boolean"
17631766
},
1767+
"view_image_original_resolution": {
1768+
"type": "boolean"
1769+
},
17641770
"voice_transcription": {
17651771
"type": "boolean"
17661772
},

codex-rs/core/src/context_manager/history.rs

Lines changed: 75 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,14 @@ use crate::truncate::approx_token_count;
66
use crate::truncate::approx_tokens_from_byte_count_i64;
77
use crate::truncate::truncate_function_output_items_with_policy;
88
use crate::truncate::truncate_text;
9+
use base64::Engine;
10+
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
911
use codex_protocol::models::BaseInstructions;
1012
use codex_protocol::models::ContentItem;
1113
use codex_protocol::models::FunctionCallOutputBody;
1214
use codex_protocol::models::FunctionCallOutputContentItem;
1315
use codex_protocol::models::FunctionCallOutputPayload;
16+
use codex_protocol::models::ImageDetail;
1417
use codex_protocol::models::ResponseItem;
1518
use codex_protocol::openai_models::InputModality;
1619
use codex_protocol::protocol::TokenUsage;
@@ -428,7 +431,15 @@ fn estimate_item_token_count(item: &ResponseItem) -> i64 {
428431
///
429432
/// The estimator later converts bytes to tokens using a 4-bytes/token heuristic
430433
/// with ceiling division, so 7,373 bytes maps to approximately 1,844 tokens.
431-
const IMAGE_BYTES_ESTIMATE: i64 = 7373;
434+
const RESIZED_IMAGE_BYTES_ESTIMATE: i64 = 7373;
435+
// See https://developers.openai.com/api/docs/guides/images-vision#calculating-costs.
436+
// Use the documented GPT-5 high-detail calculation only for `detail: "original"`;
437+
// all other image inputs continue to use `RESIZED_IMAGE_BYTES_ESTIMATE`.
438+
const ORIGINAL_IMAGE_MAX_DIM: u32 = 2048;
439+
const ORIGINAL_IMAGE_TARGET_SHORT_SIDE: u32 = 768;
440+
const ORIGINAL_IMAGE_TILE_SIZE: u32 = 512;
441+
const ORIGINAL_IMAGE_BASE_TOKENS: i64 = 70;
442+
const ORIGINAL_IMAGE_TILE_TOKENS: i64 = 140;
432443

433444
pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) -> i64 {
434445
match item {
@@ -444,15 +455,15 @@ pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) ->
444455
let raw = serde_json::to_string(item)
445456
.map(|serialized| i64::try_from(serialized.len()).unwrap_or(i64::MAX))
446457
.unwrap_or_default();
447-
let (payload_bytes, image_count) = image_data_url_estimate_adjustment(item);
448-
if payload_bytes == 0 || image_count == 0 {
458+
let (payload_bytes, replacement_bytes) = image_data_url_estimate_adjustment(item);
459+
if payload_bytes == 0 || replacement_bytes == 0 {
449460
raw
450461
} else {
451-
// Replace raw base64 payload bytes with a fixed per-image cost.
452-
// We intentionally preserve the data URL prefix and JSON wrapper
453-
// bytes already included in `raw`.
462+
// Replace raw base64 payload bytes with a per-image estimate.
463+
// We intentionally preserve the data URL prefix and JSON
464+
// wrapper bytes already included in `raw`.
454465
raw.saturating_sub(payload_bytes)
455-
.saturating_add(image_count.saturating_mul(IMAGE_BYTES_ESTIMATE))
466+
.saturating_add(replacement_bytes)
456467
}
457468
}
458469
}
@@ -463,7 +474,7 @@ pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) ->
463474
///
464475
/// We only discount payloads for `data:image/...;base64,...` URLs (case
465476
/// insensitive markers) and leave everything else at raw serialized size.
466-
fn base64_data_url_payload_len(url: &str) -> Option<usize> {
477+
fn parse_base64_image_data_url(url: &str) -> Option<&str> {
467478
if !url
468479
.get(.."data:".len())
469480
.is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:"))
@@ -489,47 +500,94 @@ fn base64_data_url_payload_len(url: &str) -> Option<usize> {
489500
if !has_base64_marker {
490501
return None;
491502
}
492-
Some(payload.len())
503+
Some(payload)
504+
}
505+
506+
fn base64_data_url_payload_len(url: &str) -> Option<usize> {
507+
parse_base64_image_data_url(url).map(str::len)
508+
}
509+
510+
fn estimate_original_image_bytes(image_url: &str) -> Option<i64> {
511+
let payload = parse_base64_image_data_url(image_url)?;
512+
let bytes = BASE64_STANDARD.decode(payload).ok()?;
513+
let dynamic = image::load_from_memory(&bytes).ok()?;
514+
let mut width = dynamic.width();
515+
let mut height = dynamic.height();
516+
517+
if width > ORIGINAL_IMAGE_MAX_DIM || height > ORIGINAL_IMAGE_MAX_DIM {
518+
let scale = f64::min(
519+
f64::from(ORIGINAL_IMAGE_MAX_DIM) / f64::from(width),
520+
f64::from(ORIGINAL_IMAGE_MAX_DIM) / f64::from(height),
521+
);
522+
width = (f64::from(width) * scale).round().max(1.0) as u32;
523+
height = (f64::from(height) * scale).round().max(1.0) as u32;
524+
}
525+
526+
let shortest_side = width.min(height);
527+
if shortest_side > 0 {
528+
let scale = f64::from(ORIGINAL_IMAGE_TARGET_SHORT_SIDE) / f64::from(shortest_side);
529+
width = (f64::from(width) * scale).round().max(1.0) as u32;
530+
height = (f64::from(height) * scale).round().max(1.0) as u32;
531+
}
532+
533+
let tiles_wide = i64::from(
534+
width.saturating_add(ORIGINAL_IMAGE_TILE_SIZE.saturating_sub(1)) / ORIGINAL_IMAGE_TILE_SIZE,
535+
);
536+
let tiles_high = i64::from(
537+
height.saturating_add(ORIGINAL_IMAGE_TILE_SIZE.saturating_sub(1))
538+
/ ORIGINAL_IMAGE_TILE_SIZE,
539+
);
540+
let tile_count = tiles_wide.saturating_mul(tiles_high);
541+
let tokens = ORIGINAL_IMAGE_BASE_TOKENS
542+
.saturating_add(tile_count.saturating_mul(ORIGINAL_IMAGE_TILE_TOKENS));
543+
Some(tokens.saturating_mul(4))
493544
}
494545

495546
/// Scans one response item for discount-eligible inline image data URLs and
496547
/// returns:
497548
/// - total base64 payload bytes to subtract from raw serialized size
498-
/// - count of qualifying images to replace with `IMAGE_BYTES_ESTIMATE`
549+
/// - total replacement byte estimate for those images
499550
fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) {
500551
let mut payload_bytes = 0i64;
501-
let mut image_count = 0i64;
552+
let mut replacement_bytes = 0i64;
502553

503-
let mut accumulate = |image_url: &str| {
554+
let mut accumulate = |image_url: &str, detail: Option<ImageDetail>| {
504555
if let Some(payload_len) = base64_data_url_payload_len(image_url) {
505556
payload_bytes =
506557
payload_bytes.saturating_add(i64::try_from(payload_len).unwrap_or(i64::MAX));
507-
image_count = image_count.saturating_add(1);
558+
replacement_bytes = replacement_bytes.saturating_add(match detail {
559+
Some(ImageDetail::Original) => {
560+
estimate_original_image_bytes(image_url).unwrap_or(RESIZED_IMAGE_BYTES_ESTIMATE)
561+
}
562+
_ => RESIZED_IMAGE_BYTES_ESTIMATE,
563+
});
508564
}
509565
};
510566

511567
match item {
512568
ResponseItem::Message { content, .. } => {
513569
for content_item in content {
514570
if let ContentItem::InputImage { image_url } = content_item {
515-
accumulate(image_url);
571+
accumulate(image_url, None);
516572
}
517573
}
518574
}
519575
ResponseItem::FunctionCallOutput { output, .. }
520576
| ResponseItem::CustomToolCallOutput { output, .. } => {
521577
if let FunctionCallOutputBody::ContentItems(items) = &output.body {
522578
for content_item in items {
523-
if let FunctionCallOutputContentItem::InputImage { image_url } = content_item {
524-
accumulate(image_url);
579+
if let FunctionCallOutputContentItem::InputImage { image_url, detail } =
580+
content_item
581+
{
582+
accumulate(image_url, *detail);
525583
}
526584
}
527585
}
528586
}
529587
_ => {}
530588
}
531589

532-
(payload_bytes, image_count)
590+
(payload_bytes, replacement_bytes)
533591
}
534592

535593
fn is_model_generated_item(item: &ResponseItem) -> bool {

0 commit comments

Comments
 (0)