Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 85 additions & 3 deletions internal/provider/openai/cost.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,33 @@ var OpenAiPerThousandTokenCost = map[string]map[string]float64{
},
"audio": {
"whisper-1": 0.006,
"tts-1": 0.015,
"tts-1-hd": 0.03,

"tts-1": 0.015,
"tts-1-hd": 0.03,

"gpt-4o-transcribe": 0.006,
"gpt-4o-transcribe-diarize": 0.006,
"gpt-4o-mini-transcribe": 0.003,

"gpt-4o-mini-tts": 0.012,
},
"transcription-input": {
"gpt-4o-transcribe": 0.0025,
"gpt-4o-transcribe-diarize": 0.0025,
"gpt-4o-mini-transcribe": 0.00125,
},
"transcription-output": {
"gpt-4o-transcribe": 0.01,
"gpt-4o-transcribe-diarize": 0.01,
"gpt-4o-mini-transcribe": 0.005,
},
"video": { // $ per sec
"sora-2": 0.1,
"sora-2-pro": 0.30,
"sora-2-720": 0.1,
"sora-2-pro-720": 0.30,
"sora-2-pro-1024": 0.5,
"sora-2-pro-1080": 0.7,
},
"completion": {
"gpt-image-1.5": 0.010,
Expand Down Expand Up @@ -649,7 +674,30 @@ func prepareGptImageQuality(quality string) (string, error) {
return quality, nil
}

func (ce *CostEstimator) EstimateTranscriptionCost(secs float64, model string) (float64, error) {
func (ce *CostEstimator) EstimateTranscriptionCost(secs float64, model string, usage *TranscriptionResponseUsage) (float64, error) {
if usage != nil {
inputTokens := usage.InputTokens
costMap, ok := ce.tokenCostMap["transcription-input"]
if !ok {
return 0, errors.New("transcription input token cost map is not provided")
}
inputCost, ok := costMap[model]
if !ok {
return 0, errors.New("model is not present in the transcription input token cost map")
}

outputTokens := usage.OutputTokens
costMap, ok = ce.tokenCostMap["transcription-output"]
if !ok {
return 0, errors.New("transcription output token cost map is not provided")
}
outputCost, ok := costMap[model]
if !ok {
return 0, errors.New("model is not present in the transcription output token cost map")
}

return (float64(inputTokens)/1000)*inputCost + (float64(outputTokens)/1000)*outputCost, nil
}
costMap, ok := ce.tokenCostMap["audio"]
if !ok {
return 0, errors.New("audio cost map is not provided")
Expand Down Expand Up @@ -769,6 +817,40 @@ func (ce *CostEstimator) EstimateResponseApiToolCreateContainerCost(req *Respons
return totalCost, nil
}

func (ce *CostEstimator) EstimateVideoCost(metadata *VideoResponseMetadata) (float64, error) {
if metadata == nil {
return 0, errors.New("metadata is nil")
}
costMap, ok := ce.tokenCostMap["video"]
if !ok {
return 0, errors.New("video cost map is not provided")
}
model := metadata.Model
size, err := normalizedVideoSize(metadata.Size)
if err != nil {
return 0, err
}
costKey := fmt.Sprintf("%s-%s", model, size)
cost, ok := costMap[costKey]
if !ok {
return 0, errors.New("model with provided size is not present in the video cost map")
}
return cost * metadata.GetSecondsAsFloat(), nil
Comment on lines +820 to +838
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Support size-less video pricing before forcing model-size lookup.

The new cost map contains plain keys like sora-2 and sora-2-pro, but this implementation always requires metadata.Size and always looks up model-size. Any response without a size will now error and record $0, even though you already have a fallback price configured.

💡 Suggested fix
 func (ce *CostEstimator) EstimateVideoCost(metadata *VideoResponseMetadata) (float64, error) {
 	if metadata == nil {
 		return 0, errors.New("metadata is nil")
 	}
 	costMap, ok := ce.tokenCostMap["video"]
 	if !ok {
 		return 0, errors.New("video cost map is not provided")
 	}
 	model := metadata.Model
-	size, err := normalizedVideoSize(metadata.Size)
-	if err != nil {
-		return 0, err
-	}
-	costKey := fmt.Sprintf("%s-%s", model, size)
-	cost, ok := costMap[costKey]
+	costKey := model
+	if metadata.Size != "" {
+		size, err := normalizedVideoSize(metadata.Size)
+		if err != nil {
+			return 0, err
+		}
+		costKey = fmt.Sprintf("%s-%s", model, size)
+	}
+	cost, ok := costMap[costKey]
 	if !ok {
 		return 0, errors.New("model with provided size is not present in the video cost map")
 	}
 	return cost * metadata.GetSecondsAsFloat(), nil
 }

Also applies to: 841-852

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/provider/openai/cost.go` around lines 820 - 838, EstimateVideoCost
currently mandates a model-size lookup and errors when metadata.Size is absent,
but the cost map may contain fallback keys like "sora-2" (model-only). Change
EstimateVideoCost to try lookups in order: 1) if size is present/normalized, try
"model-size"; 2) if that fails (or size missing/normalization returns an
empty/expected-error), try the model-only key "model"; and only return an error
if neither key exists in ce.tokenCostMap["video"]. Handle normalization errors
by treating missing size as absent (do not immediately return), and update the
same lookup logic in the analogous image pricing function (the one around lines
841-852) so both video and image cost resolution use the model-then-model-only
fallback.

}

// normalizedVideoSize maps a raw "WxH" resolution string onto the short
// height label used as the suffix of video cost-map keys ("720", "1024",
// "1080"). Portrait and landscape orientations of the same resolution map to
// the same label; any other input yields an error.
func normalizedVideoSize(size string) (string, error) {
	if size == "720x1280" || size == "1280x720" {
		return "720", nil
	}
	if size == "1024x1792" || size == "1792x1024" {
		return "1024", nil
	}
	if size == "1080x1920" || size == "1920x1080" {
		return "1080", nil
	}
	return "", errors.New("size is not valid")
}

var reasoningModelPrefix = []string{"gpt-5", "o1", "o2", "o3"}

func extendedToolType(toolType, model string) string {
Expand Down
57 changes: 57 additions & 0 deletions internal/provider/openai/types.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package openai

import "strconv"

type ResponseRequest struct {
Background *bool `json:"background,omitzero"`
Conversation *any `json:"conversation,omitzero"`
Expand Down Expand Up @@ -89,3 +91,58 @@ type ImageResponseMetadata struct {
Size string `json:"size,omitempty"`
Usage ImageResponseUsage `json:"usage,omitempty"`
}

// VideoResponseMetadata carries the billing-relevant fields of a video
// generation response: the model used, the output resolution ("WxH"), and
// the clip duration as a decimal string.
type VideoResponseMetadata struct {
	Model   string `json:"model,omitempty"`
	Size    string `json:"size,omitempty"`
	Seconds string `json:"seconds,omitempty"`
}

// GetSecondsAsFloat parses the Seconds field into a float64.
// NOTE(review): a missing or malformed value is coerced to 0, so downstream
// cost estimates treat such clips as zero-length — confirm this is intended.
func (v *VideoResponseMetadata) GetSecondsAsFloat() float64 {
	secs, err := strconv.ParseFloat(v.Seconds, 64)
	if err != nil {
		return 0
	}
	return secs
}
Comment on lines +101 to +106
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don't silently coerce invalid video duration to $0.

If seconds is missing or malformed, this returns 0 and EstimateVideoCost under-bills without any error. Please return an error here, or make EstimateVideoCost validate the raw field before multiplying.

💡 Suggested direction
-func (v *VideoResponseMetadata) GetSecondsAsFloat() float64 {
-	if secondsFloat, err := strconv.ParseFloat(v.Seconds, 64); err == nil {
-		return secondsFloat
-	}
-	return 0
+func (v *VideoResponseMetadata) GetSecondsAsFloat() (float64, error) {
+	return strconv.ParseFloat(v.Seconds, 64)
 }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/provider/openai/types.go` around lines 101 - 106, The current
GetSecondsAsFloat silently returns 0 on parse failure which causes
EstimateVideoCost to under-bill; change GetSecondsAsFloat to return (float64,
error) (or add a new GetSecondsAsFloatSafe that returns (float64, error)) and
propagate/handle the error in callers like EstimateVideoCost and any other call
sites, validating v.Seconds before using it and returning/propagating the parse
error instead of treating malformed or missing seconds as 0 so billing is
correct.


// TranscriptionResponseUsageInputTokenDetails breaks the input token count
// down by modality (text vs. audio).
type TranscriptionResponseUsageInputTokenDetails struct {
	TextTokens  int `json:"text_tokens,omitempty"`
	AudioTokens int `json:"audio_tokens,omitempty"`
}

// TranscriptionResponseUsage is the token-usage payload attached to a
// transcription response or to the final chunk of a transcription stream.
type TranscriptionResponseUsage struct {
	Type              string                                      `json:"type"`
	TotalTokens       int                                         `json:"total_tokens,omitempty"`
	InputTokens       int                                         `json:"input_tokens,omitempty"`
	InputTokenDetails TranscriptionResponseUsageInputTokenDetails `json:"input_token_details,omitempty"`
	OutputTokens      int                                         `json:"output_tokens,omitempty"`
}

// TranscriptionResponse is a non-streaming transcription result.
type TranscriptionResponse struct {
	Text  string                     `json:"text,omitempty"`
	Usage TranscriptionResponseUsage `json:"usage,omitempty"`
}

// TranscriptionStreamChunk is a single server-sent event of a transcription
// stream. Type discriminates delta, segment, and done events; Usage is
// populated on the terminal chunk.
type TranscriptionStreamChunk struct {
	Type  string                     `json:"type"`
	Delta string                     `json:"delta,omitempty"`
	Text  string                     `json:"text,omitempty"`
	Usage TranscriptionResponseUsage `json:"usage,omitempty"`
}

// IsDone reports whether this chunk terminates the stream.
func (c *TranscriptionStreamChunk) IsDone() bool {
	return c.Type == "transcript.text.done"
}

// IsDelta reports whether this chunk carries an incremental text fragment.
func (c *TranscriptionStreamChunk) IsDelta() bool {
	return c.Type == "transcript.text.delta"
}

// IsSegment reports whether this chunk is a segment event.
func (c *TranscriptionStreamChunk) IsSegment() bool {
	return c.Type == "transcript.text.segment"
}

// GetText returns the chunk's payload text: the incremental Delta for delta
// events, the Text field for everything else.
func (c *TranscriptionStreamChunk) GetText() string {
	text := c.Text
	if c.IsDelta() {
		text = c.Delta
	}
	return text
}
15 changes: 13 additions & 2 deletions internal/server/web/proxy/audio.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,12 @@ func getContentType(format string) string {

func getTranscriptionsHandler(prod bool, client http.Client, e estimator) gin.HandlerFunc {
return func(c *gin.Context) {
model := c.PostForm("model")
if model == "gpt-4o-transcribe" || model == "gpt-4o-transcribe-diarize" || model == "gpt-4o-mini-transcribe" {
processGPTTranscriptions(c, prod, client, e, model)
return
}

log := util.GetLogFromCtx(c)
telemetry.Incr("bricksllm.proxy.get_transcriptions_handler.requests", nil, 1)

Expand Down Expand Up @@ -291,7 +297,7 @@ func getTranscriptionsHandler(prod bool, client http.Client, e estimator) gin.Ha
}

if err == nil {
cost, err := e.EstimateTranscriptionCost(ar.Duration, c.GetString("model"))
cost, err := e.EstimateTranscriptionCost(ar.Duration, c.GetString("model"), nil)
if err != nil {
telemetry.Incr("bricksllm.proxy.get_transcriptions_handler.estimate_total_cost_error", nil, 1)
logError(log, "error when estimating openai cost", prod, err)
Expand Down Expand Up @@ -333,6 +339,11 @@ func getTranscriptionsHandler(prod bool, client http.Client, e estimator) gin.Ha

func getTranslationsHandler(prod bool, client http.Client, e estimator) gin.HandlerFunc {
return func(c *gin.Context) {
model := c.PostForm("model")
if model == "gpt-4o-transcribe" || model == "gpt-4o-transcribe-diarize" || model == "gpt-4o-mini-transcribe" {
processGPTTranslations(c, prod, client, e, model)
return
}
log := util.GetLogFromCtx(c)
telemetry.Incr("bricksllm.proxy.get_translations_handler.requests", nil, 1)

Expand Down Expand Up @@ -451,7 +462,7 @@ func getTranslationsHandler(prod bool, client http.Client, e estimator) gin.Hand
}

if err == nil {
cost, err := e.EstimateTranscriptionCost(ar.Duration, c.GetString("model"))
cost, err := e.EstimateTranscriptionCost(ar.Duration, c.GetString("model"), nil)
if err != nil {
telemetry.Incr("bricksllm.proxy.get_translations_handler.estimate_total_cost_error", nil, 1)
logError(log, "error when estimating openai cost", prod, err)
Expand Down
Loading
Loading