Skip to content

Commit 1b4cfc2

Browse files
anencore94 and claude authored
feat: add native Responses API support for hosted_vllm provider (#22298)
Register HostedVLLMResponsesAPIConfig so that litellm.responses(model="hosted_vllm/...") routes directly to vLLM's /v1/responses endpoint instead of falling back to the chat completions → responses conversion pipeline. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e3e9ac5 commit 1b4cfc2

File tree

5 files changed

+187
-19
lines changed

5 files changed

+187
-19
lines changed

litellm/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,6 +1516,7 @@ def set_global_gitlab_config(config: Dict[str, Any]) -> None:
15161516
from .llms.azure.completion.transformation import AzureOpenAITextConfig as AzureOpenAITextConfig
15171517
from .llms.hosted_vllm.chat.transformation import HostedVLLMChatConfig as HostedVLLMChatConfig
15181518
from .llms.hosted_vllm.embedding.transformation import HostedVLLMEmbeddingConfig as HostedVLLMEmbeddingConfig
1519+
from .llms.hosted_vllm.responses.transformation import HostedVLLMResponsesAPIConfig as HostedVLLMResponsesAPIConfig
15191520
from .llms.github_copilot.chat.transformation import GithubCopilotConfig as GithubCopilotConfig
15201521
from .llms.github_copilot.responses.transformation import GithubCopilotResponsesAPIConfig as GithubCopilotResponsesAPIConfig
15211522
from .llms.github_copilot.embedding.transformation import GithubCopilotEmbeddingConfig as GithubCopilotEmbeddingConfig

litellm/_lazy_imports_registry.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@
226226
"AzureOpenAIOSeriesResponsesAPIConfig",
227227
"XAIResponsesAPIConfig",
228228
"LiteLLMProxyResponsesAPIConfig",
229+
"HostedVLLMResponsesAPIConfig",
229230
"VolcEngineResponsesAPIConfig",
230231
"PerplexityResponsesConfig",
231232
"DatabricksResponsesAPIConfig",
@@ -897,6 +898,10 @@
897898
".llms.litellm_proxy.responses.transformation",
898899
"LiteLLMProxyResponsesAPIConfig",
899900
),
901+
"HostedVLLMResponsesAPIConfig": (
902+
".llms.hosted_vllm.responses.transformation",
903+
"HostedVLLMResponsesAPIConfig",
904+
),
900905
"VolcEngineResponsesAPIConfig": (
901906
".llms.volcengine.responses.transformation",
902907
"VolcEngineResponsesAPIConfig",
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""
2+
Responses API transformation for Hosted VLLM provider.
3+
4+
vLLM natively supports the OpenAI-compatible /v1/responses endpoint,
5+
so this config enables direct routing instead of falling back to
6+
the chat completions → responses conversion pipeline.
7+
"""
8+
9+
from typing import Optional
10+
11+
from litellm.llms.openai.responses.transformation import OpenAIResponsesAPIConfig
12+
from litellm.secret_managers.main import get_secret_str
13+
from litellm.types.router import GenericLiteLLMParams
14+
from litellm.types.utils import LlmProviders
15+
16+
17+
class HostedVLLMResponsesAPIConfig(OpenAIResponsesAPIConfig):
    """Responses API configuration for the hosted_vllm provider.

    Inherits the OpenAI transformation because vLLM exposes an
    OpenAI-compatible ``/v1/responses`` endpoint. Only three things
    differ from the parent: the provider identifier, base-URL
    resolution via HOSTED_VLLM_API_BASE, and the "fake-api-key"
    placeholder used when no key is configured (vLLM does not
    require authentication by default).
    """

    @property
    def custom_llm_provider(self) -> LlmProviders:
        # Identifies this config as belonging to the hosted_vllm provider.
        return LlmProviders.HOSTED_VLLM

    def validate_environment(
        self,
        headers: dict,
        model: str,
        litellm_params: Optional[GenericLiteLLMParams],
    ) -> dict:
        """Attach the Authorization header, defaulting the key for keyless vLLM.

        Mutates and returns *headers*. Resolution order for the key:
        explicit param -> HOSTED_VLLM_API_KEY secret -> placeholder.
        """
        params = litellm_params or GenericLiteLLMParams()
        # vllm does not require an api key, so fall through to a placeholder.
        resolved_key = (
            params.api_key
            or get_secret_str("HOSTED_VLLM_API_KEY")
            or "fake-api-key"
        )
        headers["Authorization"] = f"Bearer {resolved_key}"
        return headers

    def get_complete_url(
        self,
        api_base: Optional[str],
        litellm_params: dict,
    ) -> str:
        """Build the full responses-endpoint URL.

        Uses *api_base* or the HOSTED_VLLM_API_BASE secret; raises
        ValueError when neither is set.
        """
        base = api_base or get_secret_str("HOSTED_VLLM_API_BASE")
        if base is None:
            raise ValueError(
                "api_base not set for Hosted VLLM responses API. "
                "Set via api_base parameter or HOSTED_VLLM_API_BASE environment variable"
            )

        # Normalize trailing slashes, then avoid doubling the /v1 segment
        # when the configured base already ends with it.
        base = base.rstrip("/")
        suffix = "/responses" if base.endswith("/v1") else "/v1/responses"
        return f"{base}{suffix}"

litellm/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8310,6 +8310,8 @@ def get_provider_responses_api_config(
83108310
if model and "gpt" in model.lower():
83118311
return litellm.DatabricksResponsesAPIConfig()
83128312
return None
8313+
elif litellm.LlmProviders.HOSTED_VLLM == provider:
8314+
return litellm.HostedVLLMResponsesAPIConfig()
83138315
return None
83148316

83158317
@staticmethod

tests/test_litellm/llms/hosted_vllm/responses/test_hosted_vllm_responses.py

Lines changed: 108 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,27 +12,48 @@
1212
import sys
1313
from unittest.mock import MagicMock, patch
1414

15+
import pytest
16+
1517
sys.path.insert(
1618
0, os.path.abspath("../../../../..")
1719
) # Adds the parent directory to the system path
1820

1921
import litellm
22+
from litellm.llms.hosted_vllm.responses.transformation import (
23+
HostedVLLMResponsesAPIConfig,
24+
)
25+
from litellm.types.router import GenericLiteLLMParams
26+
from litellm.types.utils import LlmProviders
27+
from litellm.utils import ProviderConfigManager
2028

2129

22-
def _make_mock_chat_completion_response(content: str = "Hello! I'm doing well.") -> dict:
30+
def _make_mock_responses_api_response(content: str = "Hello! I'm doing well.") -> dict:
2331
return {
24-
"id": "chatcmpl-test123",
25-
"object": "chat.completion",
26-
"created": 1234567890,
32+
"id": "resp-test123",
33+
"object": "response",
34+
"created_at": 1234567890,
2735
"model": "Qwen/Qwen3-8B",
28-
"choices": [
36+
"output": [
2937
{
30-
"index": 0,
31-
"message": {"role": "assistant", "content": content},
32-
"finish_reason": "stop",
38+
"type": "message",
39+
"id": "msg-test123",
40+
"status": "completed",
41+
"role": "assistant",
42+
"content": [
43+
{
44+
"type": "output_text",
45+
"text": content,
46+
"annotations": [],
47+
}
48+
],
3349
}
3450
],
35-
"usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
51+
"status": "completed",
52+
"usage": {
53+
"input_tokens": 10,
54+
"output_tokens": 20,
55+
"total_tokens": 30,
56+
},
3657
}
3758

3859

@@ -49,18 +70,11 @@ def _make_mock_http_client(response_body: dict) -> MagicMock:
4970

5071
def test_hosted_vllm_responses_create_with_string_input():
5172
"""
52-
Regression test: responses.create() with string input must not raise
53-
TypeError: 'NoneType' object is not a mapping.
54-
55-
Root cause: extra_body=None was passed explicitly through the
56-
responses→completion pipeline. In add_provider_specific_params_to_optional_params(),
57-
passed_params.pop("extra_body", {}) returned None (key existed with value None),
58-
and **None raised TypeError at dict unpacking.
59-
60-
Fix: normalize None to {} for both extra_body and optional_params["extra_body"].
73+
Test that hosted_vllm routes directly to the native /v1/responses endpoint
74+
when the Responses API config is registered, and correctly parses the response.
6175
"""
6276
mock_client = _make_mock_http_client(
63-
_make_mock_chat_completion_response("I'm doing well, thanks!")
77+
_make_mock_responses_api_response("I'm doing well, thanks!")
6478
)
6579

6680
with patch(
@@ -101,3 +115,78 @@ def test_hosted_vllm_responses_create_with_explicit_none_extra_body():
101115

102116
# extra_body=None should be normalized to an empty dict (or absent)
103117
assert optional_params.get("extra_body") is not None or "extra_body" not in optional_params
118+
119+
120+
def test_hosted_vllm_provider_config_registration():
    """ProviderConfigManager must hand back the hosted_vllm Responses API config."""
    resolved = ProviderConfigManager.get_provider_responses_api_config(
        model="hosted_vllm/Qwen/Qwen3-8B",
        provider=LlmProviders.HOSTED_VLLM,
    )

    # A registered config means responses() routes natively instead of
    # falling back to the chat-completions conversion pipeline.
    assert resolved is not None
    assert isinstance(resolved, HostedVLLMResponsesAPIConfig)
    assert resolved.custom_llm_provider == LlmProviders.HOSTED_VLLM
130+
131+
132+
def test_hosted_vllm_responses_api_url():
    """get_complete_url() should yield <base>/v1/responses for every base shape."""
    config = HostedVLLMResponsesAPIConfig()
    expected = "http://localhost:8000/v1/responses"

    # Bare base, base already ending in /v1, and base with a trailing slash
    # must all normalize to the same endpoint URL.
    for base in (
        "http://localhost:8000",
        "http://localhost:8000/v1",
        "http://localhost:8000/v1/",
    ):
        assert config.get_complete_url(api_base=base, litellm_params={}) == expected
156+
157+
158+
def test_hosted_vllm_responses_api_url_requires_api_base():
    """A missing api_base must surface as a descriptive ValueError."""
    config = HostedVLLMResponsesAPIConfig()

    with pytest.raises(ValueError, match="api_base not set"):
        config.get_complete_url(api_base=None, litellm_params={})
167+
168+
169+
def test_hosted_vllm_validate_environment_default_api_key():
    """With no key configured, the Authorization header falls back to 'fake-api-key'."""
    config = HostedVLLMResponsesAPIConfig()

    result = config.validate_environment(
        headers={},
        model="Qwen/Qwen3-8B",
        litellm_params=GenericLiteLLMParams(),
    )

    assert result.get("Authorization") == "Bearer fake-api-key"
180+
181+
182+
def test_hosted_vllm_validate_environment_custom_api_key():
    """An explicitly supplied api_key must win over the placeholder."""
    config = HostedVLLMResponsesAPIConfig()

    result = config.validate_environment(
        headers={},
        model="Qwen/Qwen3-8B",
        litellm_params=GenericLiteLLMParams(api_key="my-custom-key"),
    )

    assert result.get("Authorization") == "Bearer my-custom-key"

0 commit comments

Comments
 (0)