From 065e6c9395cfd6cb7ce31dbd434c4a8933637f33 Mon Sep 17 00:00:00 2001 From: Dmitry Persiyanov Date: Mon, 7 Apr 2025 17:58:52 +0200 Subject: [PATCH 1/3] support --dimensions parameter in cli --- libs/infinity_emb/infinity_emb/args.py | 6 +++- libs/infinity_emb/infinity_emb/cli.py | 7 +++++ libs/infinity_emb/infinity_emb/engine.py | 1 + libs/infinity_emb/infinity_emb/env.py | 6 ++++ .../infinity_emb/inference/batch_handler.py | 6 ++++ .../infinity_emb/infinity_server.py | 1 + .../end_to_end/test_api_with_dummymodel.py | 29 +++++++++++++++++++ 7 files changed, 55 insertions(+), 1 deletion(-) diff --git a/libs/infinity_emb/infinity_emb/args.py b/libs/infinity_emb/infinity_emb/args.py index 9c4b26ec..a5bf07df 100644 --- a/libs/infinity_emb/infinity_emb/args.py +++ b/libs/infinity_emb/infinity_emb/args.py @@ -34,6 +34,7 @@ class EngineArgs: Args: model_name_or_path, str: Defaults to "michaelfeil/bge-small-en-v1.5". batch_size, int: Defaults to 32. + dimensions, int: Defaults to 0 (no matryoshka slicing). revision, str: Defaults to None. trust_remote_code, bool: Defaults to True. engine, InferenceEngine or str: backend for inference. @@ -54,6 +55,7 @@ class EngineArgs: model_name_or_path: str = MANAGER.model_id[0] batch_size: int = MANAGER.batch_size[0] + dimensions: int = MANAGER.dimensions[0] revision: Optional[str] = MANAGER.revision[0] trust_remote_code: bool = MANAGER.trust_remote_code[0] engine: InferenceEngine = InferenceEngine[MANAGER.engine[0]] @@ -148,6 +150,7 @@ def from_env(cls) -> list["EngineArgs"]: EngineArgs( model_name_or_path=model_name_or_path, batch_size=batch_size, + dimensions=dimensions, revision=revision, trust_remote_code=trust_remote_code, engine=engine, @@ -161,9 +164,10 @@ def from_env(cls) -> list["EngineArgs"]: embedding_dtype=embedding_dtype, served_model_name=served_model_name, ) - for model_name_or_path, batch_size, revision, trust_remote_code, engine, model_warmup, device, compile, bettertransformer, dtype, pooling_method, lengths_via_tokenize, embedding_dtype, served_model_name in zip_longest( + for model_name_or_path, batch_size, dimensions, revision, trust_remote_code, engine, model_warmup, device, compile, bettertransformer, dtype, pooling_method, lengths_via_tokenize, embedding_dtype, served_model_name in zip_longest( MANAGER.model_id, MANAGER.batch_size, + MANAGER.dimensions, MANAGER.revision, MANAGER.trust_remote_code, MANAGER.engine, diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py index 567f95f5..23f2bd42 100644 --- a/libs/infinity_emb/infinity_emb/cli.py +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -113,6 +113,7 @@ def v1( model_name_or_path: str = MANAGER.model_id[0], served_model_name: str = MANAGER.served_model_name[0], batch_size: int = MANAGER.batch_size[0], + dimensions: int = MANAGER.dimensions[0], revision: str = MANAGER.revision[0], trust_remote_code: bool = MANAGER.trust_remote_code[0], redirect_slash: str = MANAGER.redirect_slash, @@ -153,6 +154,7 @@ def v1( model_id=[model_name_or_path], served_model_name=[served_model_name], # type: ignore batch_size=[batch_size], + dimensions=[dimensions], revision=[revision], # type: ignore trust_remote_code=[trust_remote_code], engine=[engine], @@ -192,6 +194,9 @@ def v2( batch_size: list[int] = typer.Option( **_construct("batch_size"), help="maximum batch size for inference" ), + dimensions: list[int] = typer.Option( + **_construct("dimensions"), help="default dimensions for inference" + ), revision: list[str] = typer.Option( **_construct("revision"), 
help="huggingface model repo revision." ), @@ -285,6 +290,7 @@ def v2( Defaults to `INFINITY_MODEL_ID` served_model_name, list[str]: "", e.g. ["bge-small-en-v1.5"] batch_size, list[int]: batch size for forward pass. + dimensions, list[int]: default dimensions for inference. revision: list[str]: revision of the model. trust_remote_code, list[bool]: trust remote code. url_prefix, str: prefix for api. typically "". @@ -316,6 +322,7 @@ def v2( length=len(model_id), model_name_or_path=model_id, batch_size=batch_size, + dimensions=dimensions, revision=revision, trust_remote_code=trust_remote_code, engine=engine, diff --git a/libs/infinity_emb/infinity_emb/engine.py b/libs/infinity_emb/infinity_emb/engine.py index 153e15ba..bbc9dcd2 100644 --- a/libs/infinity_emb/infinity_emb/engine.py +++ b/libs/infinity_emb/infinity_emb/engine.py @@ -88,6 +88,7 @@ async def astart(self): self.running = True self._batch_handler = BatchHandler( max_batch_size=self._engine_args.batch_size, + matryoshka_dim=self._engine_args.dimensions, model_replicas=self._model_replicas, # batch_delay=self._min_inference_t / 2, vector_disk_cache_path=self._engine_args.vector_disk_cache_path, diff --git a/libs/infinity_emb/infinity_emb/env.py b/libs/infinity_emb/infinity_emb/env.py index a3261624..413856e6 100644 --- a/libs/infinity_emb/infinity_emb/env.py +++ b/libs/infinity_emb/infinity_emb/env.py @@ -107,6 +107,12 @@ def batch_size(self): self._optional_infinity_var_multiple("batch_size", default=["32"]) ) + @cached_property + def dimensions(self): + return self._to_int_multiple( + self._optional_infinity_var_multiple("dimensions", default=["0"]) + ) + @cached_property def revision(self): return self._optional_infinity_var_multiple("revision", default=[""]) diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py index 24a7c49b..0584d277 100644 --- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py +++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py @@ -79,6 +79,7 @@ def __init__( self, model_replicas: list["BaseTypeHint"], max_batch_size: int, + matryoshka_dim: Optional[int] = None, max_queue_wait: int = MANAGER.queue_size, batch_delay: float = 5e-3, vector_disk_cache_path: str = "", @@ -92,6 +93,7 @@ def __init__( Args: model (BaseTransformer): the base class of the model to be used max_batch_size (int): max batch size of dynamic batch size + matryoshka_dim (int, optional): default dimensions for matryoshka slicing. max_queue_wait (int, optional): max items to queue in the batch, default 32_000 batch_delay (float, optional): sleep in seconds, wait time for pre/post methods. 
Best result: setting to 1/2 the minimal expected @@ -112,6 +114,7 @@ def __init__( self._result_queue: Queue = Queue(8) self.max_batch_size = max_batch_size + self.matryoshka_dim = matryoshka_dim self._verbose = verbose self.batch_delay = batch_delay @@ -172,6 +175,7 @@ async def embed( input_sentences = [EmbeddingSingle(sentence=s) for s in sentences] embeddings, usage = await self._schedule(input_sentences) + matryoshka_dim = matryoshka_dim if matryoshka_dim else self.matryoshka_dim return matryososka_slice(embeddings, matryoshka_dim), usage async def rerank( @@ -278,6 +282,7 @@ async def image_embed( items = await resolve_images(images) embeddings, usage = await self._schedule(items) + matryoshka_dim = matryoshka_dim if matryoshka_dim else self.matryoshka_dim return matryososka_slice(embeddings, matryoshka_dim), usage async def audio_embed( @@ -308,6 +313,7 @@ async def audio_embed( getattr(self.model_worker[0]._model, "sampling_rate", -42), ) embeddings, usage = await self._schedule(items) + matryoshka_dim = matryoshka_dim if matryoshka_dim else self.matryoshka_dim return matryososka_slice(embeddings, matryoshka_dim), usage async def _schedule(self, list_queueitem: Sequence[AbstractSingle]) -> tuple[list[Any], int]: diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py index 43488012..80ea2815 100644 --- a/libs/infinity_emb/infinity_emb/infinity_server.py +++ b/libs/infinity_emb/infinity_emb/infinity_server.py @@ -57,6 +57,7 @@ def create_server( permissive_cors: bool = MANAGER.permissive_cors, api_key: str = MANAGER.api_key, proxy_root_path: str = MANAGER.proxy_root_path, + dimensions: int = MANAGER.dimensions, ): """ creates the FastAPI server for a set of EngineArgs. diff --git a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py index 1e8d1aa4..73bdfd4e 100644 --- a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py +++ b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py @@ -19,6 +19,8 @@ PREFIX = "" MODEL_NAME = "dummy-number-1" MODEL_NAME_2 = "dummy-number-2" +MODEL_NAME_3 = "dummy-number-3" +DEFAULT_DIMENSIONS = 5 BATCH_SIZE = 16 PATH_OPENAPI = pathlib.Path(__file__).parent.parent.parent.parent.parent.joinpath( @@ -38,6 +40,12 @@ batch_size=BATCH_SIZE, engine=InferenceEngine.debugengine, ), + EngineArgs( + model_name_or_path=MODEL_NAME_3, + batch_size=BATCH_SIZE, + dimensions=DEFAULT_DIMENSIONS, + engine=InferenceEngine.debugengine, + ), ], ) @@ -193,3 +201,24 @@ async def test_matryoshka_embedding(client): for embedding, sentence in zip(rdata["data"], inp): assert len(sentence) == embedding["embedding"][0] assert len(embedding["embedding"]) == matryoshka_dim + + +@pytest.mark.anyio +async def test_matryoshka_embedding_default_dimensions(client): + possible_inputs = [ + ["This is a test sentence."], + ["This is a test sentence.", "This is another test sentence."], + ] + for inp in possible_inputs: + response = await client.post( + f"{PREFIX}/embeddings", + json=dict(input=inp, model=MODEL_NAME_3), + ) + assert response.status_code == 200, f"{response.status_code}, {response.text}" + rdata = response.json() + assert "data" in rdata and isinstance(rdata["data"], list) + assert all("embedding" in d for d in rdata["data"]) + assert len(rdata["data"]) == len(inp) + for embedding, sentence in zip(rdata["data"], inp): + assert len(sentence) == embedding["embedding"][0] + assert len(embedding["embedding"]) == 
DEFAULT_DIMENSIONS \ No newline at end of file From 9f6f07fd624575d824891eb7da6b70116bbd6bc0 Mon Sep 17 00:00:00 2001 From: Dmitry Persiyanov Date: Mon, 7 Apr 2025 19:00:32 +0200 Subject: [PATCH 2/3] upd docs --- docs/docs/cli_v2.md | 255 +++++++++++++++++++------------------------- 1 file changed, 107 insertions(+), 148 deletions(-) diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index 0bae7fce..e319df25 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -6,153 +6,112 @@ $ infinity_emb v2 --help ``` ``` - - Usage: infinity_emb v2 [OPTIONS] - - Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil - Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` - - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && - INFINITY_BATCH_SIZE="8;4;" - - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size - 8` both models have batch-size 8. - -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --model-id TEXT Huggingface model repo id. │ -│ Subset of possible models: │ -│ https://huggingface.co/models… │ -│ [env var: `INFINITY_MODEL_ID`] │ -│ [default: │ -│ michaelfeil/bge-small-en-v1.5] │ -│ --served-model-name TEXT the nickname for the API, │ -│ under which the model_id can │ -│ be selected │ -│ [env var: │ -│ `INFINITY_SERVED_MODEL_NAME`] │ -│ --batch-size INTEGER maximum batch size for │ -│ inference │ -│ [env var: │ -│ `INFINITY_BATCH_SIZE`] │ -│ [default: 32] │ -│ --revision TEXT huggingface model repo │ -│ revision. │ -│ [env var: `INFINITY_REVISION`] │ -│ --trust-remote-code --no-trust-remote-code if potential remote modeling │ -│ code from huggingface repo is │ -│ trusted. │ -│ [env var: │ -│ `INFINITY_TRUST_REMOTE_CODE`] │ -│ [default: trust-remote-code] │ -│ --engine [torch|ctranslate2|optimum|ne Which backend to use. `torch` │ -│ uron|debugengine] uses Pytorch GPU/CPU, optimum │ -│ uses ONNX on │ -│ GPU/CPU/NVIDIA-TensorRT, │ -│ `CTranslate2` uses │ -│ torch+ctranslate2 on CPU/GPU. │ -│ [env var: `INFINITY_ENGINE`] │ -│ [default: torch] │ -│ --model-warmup --no-model-warmup if model should be warmed up │ -│ after startup, and before │ -│ ready. │ -│ [env var: │ -│ `INFINITY_MODEL_WARMUP`] │ -│ [default: model-warmup] │ -│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results │ -│ should be cached to SQLite for │ -│ latency improvement. │ -│ [env var: │ -│ `INFINITY_VECTOR_DISK_CACHE`] │ -│ [default: vector-disk-cache] │ -│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing │ -│ the model forward pass. │ -│ [env var: `INFINITY_DEVICE`] │ -│ [default: auto] │ -│ --device-id TEXT device id defines the model │ -│ placement. e.g. `0,1` will │ -│ place the model on │ -│ MPS/CUDA/GPU 0 and 1 each │ -│ [env var: │ -│ `INFINITY_DEVICE_ID`] │ -│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is │ -│ based on actual tokenizer │ -│ count. If false, uses │ -│ len(input) as proxy. │ -│ [env var: │ -│ `INFINITY_LENGTHS_VIA_TOKENIZ… │ -│ [default: │ -│ lengths-via-tokenize] │ -│ --dtype [float32|float16|bfloat16|int dtype for the model weights. │ -│ 8|fp8|auto] [env var: `INFINITY_DTYPE`] │ -│ [default: auto] │ -│ --embedding-dtype [float32|int8|uint8|binary|ub dtype post-forward pass. 
If != │ -│ inary] `float32`, using Post-Forward │ -│ Static quantization. │ -│ [env var: │ -│ `INFINITY_EMBEDDING_DTYPE`] │ -│ [default: float32] │ -│ --pooling-method [mean|cls|auto] overwrite the pooling method │ -│ if inferred incorrectly. │ -│ [env var: │ -│ `INFINITY_POOLING_METHOD`] │ -│ [default: auto] │ -│ --compile --no-compile Enable usage of │ -│ `torch.compile(dynamic=True)` │ -│ if engine relies on it. │ -│ [env var: `INFINITY_COMPILE`] │ -│ [default: compile] │ -│ --bettertransformer --no-bettertransformer Enables varlen │ -│ flash-attention-2 via the │ -│ `BetterTransformer` │ -│ implementation. If available │ -│ for this model. │ -│ [env var: │ -│ `INFINITY_BETTERTRANSFORMER`] │ -│ [default: bettertransformer] │ -│ --preload-only --no-preload-only If true, only downloads models │ -│ and verifies setup, then exit. │ -│ Recommended for pre-caching │ -│ the download in a Dockerfile. │ -│ [env var: │ -│ `INFINITY_PRELOAD_ONLY`] │ -│ [default: no-preload-only] │ -│ --host TEXT host for the FastAPI uvicorn │ -│ server │ -│ [env var: `INFINITY_HOST`] │ -│ [default: 0.0.0.0] │ -│ --port INTEGER port for the FastAPI uvicorn │ -│ server │ -│ [env var: `INFINITY_PORT`] │ -│ [default: 7997] │ -│ --url-prefix TEXT prefix for all routes of the │ -│ FastAPI uvicorn server. Useful │ -│ if you run behind a proxy / │ -│ cascaded API. │ -│ [env var: │ -│ `INFINITY_URL_PREFIX`] │ -│ --redirect-slash TEXT where to redirect `/` requests │ -│ to. │ -│ [env var: │ -│ `INFINITY_REDIRECT_SLASH`] │ -│ [default: /docs] │ -│ --log-level [critical|error|warning|info| console log level. │ -│ debug|trace] [env var: │ -│ `INFINITY_LOG_LEVEL`] │ -│ [default: info] │ -│ --permissive-cors --no-permissive-cors whether to allow permissive │ -│ cors. │ -│ [env var: │ -│ `INFINITY_PERMISSIVE_CORS`] │ -│ [default: no-permissive-cors] │ -│ --api-key TEXT api_key used for │ -│ authentication headers. │ -│ [env var: `INFINITY_API_KEY`] │ -│ --proxy-root-path TEXT Proxy prefix for the │ -│ application. See: │ -│ https://fastapi.tiangolo.com/… │ -│ [env var: │ -│ `INFINITY_PROXY_ROOT_PATH`] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ - + Usage: infinity_emb v2 [OPTIONS] + + Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil + Multiple Model CLI Playbook: + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` + - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" + - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. + +╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --model-id TEXT Huggingface model repo id. 
Subset of possible │ +│ models: │ +│ https://huggingface.co/models?other=text-embedd… │ +│ [env var: `INFINITY_MODEL_ID`] │ +│ [default: michaelfeil/bge-small-en-v1.5] │ +│ --served-model-name TEXT the nickname for the API, under which the │ +│ model_id can be selected │ +│ [env var: `INFINITY_SERVED_MODEL_NAME`] │ +│ --batch-size INTEGER maximum batch size for inference │ +│ [env var: `INFINITY_BATCH_SIZE`] │ +│ [default: 32] │ +│ --dimensions INTEGER default dimensions for inference │ +│ [env var: `INFINITY_DIMENSIONS`] │ +│ [default: 0] │ +│ --revision TEXT huggingface model repo revision. │ +│ [env var: `INFINITY_REVISION`] │ +│ --trust-remote-code --no-trust-remote-code if potential remote modeling code from │ +│ huggingface repo is trusted. │ +│ [env var: `INFINITY_TRUST_REMOTE_CODE`] │ +│ [default: trust-remote-code] │ +│ --engine [torch|ctranslate2|optimum|neuron|debugengine] Which backend to use. `torch` uses Pytorch │ +│ GPU/CPU, optimum uses ONNX on │ +│ GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses │ +│ torch+ctranslate2 on CPU/GPU. │ +│ [env var: `INFINITY_ENGINE`] │ +│ [default: torch] │ +│ --model-warmup --no-model-warmup if model should be warmed up after startup, and │ +│ before ready. │ +│ [env var: `INFINITY_MODEL_WARMUP`] │ +│ [default: model-warmup] │ +│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results should be cached to │ +│ SQLite for latency improvement. │ +│ [env var: `INFINITY_VECTOR_DISK_CACHE`] │ +│ [default: vector-disk-cache] │ +│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing the model forward │ +│ pass. │ +│ [env var: `INFINITY_DEVICE`] │ +│ [default: auto] │ +│ --device-id TEXT device id defines the model placement. e.g. │ +│ `0,1` will place the model on MPS/CUDA/GPU 0 and │ +│ 1 each │ +│ [env var: `INFINITY_DEVICE_ID`] │ +│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is based on actual │ +│ tokenizer count. If false, uses len(input) as │ +│ proxy. │ +│ [env var: `INFINITY_LENGTHS_VIA_TOKENIZE`] │ +│ [default: lengths-via-tokenize] │ +│ --dtype [float32|float16|bfloat16|int8|fp8|auto] dtype for the model weights. │ +│ [env var: `INFINITY_DTYPE`] │ +│ [default: auto] │ +│ --embedding-dtype [float32|int8|uint8|binary|ubinary] dtype post-forward pass. If != `float32`, using │ +│ Post-Forward Static quantization. │ +│ [env var: `INFINITY_EMBEDDING_DTYPE`] │ +│ [default: float32] │ +│ --pooling-method [mean|cls|auto] overwrite the pooling method if inferred │ +│ incorrectly. │ +│ [env var: `INFINITY_POOLING_METHOD`] │ +│ [default: auto] │ +│ --compile --no-compile Enable usage of `torch.compile(dynamic=True)` if │ +│ engine relies on it. │ +│ [env var: `INFINITY_COMPILE`] │ +│ [default: compile] │ +│ --bettertransformer --no-bettertransformer Enables varlen flash-attention-2 via the │ +│ `BetterTransformer` implementation. If available │ +│ for this model. │ +│ [env var: `INFINITY_BETTERTRANSFORMER`] │ +│ [default: bettertransformer] │ +│ --preload-only --no-preload-only If true, only downloads models and verifies │ +│ setup, then exit. Recommended for pre-caching │ +│ the download in a Dockerfile. │ +│ [env var: `INFINITY_PRELOAD_ONLY`] │ +│ [default: no-preload-only] │ +│ --host TEXT host for the FastAPI uvicorn server │ +│ [env var: `INFINITY_HOST`] │ +│ [default: 0.0.0.0] │ +│ --port INTEGER port for the FastAPI uvicorn server │ +│ [env var: `INFINITY_PORT`] │ +│ [default: 7997] │ +│ --url-prefix TEXT prefix for all routes of the FastAPI uvicorn │ +│ server. 
Useful if you run behind a proxy / │ +│ cascaded API. │ +│ [env var: `INFINITY_URL_PREFIX`] │ +│ --redirect-slash TEXT where to redirect `/` requests to. │ +│ [env var: `INFINITY_REDIRECT_SLASH`] │ +│ [default: /docs] │ +│ --log-level [critical|error|warning|info|debug|trace] console log level. │ +│ [env var: `INFINITY_LOG_LEVEL`] │ +│ [default: info] │ +│ --permissive-cors --no-permissive-cors whether to allow permissive cors. │ +│ [env var: `INFINITY_PERMISSIVE_CORS`] │ +│ [default: no-permissive-cors] │ +│ --api-key TEXT api_key used for authentication headers. │ +│ [env var: `INFINITY_API_KEY`] │ +│ --proxy-root-path TEXT Proxy prefix for the application. See: │ +│ https://fastapi.tiangolo.com/advanced/behind-a-… │ +│ [env var: `INFINITY_PROXY_ROOT_PATH`] │ +│ --help Show this message and exit. │ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` Note: This doc is auto-generated. Do not edit this file directly. From e9fdf0d20e0c6a75736860acea52a6484ba926f8 Mon Sep 17 00:00:00 2001 From: Dmitry Persiyanov Date: Mon, 7 Apr 2025 19:11:51 +0200 Subject: [PATCH 3/3] regen cli_v2.md --- docs/docs/cli_v2.md | 260 +++++++++++------- .../end_to_end/test_api_with_dummymodel.py | 2 +- 2 files changed, 154 insertions(+), 108 deletions(-) diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index e319df25..cca8d59f 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -6,112 +6,158 @@ $ infinity_emb v2 --help ``` ``` - Usage: infinity_emb v2 [OPTIONS] - - Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil - Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` - - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" - - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. - -╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --model-id TEXT Huggingface model repo id. Subset of possible │ -│ models: │ -│ https://huggingface.co/models?other=text-embedd… │ -│ [env var: `INFINITY_MODEL_ID`] │ -│ [default: michaelfeil/bge-small-en-v1.5] │ -│ --served-model-name TEXT the nickname for the API, under which the │ -│ model_id can be selected │ -│ [env var: `INFINITY_SERVED_MODEL_NAME`] │ -│ --batch-size INTEGER maximum batch size for inference │ -│ [env var: `INFINITY_BATCH_SIZE`] │ -│ [default: 32] │ -│ --dimensions INTEGER default dimensions for inference │ -│ [env var: `INFINITY_DIMENSIONS`] │ -│ [default: 0] │ -│ --revision TEXT huggingface model repo revision. │ -│ [env var: `INFINITY_REVISION`] │ -│ --trust-remote-code --no-trust-remote-code if potential remote modeling code from │ -│ huggingface repo is trusted. │ -│ [env var: `INFINITY_TRUST_REMOTE_CODE`] │ -│ [default: trust-remote-code] │ -│ --engine [torch|ctranslate2|optimum|neuron|debugengine] Which backend to use. `torch` uses Pytorch │ -│ GPU/CPU, optimum uses ONNX on │ -│ GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses │ -│ torch+ctranslate2 on CPU/GPU. │ -│ [env var: `INFINITY_ENGINE`] │ -│ [default: torch] │ -│ --model-warmup --no-model-warmup if model should be warmed up after startup, and │ -│ before ready. 
│ -│ [env var: `INFINITY_MODEL_WARMUP`] │ -│ [default: model-warmup] │ -│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results should be cached to │ -│ SQLite for latency improvement. │ -│ [env var: `INFINITY_VECTOR_DISK_CACHE`] │ -│ [default: vector-disk-cache] │ -│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing the model forward │ -│ pass. │ -│ [env var: `INFINITY_DEVICE`] │ -│ [default: auto] │ -│ --device-id TEXT device id defines the model placement. e.g. │ -│ `0,1` will place the model on MPS/CUDA/GPU 0 and │ -│ 1 each │ -│ [env var: `INFINITY_DEVICE_ID`] │ -│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is based on actual │ -│ tokenizer count. If false, uses len(input) as │ -│ proxy. │ -│ [env var: `INFINITY_LENGTHS_VIA_TOKENIZE`] │ -│ [default: lengths-via-tokenize] │ -│ --dtype [float32|float16|bfloat16|int8|fp8|auto] dtype for the model weights. │ -│ [env var: `INFINITY_DTYPE`] │ -│ [default: auto] │ -│ --embedding-dtype [float32|int8|uint8|binary|ubinary] dtype post-forward pass. If != `float32`, using │ -│ Post-Forward Static quantization. │ -│ [env var: `INFINITY_EMBEDDING_DTYPE`] │ -│ [default: float32] │ -│ --pooling-method [mean|cls|auto] overwrite the pooling method if inferred │ -│ incorrectly. │ -│ [env var: `INFINITY_POOLING_METHOD`] │ -│ [default: auto] │ -│ --compile --no-compile Enable usage of `torch.compile(dynamic=True)` if │ -│ engine relies on it. │ -│ [env var: `INFINITY_COMPILE`] │ -│ [default: compile] │ -│ --bettertransformer --no-bettertransformer Enables varlen flash-attention-2 via the │ -│ `BetterTransformer` implementation. If available │ -│ for this model. │ -│ [env var: `INFINITY_BETTERTRANSFORMER`] │ -│ [default: bettertransformer] │ -│ --preload-only --no-preload-only If true, only downloads models and verifies │ -│ setup, then exit. Recommended for pre-caching │ -│ the download in a Dockerfile. │ -│ [env var: `INFINITY_PRELOAD_ONLY`] │ -│ [default: no-preload-only] │ -│ --host TEXT host for the FastAPI uvicorn server │ -│ [env var: `INFINITY_HOST`] │ -│ [default: 0.0.0.0] │ -│ --port INTEGER port for the FastAPI uvicorn server │ -│ [env var: `INFINITY_PORT`] │ -│ [default: 7997] │ -│ --url-prefix TEXT prefix for all routes of the FastAPI uvicorn │ -│ server. Useful if you run behind a proxy / │ -│ cascaded API. │ -│ [env var: `INFINITY_URL_PREFIX`] │ -│ --redirect-slash TEXT where to redirect `/` requests to. │ -│ [env var: `INFINITY_REDIRECT_SLASH`] │ -│ [default: /docs] │ -│ --log-level [critical|error|warning|info|debug|trace] console log level. │ -│ [env var: `INFINITY_LOG_LEVEL`] │ -│ [default: info] │ -│ --permissive-cors --no-permissive-cors whether to allow permissive cors. │ -│ [env var: `INFINITY_PERMISSIVE_CORS`] │ -│ [default: no-permissive-cors] │ -│ --api-key TEXT api_key used for authentication headers. │ -│ [env var: `INFINITY_API_KEY`] │ -│ --proxy-root-path TEXT Proxy prefix for the application. See: │ -│ https://fastapi.tiangolo.com/advanced/behind-a-… │ -│ [env var: `INFINITY_PROXY_ROOT_PATH`] │ -│ --help Show this message and exit. │ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + + Usage: infinity_emb v2 [OPTIONS] + + Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil + Multiple Model CLI Playbook: + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` + - 2. 
or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && + INFINITY_BATCH_SIZE="8;4;" + - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size + 8` both models have batch-size 8. + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --model-id TEXT Huggingface model repo id. │ +│ Subset of possible models: │ +│ https://huggingface.co/models… │ +│ [env var: `INFINITY_MODEL_ID`] │ +│ [default: │ +│ michaelfeil/bge-small-en-v1.5] │ +│ --served-model-name TEXT the nickname for the API, │ +│ under which the model_id can │ +│ be selected │ +│ [env var: │ +│ `INFINITY_SERVED_MODEL_NAME`] │ +│ --batch-size INTEGER maximum batch size for │ +│ inference │ +│ [env var: │ +│ `INFINITY_BATCH_SIZE`] │ +│ [default: 32] │ +│ --dimensions INTEGER default dimensions for │ +│ inference │ +│ [env var: │ +│ `INFINITY_DIMENSIONS`] │ +│ [default: 0] │ +│ --revision TEXT huggingface model repo │ +│ revision. │ +│ [env var: `INFINITY_REVISION`] │ +│ --trust-remote-code --no-trust-remote-code if potential remote modeling │ +│ code from huggingface repo is │ +│ trusted. │ +│ [env var: │ +│ `INFINITY_TRUST_REMOTE_CODE`] │ +│ [default: trust-remote-code] │ +│ --engine [torch|ctranslate2|optimum|ne Which backend to use. `torch` │ +│ uron|debugengine] uses Pytorch GPU/CPU, optimum │ +│ uses ONNX on │ +│ GPU/CPU/NVIDIA-TensorRT, │ +│ `CTranslate2` uses │ +│ torch+ctranslate2 on CPU/GPU. │ +│ [env var: `INFINITY_ENGINE`] │ +│ [default: torch] │ +│ --model-warmup --no-model-warmup if model should be warmed up │ +│ after startup, and before │ +│ ready. │ +│ [env var: │ +│ `INFINITY_MODEL_WARMUP`] │ +│ [default: model-warmup] │ +│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results │ +│ should be cached to SQLite for │ +│ latency improvement. │ +│ [env var: │ +│ `INFINITY_VECTOR_DISK_CACHE`] │ +│ [default: vector-disk-cache] │ +│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing │ +│ the model forward pass. │ +│ [env var: `INFINITY_DEVICE`] │ +│ [default: auto] │ +│ --device-id TEXT device id defines the model │ +│ placement. e.g. `0,1` will │ +│ place the model on │ +│ MPS/CUDA/GPU 0 and 1 each │ +│ [env var: │ +│ `INFINITY_DEVICE_ID`] │ +│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is │ +│ based on actual tokenizer │ +│ count. If false, uses │ +│ len(input) as proxy. │ +│ [env var: │ +│ `INFINITY_LENGTHS_VIA_TOKENIZ… │ +│ [default: │ +│ lengths-via-tokenize] │ +│ --dtype [float32|float16|bfloat16|int dtype for the model weights. │ +│ 8|fp8|auto] [env var: `INFINITY_DTYPE`] │ +│ [default: auto] │ +│ --embedding-dtype [float32|int8|uint8|binary|ub dtype post-forward pass. If != │ +│ inary] `float32`, using Post-Forward │ +│ Static quantization. │ +│ [env var: │ +│ `INFINITY_EMBEDDING_DTYPE`] │ +│ [default: float32] │ +│ --pooling-method [mean|cls|auto] overwrite the pooling method │ +│ if inferred incorrectly. │ +│ [env var: │ +│ `INFINITY_POOLING_METHOD`] │ +│ [default: auto] │ +│ --compile --no-compile Enable usage of │ +│ `torch.compile(dynamic=True)` │ +│ if engine relies on it. │ +│ [env var: `INFINITY_COMPILE`] │ +│ [default: compile] │ +│ --bettertransformer --no-bettertransformer Enables varlen │ +│ flash-attention-2 via the │ +│ `BetterTransformer` │ +│ implementation. If available │ +│ for this model. 
│ +│ [env var: │ +│ `INFINITY_BETTERTRANSFORMER`] │ +│ [default: bettertransformer] │ +│ --preload-only --no-preload-only If true, only downloads models │ +│ and verifies setup, then exit. │ +│ Recommended for pre-caching │ +│ the download in a Dockerfile. │ +│ [env var: │ +│ `INFINITY_PRELOAD_ONLY`] │ +│ [default: no-preload-only] │ +│ --host TEXT host for the FastAPI uvicorn │ +│ server │ +│ [env var: `INFINITY_HOST`] │ +│ [default: 0.0.0.0] │ +│ --port INTEGER port for the FastAPI uvicorn │ +│ server │ +│ [env var: `INFINITY_PORT`] │ +│ [default: 7997] │ +│ --url-prefix TEXT prefix for all routes of the │ +│ FastAPI uvicorn server. Useful │ +│ if you run behind a proxy / │ +│ cascaded API. │ +│ [env var: │ +│ `INFINITY_URL_PREFIX`] │ +│ --redirect-slash TEXT where to redirect `/` requests │ +│ to. │ +│ [env var: │ +│ `INFINITY_REDIRECT_SLASH`] │ +│ [default: /docs] │ +│ --log-level [critical|error|warning|info| console log level. │ +│ debug|trace] [env var: │ +│ `INFINITY_LOG_LEVEL`] │ +│ [default: info] │ +│ --permissive-cors --no-permissive-cors whether to allow permissive │ +│ cors. │ +│ [env var: │ +│ `INFINITY_PERMISSIVE_CORS`] │ +│ [default: no-permissive-cors] │ +│ --api-key TEXT api_key used for │ +│ authentication headers. │ +│ [env var: `INFINITY_API_KEY`] │ +│ --proxy-root-path TEXT Proxy prefix for the │ +│ application. See: │ +│ https://fastapi.tiangolo.com/… │ +│ [env var: │ +│ `INFINITY_PROXY_ROOT_PATH`] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + ``` Note: This doc is auto-generated. Do not edit this file directly. diff --git a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py index 73bdfd4e..16d86ef4 100644 --- a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py +++ b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py @@ -221,4 +221,4 @@ async def test_matryoshka_embedding_default_dimensions(client): assert len(rdata["data"]) == len(inp) for embedding, sentence in zip(rdata["data"], inp): assert len(sentence) == embedding["embedding"][0] - assert len(embedding["embedding"]) == DEFAULT_DIMENSIONS \ No newline at end of file + assert len(embedding["embedding"]) == DEFAULT_DIMENSIONS
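
For readers following the `batch_handler.py` hunks: the existing `matryososka_slice` helper (the typo is in the codebase) truncates each embedding to its first `matryoshka_dim` components, and the new `self.matryoshka_dim` field supplies a server-wide default whenever a request does not pass one. Below is a minimal sketch of that behavior, assuming plain list-of-array embeddings; the function bodies are an illustrative reconstruction, not the library's exact code.

```python
from typing import Optional

import numpy as np


def matryoshka_slice(
    embeddings: list[np.ndarray], matryoshka_dim: Optional[int]
) -> list[np.ndarray]:
    # A falsy dim (None, or 0 -- the CLI default) means: keep full vectors.
    if not matryoshka_dim:
        return embeddings
    # Matryoshka-trained models pack the most information into the leading
    # components, so truncation is a plain prefix slice.
    return [e[:matryoshka_dim] for e in embeddings]


# Precedence added by this patch in embed()/image_embed()/audio_embed():
# a per-request value wins, otherwise the server-wide --dimensions default.
def effective_dim(request_dim: Optional[int], default: Optional[int]) -> Optional[int]:
    return request_dim if request_dim else default
```

Because the check is falsy rather than `is None`, `--dimensions 0` and an unset value are equivalent (no slicing), and a per-request `matryoshka_dim` always overrides the server default.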
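The `env.py` hunk wires `dimensions` into the same multi-model convention as `batch_size`: `INFINITY_DIMENSIONS` takes semicolon-separated values that `EngineArgs.from_env` zips against `INFINITY_MODEL_ID`. A hedged sketch of that convention follows, with `parse_int_multiple` as a hypothetical stand-in for the `_optional_infinity_var_multiple` + `_to_int_multiple` pair:

```python
import os


def parse_int_multiple(name: str, default: list[str]) -> list[int]:
    """Hypothetical stand-in for the MANAGER helpers: read INFINITY_<NAME>
    as semicolon-separated ints; a trailing ';' is tolerated."""
    raw = os.environ.get(f"INFINITY_{name.upper()}")
    parts = default if raw is None else [p for p in raw.split(";") if p.strip()]
    return [int(p) for p in parts]


# Two models, each with its own default dimension; 0 keeps full vectors for
# the second model (values are zipped against INFINITY_MODEL_ID entries).
os.environ["INFINITY_DIMENSIONS"] = "128;0;"
assert parse_int_multiple("dimensions", default=["0"]) == [128, 0]
```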
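End to end, the new argument flows `EngineArgs.dimensions` → `BatchHandler.matryoshka_dim`, so the setup in the new test can be mirrored in user code. A usage sketch against the Python API, under the same assumptions as the test — the `debugengine` dummy backend, an arbitrary model name, and import paths assumed from the library layout; whether a real model tolerates matryoshka truncation depends on how it was trained:

```python
import asyncio

from infinity_emb import AsyncEmbeddingEngine, EngineArgs
from infinity_emb.primitives import InferenceEngine


async def main() -> None:
    # `dimensions=5` becomes BatchHandler.matryoshka_dim, so embed() calls
    # without an explicit matryoshka_dim are sliced to 5 values by default.
    engine = AsyncEmbeddingEngine.from_args(
        EngineArgs(
            model_name_or_path="dummy-model",  # arbitrary name for the dummy backend
            batch_size=8,
            dimensions=5,
            engine=InferenceEngine.debugengine,
        )
    )
    async with engine:
        embeddings, _usage = await engine.embed(sentences=["hello world"])
        assert len(embeddings[0]) == 5


asyncio.run(main())
```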