From 065e6c9395cfd6cb7ce31dbd434c4a8933637f33 Mon Sep 17 00:00:00 2001 From: Dmitry Persiyanov Date: Mon, 7 Apr 2025 17:58:52 +0200 Subject: [PATCH 1/3] support --dimensions parameter in cli --- libs/infinity_emb/infinity_emb/args.py | 6 +++- libs/infinity_emb/infinity_emb/cli.py | 7 +++++ libs/infinity_emb/infinity_emb/engine.py | 1 + libs/infinity_emb/infinity_emb/env.py | 6 ++++ .../infinity_emb/inference/batch_handler.py | 6 ++++ .../infinity_emb/infinity_server.py | 1 + .../end_to_end/test_api_with_dummymodel.py | 29 +++++++++++++++++++ 7 files changed, 55 insertions(+), 1 deletion(-) diff --git a/libs/infinity_emb/infinity_emb/args.py b/libs/infinity_emb/infinity_emb/args.py index 9c4b26ec..a5bf07df 100644 --- a/libs/infinity_emb/infinity_emb/args.py +++ b/libs/infinity_emb/infinity_emb/args.py @@ -34,6 +34,7 @@ class EngineArgs: Args: model_name_or_path, str: Defaults to "michaelfeil/bge-small-en-v1.5". batch_size, int: Defaults to 32. + dimensions, int: Defaults to 0 (no matryoshka slicing). revision, str: Defaults to None. trust_remote_code, bool: Defaults to True. engine, InferenceEngine or str: backend for inference. @@ -54,6 +55,7 @@ class EngineArgs: model_name_or_path: str = MANAGER.model_id[0] batch_size: int = MANAGER.batch_size[0] + dimensions: int = MANAGER.dimensions[0] revision: Optional[str] = MANAGER.revision[0] trust_remote_code: bool = MANAGER.trust_remote_code[0] engine: InferenceEngine = InferenceEngine[MANAGER.engine[0]] @@ -148,6 +150,7 @@ def from_env(cls) -> list["EngineArgs"]: EngineArgs( model_name_or_path=model_name_or_path, batch_size=batch_size, + dimensions=dimensions, revision=revision, trust_remote_code=trust_remote_code, engine=engine, @@ -161,9 +164,10 @@ def from_env(cls) -> list["EngineArgs"]: embedding_dtype=embedding_dtype, served_model_name=served_model_name, ) - for model_name_or_path, batch_size, revision, trust_remote_code, engine, model_warmup, device, compile, bettertransformer, dtype, pooling_method, lengths_via_tokenize, embedding_dtype, served_model_name in zip_longest( + for model_name_or_path, batch_size, dimensions, revision, trust_remote_code, engine, model_warmup, device, compile, bettertransformer, dtype, pooling_method, lengths_via_tokenize, embedding_dtype, served_model_name in zip_longest( MANAGER.model_id, MANAGER.batch_size, + MANAGER.dimensions, MANAGER.revision, MANAGER.trust_remote_code, MANAGER.engine, diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py index 567f95f5..23f2bd42 100644 --- a/libs/infinity_emb/infinity_emb/cli.py +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -113,6 +113,7 @@ def v1( model_name_or_path: str = MANAGER.model_id[0], served_model_name: str = MANAGER.served_model_name[0], batch_size: int = MANAGER.batch_size[0], + dimensions: int = MANAGER.dimensions[0], revision: str = MANAGER.revision[0], trust_remote_code: bool = MANAGER.trust_remote_code[0], redirect_slash: str = MANAGER.redirect_slash, @@ -153,6 +154,7 @@ def v1( model_id=[model_name_or_path], served_model_name=[served_model_name], # type: ignore batch_size=[batch_size], + dimensions=[dimensions], revision=[revision], # type: ignore trust_remote_code=[trust_remote_code], engine=[engine], @@ -192,6 +194,9 @@ def v2( batch_size: list[int] = typer.Option( **_construct("batch_size"), help="maximum batch size for inference" ), + dimensions: list[int] = typer.Option( + **_construct("dimensions"), help="default dimensions for inference" + ), revision: list[str] = typer.Option( **_construct("revision"), 
help="huggingface model repo revision." ), @@ -285,6 +290,7 @@ def v2( Defaults to `INFINITY_MODEL_ID` served_model_name, list[str]: "", e.g. ["bge-small-en-v1.5"] batch_size, list[int]: batch size for forward pass. + dimensions, list[int]: default dimensions for inference. revision: list[str]: revision of the model. trust_remote_code, list[bool]: trust remote code. url_prefix, str: prefix for api. typically "". @@ -316,6 +322,7 @@ def v2( length=len(model_id), model_name_or_path=model_id, batch_size=batch_size, + dimensions=dimensions, revision=revision, trust_remote_code=trust_remote_code, engine=engine, diff --git a/libs/infinity_emb/infinity_emb/engine.py b/libs/infinity_emb/infinity_emb/engine.py index 153e15ba..bbc9dcd2 100644 --- a/libs/infinity_emb/infinity_emb/engine.py +++ b/libs/infinity_emb/infinity_emb/engine.py @@ -88,6 +88,7 @@ async def astart(self): self.running = True self._batch_handler = BatchHandler( max_batch_size=self._engine_args.batch_size, + matryoshka_dim=self._engine_args.dimensions, model_replicas=self._model_replicas, # batch_delay=self._min_inference_t / 2, vector_disk_cache_path=self._engine_args.vector_disk_cache_path, diff --git a/libs/infinity_emb/infinity_emb/env.py b/libs/infinity_emb/infinity_emb/env.py index a3261624..413856e6 100644 --- a/libs/infinity_emb/infinity_emb/env.py +++ b/libs/infinity_emb/infinity_emb/env.py @@ -107,6 +107,12 @@ def batch_size(self): self._optional_infinity_var_multiple("batch_size", default=["32"]) ) + @cached_property + def dimensions(self): + return self._to_int_multiple( + self._optional_infinity_var_multiple("dimensions", default=["0"]) + ) + @cached_property def revision(self): return self._optional_infinity_var_multiple("revision", default=[""]) diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py index 24a7c49b..0584d277 100644 --- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py +++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py @@ -79,6 +79,7 @@ def __init__( self, model_replicas: list["BaseTypeHint"], max_batch_size: int, + matryoshka_dim: Optional[int] = None, max_queue_wait: int = MANAGER.queue_size, batch_delay: float = 5e-3, vector_disk_cache_path: str = "", @@ -92,6 +93,7 @@ def __init__( Args: model (BaseTransformer): the base class of the model to be used max_batch_size (int): max batch size of dynamic batch size + matryoshka_dim (int, optional): default dimensions for matryoshka slicing. max_queue_wait (int, optional): max items to queue in the batch, default 32_000 batch_delay (float, optional): sleep in seconds, wait time for pre/post methods. 
Best result: setting to 1/2 the minimal expected @@ -112,6 +114,7 @@ def __init__( self._result_queue: Queue = Queue(8) self.max_batch_size = max_batch_size + self.matryoshka_dim = matryoshka_dim self._verbose = verbose self.batch_delay = batch_delay @@ -172,6 +175,7 @@ async def embed( input_sentences = [EmbeddingSingle(sentence=s) for s in sentences] embeddings, usage = await self._schedule(input_sentences) + matryoshka_dim = matryoshka_dim if matryoshka_dim else self.matryoshka_dim return matryososka_slice(embeddings, matryoshka_dim), usage async def rerank( @@ -278,6 +282,7 @@ async def image_embed( items = await resolve_images(images) embeddings, usage = await self._schedule(items) + matryoshka_dim = matryoshka_dim if matryoshka_dim else self.matryoshka_dim return matryososka_slice(embeddings, matryoshka_dim), usage async def audio_embed( @@ -308,6 +313,7 @@ async def audio_embed( getattr(self.model_worker[0]._model, "sampling_rate", -42), ) embeddings, usage = await self._schedule(items) + matryoshka_dim = matryoshka_dim if matryoshka_dim else self.matryoshka_dim return matryososka_slice(embeddings, matryoshka_dim), usage async def _schedule(self, list_queueitem: Sequence[AbstractSingle]) -> tuple[list[Any], int]: diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py index 43488012..80ea2815 100644 --- a/libs/infinity_emb/infinity_emb/infinity_server.py +++ b/libs/infinity_emb/infinity_emb/infinity_server.py @@ -57,6 +57,7 @@ def create_server( permissive_cors: bool = MANAGER.permissive_cors, api_key: str = MANAGER.api_key, proxy_root_path: str = MANAGER.proxy_root_path, + dimensions: int = MANAGER.dimensions, ): """ creates the FastAPI server for a set of EngineArgs. diff --git a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py index 1e8d1aa4..73bdfd4e 100644 --- a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py +++ b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py @@ -19,6 +19,8 @@ PREFIX = "" MODEL_NAME = "dummy-number-1" MODEL_NAME_2 = "dummy-number-2" +MODEL_NAME_3 = "dummy-number-3" +DEFAULT_DIMENSIONS = 5 BATCH_SIZE = 16 PATH_OPENAPI = pathlib.Path(__file__).parent.parent.parent.parent.parent.joinpath( @@ -38,6 +40,12 @@ batch_size=BATCH_SIZE, engine=InferenceEngine.debugengine, ), + EngineArgs( + model_name_or_path=MODEL_NAME_3, + batch_size=BATCH_SIZE, + dimensions=DEFAULT_DIMENSIONS, + engine=InferenceEngine.debugengine, + ), ], ) @@ -193,3 +201,24 @@ async def test_matryoshka_embedding(client): for embedding, sentence in zip(rdata["data"], inp): assert len(sentence) == embedding["embedding"][0] assert len(embedding["embedding"]) == matryoshka_dim + + +@pytest.mark.anyio +async def test_matryoshka_embedding_default_dimensions(client): + possible_inputs = [ + ["This is a test sentence."], + ["This is a test sentence.", "This is another test sentence."], + ] + for inp in possible_inputs: + response = await client.post( + f"{PREFIX}/embeddings", + json=dict(input=inp, model=MODEL_NAME_3), + ) + assert response.status_code == 200, f"{response.status_code}, {response.text}" + rdata = response.json() + assert "data" in rdata and isinstance(rdata["data"], list) + assert all("embedding" in d for d in rdata["data"]) + assert len(rdata["data"]) == len(inp) + for embedding, sentence in zip(rdata["data"], inp): + assert len(sentence) == embedding["embedding"][0] + assert len(embedding["embedding"]) == 
DEFAULT_DIMENSIONS \ No newline at end of file From 9f6f07fd624575d824891eb7da6b70116bbd6bc0 Mon Sep 17 00:00:00 2001 From: Dmitry Persiyanov Date: Mon, 7 Apr 2025 19:00:32 +0200 Subject: [PATCH 2/3] upd docs --- docs/docs/cli_v2.md | 255 +++++++++++++++++++------------------------- 1 file changed, 107 insertions(+), 148 deletions(-) diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index 0bae7fce..e319df25 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -6,153 +6,112 @@ $ infinity_emb v2 --help ``` ``` - - Usage: infinity_emb v2 [OPTIONS] - - Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil - Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` - - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && - INFINITY_BATCH_SIZE="8;4;" - - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size - 8` both models have batch-size 8. - -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --model-id TEXT Huggingface model repo id. │ -│ Subset of possible models: │ -│ https://huggingface.co/models… │ -│ [env var: `INFINITY_MODEL_ID`] │ -│ [default: │ -│ michaelfeil/bge-small-en-v1.5] │ -│ --served-model-name TEXT the nickname for the API, │ -│ under which the model_id can │ -│ be selected │ -│ [env var: │ -│ `INFINITY_SERVED_MODEL_NAME`] │ -│ --batch-size INTEGER maximum batch size for │ -│ inference │ -│ [env var: │ -│ `INFINITY_BATCH_SIZE`] │ -│ [default: 32] │ -│ --revision TEXT huggingface model repo │ -│ revision. │ -│ [env var: `INFINITY_REVISION`] │ -│ --trust-remote-code --no-trust-remote-code if potential remote modeling │ -│ code from huggingface repo is │ -│ trusted. │ -│ [env var: │ -│ `INFINITY_TRUST_REMOTE_CODE`] │ -│ [default: trust-remote-code] │ -│ --engine [torch|ctranslate2|optimum|ne Which backend to use. `torch` │ -│ uron|debugengine] uses Pytorch GPU/CPU, optimum │ -│ uses ONNX on │ -│ GPU/CPU/NVIDIA-TensorRT, │ -│ `CTranslate2` uses │ -│ torch+ctranslate2 on CPU/GPU. │ -│ [env var: `INFINITY_ENGINE`] │ -│ [default: torch] │ -│ --model-warmup --no-model-warmup if model should be warmed up │ -│ after startup, and before │ -│ ready. │ -│ [env var: │ -│ `INFINITY_MODEL_WARMUP`] │ -│ [default: model-warmup] │ -│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results │ -│ should be cached to SQLite for │ -│ latency improvement. │ -│ [env var: │ -│ `INFINITY_VECTOR_DISK_CACHE`] │ -│ [default: vector-disk-cache] │ -│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing │ -│ the model forward pass. │ -│ [env var: `INFINITY_DEVICE`] │ -│ [default: auto] │ -│ --device-id TEXT device id defines the model │ -│ placement. e.g. `0,1` will │ -│ place the model on │ -│ MPS/CUDA/GPU 0 and 1 each │ -│ [env var: │ -│ `INFINITY_DEVICE_ID`] │ -│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is │ -│ based on actual tokenizer │ -│ count. If false, uses │ -│ len(input) as proxy. │ -│ [env var: │ -│ `INFINITY_LENGTHS_VIA_TOKENIZ… │ -│ [default: │ -│ lengths-via-tokenize] │ -│ --dtype [float32|float16|bfloat16|int dtype for the model weights. │ -│ 8|fp8|auto] [env var: `INFINITY_DTYPE`] │ -│ [default: auto] │ -│ --embedding-dtype [float32|int8|uint8|binary|ub dtype post-forward pass. 
If != │ -│ inary] `float32`, using Post-Forward │ -│ Static quantization. │ -│ [env var: │ -│ `INFINITY_EMBEDDING_DTYPE`] │ -│ [default: float32] │ -│ --pooling-method [mean|cls|auto] overwrite the pooling method │ -│ if inferred incorrectly. │ -│ [env var: │ -│ `INFINITY_POOLING_METHOD`] │ -│ [default: auto] │ -│ --compile --no-compile Enable usage of │ -│ `torch.compile(dynamic=True)` │ -│ if engine relies on it. │ -│ [env var: `INFINITY_COMPILE`] │ -│ [default: compile] │ -│ --bettertransformer --no-bettertransformer Enables varlen │ -│ flash-attention-2 via the │ -│ `BetterTransformer` │ -│ implementation. If available │ -│ for this model. │ -│ [env var: │ -│ `INFINITY_BETTERTRANSFORMER`] │ -│ [default: bettertransformer] │ -│ --preload-only --no-preload-only If true, only downloads models │ -│ and verifies setup, then exit. │ -│ Recommended for pre-caching │ -│ the download in a Dockerfile. │ -│ [env var: │ -│ `INFINITY_PRELOAD_ONLY`] │ -│ [default: no-preload-only] │ -│ --host TEXT host for the FastAPI uvicorn │ -│ server │ -│ [env var: `INFINITY_HOST`] │ -│ [default: 0.0.0.0] │ -│ --port INTEGER port for the FastAPI uvicorn │ -│ server │ -│ [env var: `INFINITY_PORT`] │ -│ [default: 7997] │ -│ --url-prefix TEXT prefix for all routes of the │ -│ FastAPI uvicorn server. Useful │ -│ if you run behind a proxy / │ -│ cascaded API. │ -│ [env var: │ -│ `INFINITY_URL_PREFIX`] │ -│ --redirect-slash TEXT where to redirect `/` requests │ -│ to. │ -│ [env var: │ -│ `INFINITY_REDIRECT_SLASH`] │ -│ [default: /docs] │ -│ --log-level [critical|error|warning|info| console log level. │ -│ debug|trace] [env var: │ -│ `INFINITY_LOG_LEVEL`] │ -│ [default: info] │ -│ --permissive-cors --no-permissive-cors whether to allow permissive │ -│ cors. │ -│ [env var: │ -│ `INFINITY_PERMISSIVE_CORS`] │ -│ [default: no-permissive-cors] │ -│ --api-key TEXT api_key used for │ -│ authentication headers. │ -│ [env var: `INFINITY_API_KEY`] │ -│ --proxy-root-path TEXT Proxy prefix for the │ -│ application. See: │ -│ https://fastapi.tiangolo.com/… │ -│ [env var: │ -│ `INFINITY_PROXY_ROOT_PATH`] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ - + Usage: infinity_emb v2 [OPTIONS] + + Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil + Multiple Model CLI Playbook: + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` + - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" + - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. + +╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --model-id TEXT Huggingface model repo id. 
Subset of possible │ +│ models: │ +│ https://huggingface.co/models?other=text-embedd… │ +│ [env var: `INFINITY_MODEL_ID`] │ +│ [default: michaelfeil/bge-small-en-v1.5] │ +│ --served-model-name TEXT the nickname for the API, under which the │ +│ model_id can be selected │ +│ [env var: `INFINITY_SERVED_MODEL_NAME`] │ +│ --batch-size INTEGER maximum batch size for inference │ +│ [env var: `INFINITY_BATCH_SIZE`] │ +│ [default: 32] │ +│ --dimensions INTEGER default dimensions for inference │ +│ [env var: `INFINITY_DIMENSIONS`] │ +│ [default: 0] │ +│ --revision TEXT huggingface model repo revision. │ +│ [env var: `INFINITY_REVISION`] │ +│ --trust-remote-code --no-trust-remote-code if potential remote modeling code from │ +│ huggingface repo is trusted. │ +│ [env var: `INFINITY_TRUST_REMOTE_CODE`] │ +│ [default: trust-remote-code] │ +│ --engine [torch|ctranslate2|optimum|neuron|debugengine] Which backend to use. `torch` uses Pytorch │ +│ GPU/CPU, optimum uses ONNX on │ +│ GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses │ +│ torch+ctranslate2 on CPU/GPU. │ +│ [env var: `INFINITY_ENGINE`] │ +│ [default: torch] │ +│ --model-warmup --no-model-warmup if model should be warmed up after startup, and │ +│ before ready. │ +│ [env var: `INFINITY_MODEL_WARMUP`] │ +│ [default: model-warmup] │ +│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results should be cached to │ +│ SQLite for latency improvement. │ +│ [env var: `INFINITY_VECTOR_DISK_CACHE`] │ +│ [default: vector-disk-cache] │ +│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing the model forward │ +│ pass. │ +│ [env var: `INFINITY_DEVICE`] │ +│ [default: auto] │ +│ --device-id TEXT device id defines the model placement. e.g. │ +│ `0,1` will place the model on MPS/CUDA/GPU 0 and │ +│ 1 each │ +│ [env var: `INFINITY_DEVICE_ID`] │ +│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is based on actual │ +│ tokenizer count. If false, uses len(input) as │ +│ proxy. │ +│ [env var: `INFINITY_LENGTHS_VIA_TOKENIZE`] │ +│ [default: lengths-via-tokenize] │ +│ --dtype [float32|float16|bfloat16|int8|fp8|auto] dtype for the model weights. │ +│ [env var: `INFINITY_DTYPE`] │ +│ [default: auto] │ +│ --embedding-dtype [float32|int8|uint8|binary|ubinary] dtype post-forward pass. If != `float32`, using │ +│ Post-Forward Static quantization. │ +│ [env var: `INFINITY_EMBEDDING_DTYPE`] │ +│ [default: float32] │ +│ --pooling-method [mean|cls|auto] overwrite the pooling method if inferred │ +│ incorrectly. │ +│ [env var: `INFINITY_POOLING_METHOD`] │ +│ [default: auto] │ +│ --compile --no-compile Enable usage of `torch.compile(dynamic=True)` if │ +│ engine relies on it. │ +│ [env var: `INFINITY_COMPILE`] │ +│ [default: compile] │ +│ --bettertransformer --no-bettertransformer Enables varlen flash-attention-2 via the │ +│ `BetterTransformer` implementation. If available │ +│ for this model. │ +│ [env var: `INFINITY_BETTERTRANSFORMER`] │ +│ [default: bettertransformer] │ +│ --preload-only --no-preload-only If true, only downloads models and verifies │ +│ setup, then exit. Recommended for pre-caching │ +│ the download in a Dockerfile. │ +│ [env var: `INFINITY_PRELOAD_ONLY`] │ +│ [default: no-preload-only] │ +│ --host TEXT host for the FastAPI uvicorn server │ +│ [env var: `INFINITY_HOST`] │ +│ [default: 0.0.0.0] │ +│ --port INTEGER port for the FastAPI uvicorn server │ +│ [env var: `INFINITY_PORT`] │ +│ [default: 7997] │ +│ --url-prefix TEXT prefix for all routes of the FastAPI uvicorn │ +│ server. 
Useful if you run behind a proxy / │ +│ cascaded API. │ +│ [env var: `INFINITY_URL_PREFIX`] │ +│ --redirect-slash TEXT where to redirect `/` requests to. │ +│ [env var: `INFINITY_REDIRECT_SLASH`] │ +│ [default: /docs] │ +│ --log-level [critical|error|warning|info|debug|trace] console log level. │ +│ [env var: `INFINITY_LOG_LEVEL`] │ +│ [default: info] │ +│ --permissive-cors --no-permissive-cors whether to allow permissive cors. │ +│ [env var: `INFINITY_PERMISSIVE_CORS`] │ +│ [default: no-permissive-cors] │ +│ --api-key TEXT api_key used for authentication headers. │ +│ [env var: `INFINITY_API_KEY`] │ +│ --proxy-root-path TEXT Proxy prefix for the application. See: │ +│ https://fastapi.tiangolo.com/advanced/behind-a-… │ +│ [env var: `INFINITY_PROXY_ROOT_PATH`] │ +│ --help Show this message and exit. │ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` Note: This doc is auto-generated. Do not edit this file directly. From e9fdf0d20e0c6a75736860acea52a6484ba926f8 Mon Sep 17 00:00:00 2001 From: Dmitry Persiyanov Date: Mon, 7 Apr 2025 19:11:51 +0200 Subject: [PATCH 3/3] regen cli_v2.md --- docs/docs/cli_v2.md | 260 +++++++++++------- .../end_to_end/test_api_with_dummymodel.py | 2 +- 2 files changed, 154 insertions(+), 108 deletions(-) diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index e319df25..cca8d59f 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -6,112 +6,158 @@ $ infinity_emb v2 --help ``` ``` - Usage: infinity_emb v2 [OPTIONS] - - Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil - Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` - - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" - - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. - -╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --model-id TEXT Huggingface model repo id. Subset of possible │ -│ models: │ -│ https://huggingface.co/models?other=text-embedd… │ -│ [env var: `INFINITY_MODEL_ID`] │ -│ [default: michaelfeil/bge-small-en-v1.5] │ -│ --served-model-name TEXT the nickname for the API, under which the │ -│ model_id can be selected │ -│ [env var: `INFINITY_SERVED_MODEL_NAME`] │ -│ --batch-size INTEGER maximum batch size for inference │ -│ [env var: `INFINITY_BATCH_SIZE`] │ -│ [default: 32] │ -│ --dimensions INTEGER default dimensions for inference │ -│ [env var: `INFINITY_DIMENSIONS`] │ -│ [default: 0] │ -│ --revision TEXT huggingface model repo revision. │ -│ [env var: `INFINITY_REVISION`] │ -│ --trust-remote-code --no-trust-remote-code if potential remote modeling code from │ -│ huggingface repo is trusted. │ -│ [env var: `INFINITY_TRUST_REMOTE_CODE`] │ -│ [default: trust-remote-code] │ -│ --engine [torch|ctranslate2|optimum|neuron|debugengine] Which backend to use. `torch` uses Pytorch │ -│ GPU/CPU, optimum uses ONNX on │ -│ GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses │ -│ torch+ctranslate2 on CPU/GPU. │ -│ [env var: `INFINITY_ENGINE`] │ -│ [default: torch] │ -│ --model-warmup --no-model-warmup if model should be warmed up after startup, and │ -│ before ready. 
│ -│ [env var: `INFINITY_MODEL_WARMUP`] │ -│ [default: model-warmup] │ -│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results should be cached to │ -│ SQLite for latency improvement. │ -│ [env var: `INFINITY_VECTOR_DISK_CACHE`] │ -│ [default: vector-disk-cache] │ -│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing the model forward │ -│ pass. │ -│ [env var: `INFINITY_DEVICE`] │ -│ [default: auto] │ -│ --device-id TEXT device id defines the model placement. e.g. │ -│ `0,1` will place the model on MPS/CUDA/GPU 0 and │ -│ 1 each │ -│ [env var: `INFINITY_DEVICE_ID`] │ -│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is based on actual │ -│ tokenizer count. If false, uses len(input) as │ -│ proxy. │ -│ [env var: `INFINITY_LENGTHS_VIA_TOKENIZE`] │ -│ [default: lengths-via-tokenize] │ -│ --dtype [float32|float16|bfloat16|int8|fp8|auto] dtype for the model weights. │ -│ [env var: `INFINITY_DTYPE`] │ -│ [default: auto] │ -│ --embedding-dtype [float32|int8|uint8|binary|ubinary] dtype post-forward pass. If != `float32`, using │ -│ Post-Forward Static quantization. │ -│ [env var: `INFINITY_EMBEDDING_DTYPE`] │ -│ [default: float32] │ -│ --pooling-method [mean|cls|auto] overwrite the pooling method if inferred │ -│ incorrectly. │ -│ [env var: `INFINITY_POOLING_METHOD`] │ -│ [default: auto] │ -│ --compile --no-compile Enable usage of `torch.compile(dynamic=True)` if │ -│ engine relies on it. │ -│ [env var: `INFINITY_COMPILE`] │ -│ [default: compile] │ -│ --bettertransformer --no-bettertransformer Enables varlen flash-attention-2 via the │ -│ `BetterTransformer` implementation. If available │ -│ for this model. │ -│ [env var: `INFINITY_BETTERTRANSFORMER`] │ -│ [default: bettertransformer] │ -│ --preload-only --no-preload-only If true, only downloads models and verifies │ -│ setup, then exit. Recommended for pre-caching │ -│ the download in a Dockerfile. │ -│ [env var: `INFINITY_PRELOAD_ONLY`] │ -│ [default: no-preload-only] │ -│ --host TEXT host for the FastAPI uvicorn server │ -│ [env var: `INFINITY_HOST`] │ -│ [default: 0.0.0.0] │ -│ --port INTEGER port for the FastAPI uvicorn server │ -│ [env var: `INFINITY_PORT`] │ -│ [default: 7997] │ -│ --url-prefix TEXT prefix for all routes of the FastAPI uvicorn │ -│ server. Useful if you run behind a proxy / │ -│ cascaded API. │ -│ [env var: `INFINITY_URL_PREFIX`] │ -│ --redirect-slash TEXT where to redirect `/` requests to. │ -│ [env var: `INFINITY_REDIRECT_SLASH`] │ -│ [default: /docs] │ -│ --log-level [critical|error|warning|info|debug|trace] console log level. │ -│ [env var: `INFINITY_LOG_LEVEL`] │ -│ [default: info] │ -│ --permissive-cors --no-permissive-cors whether to allow permissive cors. │ -│ [env var: `INFINITY_PERMISSIVE_CORS`] │ -│ [default: no-permissive-cors] │ -│ --api-key TEXT api_key used for authentication headers. │ -│ [env var: `INFINITY_API_KEY`] │ -│ --proxy-root-path TEXT Proxy prefix for the application. See: │ -│ https://fastapi.tiangolo.com/advanced/behind-a-… │ -│ [env var: `INFINITY_PROXY_ROOT_PATH`] │ -│ --help Show this message and exit. │ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + + Usage: infinity_emb v2 [OPTIONS] + + Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil + Multiple Model CLI Playbook: + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` + - 2. 
or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && + INFINITY_BATCH_SIZE="8;4;" + - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size + 8` both models have batch-size 8. + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --model-id TEXT Huggingface model repo id. │ +│ Subset of possible models: │ +│ https://huggingface.co/models… │ +│ [env var: `INFINITY_MODEL_ID`] │ +│ [default: │ +│ michaelfeil/bge-small-en-v1.5] │ +│ --served-model-name TEXT the nickname for the API, │ +│ under which the model_id can │ +│ be selected │ +│ [env var: │ +│ `INFINITY_SERVED_MODEL_NAME`] │ +│ --batch-size INTEGER maximum batch size for │ +│ inference │ +│ [env var: │ +│ `INFINITY_BATCH_SIZE`] │ +│ [default: 32] │ +│ --dimensions INTEGER default dimensions for │ +│ inference │ +│ [env var: │ +│ `INFINITY_DIMENSIONS`] │ +│ [default: 0] │ +│ --revision TEXT huggingface model repo │ +│ revision. │ +│ [env var: `INFINITY_REVISION`] │ +│ --trust-remote-code --no-trust-remote-code if potential remote modeling │ +│ code from huggingface repo is │ +│ trusted. │ +│ [env var: │ +│ `INFINITY_TRUST_REMOTE_CODE`] │ +│ [default: trust-remote-code] │ +│ --engine [torch|ctranslate2|optimum|ne Which backend to use. `torch` │ +│ uron|debugengine] uses Pytorch GPU/CPU, optimum │ +│ uses ONNX on │ +│ GPU/CPU/NVIDIA-TensorRT, │ +│ `CTranslate2` uses │ +│ torch+ctranslate2 on CPU/GPU. │ +│ [env var: `INFINITY_ENGINE`] │ +│ [default: torch] │ +│ --model-warmup --no-model-warmup if model should be warmed up │ +│ after startup, and before │ +│ ready. │ +│ [env var: │ +│ `INFINITY_MODEL_WARMUP`] │ +│ [default: model-warmup] │ +│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results │ +│ should be cached to SQLite for │ +│ latency improvement. │ +│ [env var: │ +│ `INFINITY_VECTOR_DISK_CACHE`] │ +│ [default: vector-disk-cache] │ +│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing │ +│ the model forward pass. │ +│ [env var: `INFINITY_DEVICE`] │ +│ [default: auto] │ +│ --device-id TEXT device id defines the model │ +│ placement. e.g. `0,1` will │ +│ place the model on │ +│ MPS/CUDA/GPU 0 and 1 each │ +│ [env var: │ +│ `INFINITY_DEVICE_ID`] │ +│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is │ +│ based on actual tokenizer │ +│ count. If false, uses │ +│ len(input) as proxy. │ +│ [env var: │ +│ `INFINITY_LENGTHS_VIA_TOKENIZ… │ +│ [default: │ +│ lengths-via-tokenize] │ +│ --dtype [float32|float16|bfloat16|int dtype for the model weights. │ +│ 8|fp8|auto] [env var: `INFINITY_DTYPE`] │ +│ [default: auto] │ +│ --embedding-dtype [float32|int8|uint8|binary|ub dtype post-forward pass. If != │ +│ inary] `float32`, using Post-Forward │ +│ Static quantization. │ +│ [env var: │ +│ `INFINITY_EMBEDDING_DTYPE`] │ +│ [default: float32] │ +│ --pooling-method [mean|cls|auto] overwrite the pooling method │ +│ if inferred incorrectly. │ +│ [env var: │ +│ `INFINITY_POOLING_METHOD`] │ +│ [default: auto] │ +│ --compile --no-compile Enable usage of │ +│ `torch.compile(dynamic=True)` │ +│ if engine relies on it. │ +│ [env var: `INFINITY_COMPILE`] │ +│ [default: compile] │ +│ --bettertransformer --no-bettertransformer Enables varlen │ +│ flash-attention-2 via the │ +│ `BetterTransformer` │ +│ implementation. If available │ +│ for this model. 
│ +│ [env var: │ +│ `INFINITY_BETTERTRANSFORMER`] │ +│ [default: bettertransformer] │ +│ --preload-only --no-preload-only If true, only downloads models │ +│ and verifies setup, then exit. │ +│ Recommended for pre-caching │ +│ the download in a Dockerfile. │ +│ [env var: │ +│ `INFINITY_PRELOAD_ONLY`] │ +│ [default: no-preload-only] │ +│ --host TEXT host for the FastAPI uvicorn │ +│ server │ +│ [env var: `INFINITY_HOST`] │ +│ [default: 0.0.0.0] │ +│ --port INTEGER port for the FastAPI uvicorn │ +│ server │ +│ [env var: `INFINITY_PORT`] │ +│ [default: 7997] │ +│ --url-prefix TEXT prefix for all routes of the │ +│ FastAPI uvicorn server. Useful │ +│ if you run behind a proxy / │ +│ cascaded API. │ +│ [env var: │ +│ `INFINITY_URL_PREFIX`] │ +│ --redirect-slash TEXT where to redirect `/` requests │ +│ to. │ +│ [env var: │ +│ `INFINITY_REDIRECT_SLASH`] │ +│ [default: /docs] │ +│ --log-level [critical|error|warning|info| console log level. │ +│ debug|trace] [env var: │ +│ `INFINITY_LOG_LEVEL`] │ +│ [default: info] │ +│ --permissive-cors --no-permissive-cors whether to allow permissive │ +│ cors. │ +│ [env var: │ +│ `INFINITY_PERMISSIVE_CORS`] │ +│ [default: no-permissive-cors] │ +│ --api-key TEXT api_key used for │ +│ authentication headers. │ +│ [env var: `INFINITY_API_KEY`] │ +│ --proxy-root-path TEXT Proxy prefix for the │ +│ application. See: │ +│ https://fastapi.tiangolo.com/… │ +│ [env var: │ +│ `INFINITY_PROXY_ROOT_PATH`] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + ``` Note: This doc is auto-generated. Do not edit this file directly. diff --git a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py index 73bdfd4e..16d86ef4 100644 --- a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py +++ b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py @@ -221,4 +221,4 @@ async def test_matryoshka_embedding_default_dimensions(client): assert len(rdata["data"]) == len(inp) for embedding, sentence in zip(rdata["data"], inp): assert len(sentence) == embedding["embedding"][0] - assert len(embedding["embedding"]) == DEFAULT_DIMENSIONS \ No newline at end of file + assert len(embedding["embedding"]) == DEFAULT_DIMENSIONS
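
For readers following the `batch_handler.py` hunks: the existing `matryososka_slice` helper (the typo is in the codebase) truncates each embedding to its first `matryoshka_dim` components, and the new `self.matryoshka_dim` field supplies a server-wide default whenever a request does not pass one. Below is a minimal sketch of that behavior, assuming plain list-of-array embeddings; the function bodies are an illustrative reconstruction, not the library's exact code.

```python
from typing import Optional

import numpy as np


def matryoshka_slice(
    embeddings: list[np.ndarray], matryoshka_dim: Optional[int]
) -> list[np.ndarray]:
    # A falsy dim (None, or 0 -- the CLI default) means: keep full vectors.
    if not matryoshka_dim:
        return embeddings
    # Matryoshka-trained models pack the most information into the leading
    # components, so truncation is a plain prefix slice.
    return [e[:matryoshka_dim] for e in embeddings]


# Precedence added by this patch in embed()/image_embed()/audio_embed():
# a per-request value wins, otherwise the server-wide --dimensions default.
def effective_dim(request_dim: Optional[int], default: Optional[int]) -> Optional[int]:
    return request_dim if request_dim else default
```

Because the check is falsy rather than `is None`, `--dimensions 0` and an unset value are equivalent (no slicing), and a per-request `matryoshka_dim` always overrides the server default.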
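The `env.py` hunk wires `dimensions` into the same multi-model convention as `batch_size`: `INFINITY_DIMENSIONS` takes semicolon-separated values that `EngineArgs.from_env` zips against `INFINITY_MODEL_ID`. A hedged sketch of that convention follows, with `parse_int_multiple` as a hypothetical stand-in for the `_optional_infinity_var_multiple` + `_to_int_multiple` pair:

```python
import os


def parse_int_multiple(name: str, default: list[str]) -> list[int]:
    """Hypothetical stand-in for the MANAGER helpers: read INFINITY_<NAME>
    as semicolon-separated ints; a trailing ';' is tolerated."""
    raw = os.environ.get(f"INFINITY_{name.upper()}")
    parts = default if raw is None else [p for p in raw.split(";") if p.strip()]
    return [int(p) for p in parts]


# Two models, each with its own default dimension; 0 keeps full vectors for
# the second model (values are zipped against INFINITY_MODEL_ID entries).
os.environ["INFINITY_DIMENSIONS"] = "128;0;"
assert parse_int_multiple("dimensions", default=["0"]) == [128, 0]
```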
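End to end, the new argument flows `EngineArgs.dimensions` → `BatchHandler.matryoshka_dim`, so the setup in the new test can be mirrored in user code. A usage sketch against the Python API, under the same assumptions as the test — the `debugengine` dummy backend, an arbitrary model name, and import paths assumed from the library layout; whether a real model tolerates matryoshka truncation depends on how it was trained:

```python
import asyncio

from infinity_emb import AsyncEmbeddingEngine, EngineArgs
from infinity_emb.primitives import InferenceEngine


async def main() -> None:
    # `dimensions=5` becomes BatchHandler.matryoshka_dim, so embed() calls
    # without an explicit matryoshka_dim are sliced to 5 values by default.
    engine = AsyncEmbeddingEngine.from_args(
        EngineArgs(
            model_name_or_path="dummy-model",  # arbitrary name for the dummy backend
            batch_size=8,
            dimensions=5,
            engine=InferenceEngine.debugengine,
        )
    )
    async with engine:
        embeddings, _usage = await engine.embed(sentences=["hello world"])
        assert len(embeddings[0]) == 5


asyncio.run(main())
```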