diff --git a/libs/infinity_emb/Docker.template.yaml b/libs/infinity_emb/Docker.template.yaml index c4c8ae02..c7d6fd64 100644 --- a/libs/infinity_emb/Docker.template.yaml +++ b/libs/infinity_emb/Docker.template.yaml @@ -15,7 +15,24 @@ cpu: main_install: | # "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh + RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" + RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + openvino-tokenizers[transformers]==2024.5.* \ + openvino==2024.5.* \ + "nncf>=2.11.0" \ + sentence_transformers==3.1.1 \ + openai \ + "transformers>4.45" \ + einops + # RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + # openvino-tokenizers[transformers]==2024.5.* \ + # openvino==2024.5.* \ + # nncf>=2.11.0 \ + # sentence_transformers==3.1.1 \ + # openai \ + # "transformers>4.45" \ + # einops amd: # 2 . 
command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto diff --git a/libs/infinity_emb/Dockerfile.cpu_auto b/libs/infinity_emb/Dockerfile.cpu_auto index 4da2bf59..6c1fea82 100644 --- a/libs/infinity_emb/Dockerfile.cpu_auto +++ b/libs/infinity_emb/Dockerfile.cpu_auto @@ -39,13 +39,47 @@ COPY poetry.lock poetry.toml pyproject.toml README.md /app/ # # "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + openvino-tokenizers[transformers]==2024.5.* \ + openvino==2024.5.* \ + nncf>=2.11.0 \ + sentence_transformers==3.1.1 \ + openai \ + "transformers>4.45" \ + einops +# RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +# openvino-tokenizers[transformers]==2024.5.* \ +# openvino==2024.5.* \ +# nncf>=2.11.0 \ +# sentence_transformers==3.1.1 \ +# openai \ +# "transformers>4.45" \ +# einops COPY infinity_emb infinity_emb # Install dependency with infinity_emb package # "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/cpu" +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" poetry 
run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + openvino-tokenizers[transformers]==2024.5.* \ + openvino==2024.5.* \ + nncf>=2.11.0 \ + sentence_transformers==3.1.1 \ + openai \ + "transformers>4.45" \ + einops +# RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +# openvino-tokenizers[transformers]==2024.5.* \ +# openvino==2024.5.* \ +# nncf>=2.11.0 \ +# sentence_transformers==3.1.1 \ +# openai \ +# "transformers>4.45" \ +# einops # @@ -54,7 +88,24 @@ FROM builder as testing # install lint and test dependencies # "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --with lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --with lint,test "https://download.pytorch.org/whl/cpu" +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + openvino-tokenizers[transformers]==2024.5.* \ + openvino==2024.5.* \ + nncf>=2.11.0 \ + sentence_transformers==3.1.1 \ + openai \ + "transformers>4.45" \ + einops +# RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +# openvino-tokenizers[transformers]==2024.5.* \ +# openvino==2024.5.* \ +# nncf>=2.11.0 \ +# sentence_transformers==3.1.1 \ +# openai \ +# "transformers>4.45" \ +# einops # lint RUN poetry run ruff check . 
diff --git a/libs/infinity_emb/Dockerfile.intel_auto b/libs/infinity_emb/Dockerfile.intel_auto new file mode 100644 index 00000000..2a8ffe99 --- /dev/null +++ b/libs/infinity_emb/Dockerfile.intel_auto @@ -0,0 +1,132 @@ +# Autogenerated warning: +# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly. +# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd + +FROM ubuntu:22.04 AS base + +ENV PYTHONUNBUFFERED=1 \ + \ + # pip + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on \ + PIP_DEFAULT_TIMEOUT=100 \ + \ + # make poetry create the virtual environment in the project's root + # it gets named `.venv` + POETRY_VIRTUALENVS_CREATE="true" \ + POETRY_VIRTUALENVS_IN_PROJECT="true" \ + # do not ask any interactive question + POETRY_NO_INTERACTION=1 \ + EXTRAS="all" \ + PYTHON="python3.11" +RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl +WORKDIR /app + +FROM base as builder +# Set the working directory for the app +# Define the version of Poetry to install (default is 1.7.1) +# Define the directory to install Poetry to (default is /opt/poetry) +ARG POETRY_VERSION=1.8.4 +ARG POETRY_HOME=/opt/poetry +# Create a Python virtual environment for Poetry and install it +RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME POETRY_VERSION=$POETRY_VERSION $PYTHON - +ENV PATH=$POETRY_HOME/bin:$PATH +# Test if Poetry is installed in the expected path +RUN echo "Poetry version:" && poetry --version +# Copy the rest of the app source code (this layer will be invalidated and rebuilt whenever the source code changes) +COPY poetry.lock poetry.toml pyproject.toml README.md /app/ +# Install dependencies only +# +# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" +COPY 
requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" + +RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" + +COPY infinity_emb infinity_emb +# Install dependency with infinity_emb package +# "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" +COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/cpu" + +# + + +FROM builder as testing +# install lint and test dependencies +# "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --with lint,test && poetry cache clear pypi --all" +COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN ./requirements_install_from_poetry.sh --with lint,test "https://download.pytorch.org/whl/cpu" + +# # lint +# # RUN poetry run ruff check . +# # RUN poetry run mypy . +# # pytest +# COPY tests tests +# # run end to end tests because of duration of build in github ci. +# # Run tests/end_to_end on TARGETPLATFORM x86_64 otherwise run tests/end_to_end_gpu +# # poetry run python -m pytest tests/end_to_end -x # TODO: does not work. 
+# RUN if [ -z "$TARGETPLATFORM" ]; then \ +# ARCH=$(uname -m); \ +# if [ "$ARCH" = "x86_64" ]; then \ +# TARGETPLATFORM="linux/amd64"; \ +# elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \ +# TARGETPLATFORM="linux/arm64"; \ +# else \ +# echo "Unsupported architecture: $ARCH"; exit 1; \ +# fi; \ +# fi; \ +# echo "Running tests on TARGETPLATFORM=$TARGETPLATFORM"; \ +# if [ "$TARGETPLATFORM" = "linux/arm64" ] ; then \ +# poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x ; \ +# else \ +# poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py tests/end_to_end/test_sentence_transformers.py -m "not performance" -x ; \ +# fi +# RUN echo "all tests passed" > "test_results.txt" + + +# # Use a multi-stage build -> production version, with download +# FROM base AS tested-builder +# COPY --from=builder /app /app +# # force testing stage to run +# COPY --from=testing /app/test_results.txt /app/test_results.txt +# ENV HF_HOME=/app/.cache/huggingface +# ENV PATH=/app/.venv/bin:$PATH +# # do nothing +# RUN echo "copied all files" + + +# Export with tensorrt, not recommended. +# docker buildx build --target=production-tensorrt -f Dockerfile . 
+# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt +# ENV PYTHONUNBUFFERED=1 \ +# PIP_NO_CACHE_DIR=off \ +# PYTHON="python3.11" +# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y +# COPY --from=builder /app /app +# # force testing stage to run +# COPY --from=testing /app/test_results.txt /app/test_results.txt +# ENV HF_HOME=/app/.cache/torch +# ENV PATH=/app/.venv/bin:$PATH +# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*" +# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH} +# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH} +# ENTRYPOINT ["infinity_emb"] + + +# # Use a multi-stage build -> production version, with download +# # docker buildx build --target=production-with-download \ +# # --build-arg MODEL_NAME=BAAI/bge-small-en-v1.5 --build-arg ENGINE=torch -f Dockerfile -t infinity-BAAI-small . +# FROM tested-builder AS production-with-download +# # collect model name and engine from build args +# ARG MODEL_NAME +# RUN if [ -z "${MODEL_NAME}" ]; then echo "Error: Build argument MODEL_NAME not set." && exit 1; fi +# ARG ENGINE +# RUN if [ -z "${ENGINE}" ]; then echo "Error: Build argument ENGINE not set." && exit 1; fi +# # will exit with 3 if model is downloaded # TODO: better exit code +# RUN infinity_emb v2 --model-id $MODEL_NAME --engine $ENGINE --preload-only || [ $? 
-eq 3 ] +# ENTRYPOINT ["infinity_emb"] + +# # Use a multi-stage build -> production version +# FROM tested-builder AS production +# ENTRYPOINT ["infinity_emb"] diff --git a/libs/infinity_emb/infinity_emb/_optional_imports.py b/libs/infinity_emb/infinity_emb/_optional_imports.py index 63193aac..886aeef8 100644 --- a/libs/infinity_emb/infinity_emb/_optional_imports.py +++ b/libs/infinity_emb/infinity_emb/_optional_imports.py @@ -68,6 +68,7 @@ def _raise_error(self) -> None: "optimum.neuron", "", ) +CHECK_OPTIMUM_INTEL = OptionalImports("optimum.intel", "optimum") CHECK_PIL = OptionalImports("PIL", "vision") CHECK_POSTHOG = OptionalImports("posthog", "server") CHECK_PYDANTIC = OptionalImports("pydantic", "server") diff --git a/libs/infinity_emb/infinity_emb/primitives.py b/libs/infinity_emb/infinity_emb/primitives.py index 66bcf512..1e727447 100644 --- a/libs/infinity_emb/infinity_emb/primitives.py +++ b/libs/infinity_emb/infinity_emb/primitives.py @@ -106,6 +106,7 @@ def default_value(): class Device(EnumType): cpu = "cpu" + openvino = "openvino" cuda = "cuda" mps = "mps" tensorrt = "tensorrt" diff --git a/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py b/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py index e32639de..648f4932 100644 --- a/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py @@ -6,7 +6,11 @@ import numpy as np -from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TRANSFORMERS +from infinity_emb._optional_imports import ( + CHECK_ONNXRUNTIME, + CHECK_TRANSFORMERS, + CHECK_OPTIMUM_INTEL, +) from infinity_emb.args import EngineArgs from infinity_emb.primitives import EmbeddingReturnType, PoolingMethod from infinity_emb.transformer.abstract import BaseEmbedder @@ -14,7 +18,7 @@ from infinity_emb.transformer.utils_optimum import ( cls_token_pooling, device_to_onnx, - get_onnx_files, + # get_onnx_files, mean_pooling, normalize, 
optimize_model, @@ -25,43 +29,80 @@ from optimum.onnxruntime import ( # type: ignore[import-untyped] ORTModelForFeatureExtraction, ) + from infinity_emb.transformer.utils_optimum import get_onnx_files except (ImportError, RuntimeError, Exception) as ex: CHECK_ONNXRUNTIME.mark_dirty(ex) + +if CHECK_OPTIMUM_INTEL.is_available: + try: + from optimum.intel import OVModelForFeatureExtraction # type: ignore[import-untyped] + from infinity_emb.transformer.utils_optimum import get_openvino_files + + except (ImportError, RuntimeError, Exception) as ex: + CHECK_OPTIMUM_INTEL.mark_dirty(ex) + + if CHECK_TRANSFORMERS.is_available: from transformers import AutoConfig, AutoTokenizer # type: ignore[import-untyped] class OptimumEmbedder(BaseEmbedder): def __init__(self, *, engine_args: EngineArgs): - CHECK_ONNXRUNTIME.mark_required() provider = device_to_onnx(engine_args.device) + self.provider = provider + + if provider == "OpenVINOExecutionProvider": + CHECK_OPTIMUM_INTEL.mark_required() + filename = "" + try: + openvino_file = get_openvino_files( + model_name_or_path=engine_args.model_name_or_path, + revision=engine_args.revision, + use_auth_token=True, + ) + filename = openvino_file.as_posix() + except Exception as e: # show error then let the optimum intel compress on the fly + print(str(e)) + + self.model = optimize_model( + model_name_or_path=engine_args.model_name_or_path, + revision=engine_args.revision, + trust_remote_code=engine_args.trust_remote_code, + execution_provider=provider, + file_name=filename, + optimize_model=not os.environ.get( + "INFINITY_ONNX_DISABLE_OPTIMIZE", False + ), # TODO: make this env variable public + model_class=OVModelForFeatureExtraction, + ) - onnx_file = get_onnx_files( - model_name_or_path=engine_args.model_name_or_path, - revision=engine_args.revision, - use_auth_token=True, - prefer_quantized="cpu" in provider.lower(), - ) + elif provider == "CPUExecutionProvider": + CHECK_ONNXRUNTIME.mark_required() + onnx_file = get_onnx_files( + 
model_name_or_path=engine_args.model_name_or_path, + revision=engine_args.revision, + use_auth_token=True, + prefer_quantized="cpu" in provider.lower(), + ) + self.model = optimize_model( + model_name_or_path=engine_args.model_name_or_path, + revision=engine_args.revision, + trust_remote_code=engine_args.trust_remote_code, + execution_provider=provider, + file_name=onnx_file.as_posix(), + optimize_model=not os.environ.get( + "INFINITY_ONNX_DISABLE_OPTIMIZE", False + ), # TODO: make this env variable public + model_class=ORTModelForFeatureExtraction, + ) + self.model.use_io_binding = False self.pooling = ( mean_pooling if engine_args.pooling_method == PoolingMethod.mean else cls_token_pooling ) - self.model = optimize_model( - model_name_or_path=engine_args.model_name_or_path, - revision=engine_args.revision, - trust_remote_code=engine_args.trust_remote_code, - execution_provider=provider, - file_name=onnx_file.as_posix(), - optimize_model=not os.environ.get( - "INFINITY_ONNX_DISABLE_OPTIMIZE", False - ), # TODO: make this env variable public - model_class=ORTModelForFeatureExtraction, - ) - self.model.use_io_binding = False - self.tokenizer = AutoTokenizer.from_pretrained( engine_args.model_name_or_path, revision=engine_args.revision, diff --git a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py index aa5bc712..1aafa87c 100644 --- a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py @@ -8,10 +8,11 @@ from huggingface_hub import HfApi, HfFolder # type: ignore from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # type: ignore -from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TORCH +from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TORCH, CHECK_OPTIMUM_INTEL from infinity_emb.log_handler import logger from infinity_emb.primitives import Device + if 
CHECK_ONNXRUNTIME.is_available: try: from optimum.modeling_base import OptimizedModel # type: ignore @@ -23,6 +24,25 @@ except (ImportError, RuntimeError, Exception) as ex: CHECK_ONNXRUNTIME.mark_dirty(ex) +if CHECK_OPTIMUM_INTEL.is_available: + from optimum.intel import ( + OVModelForFeatureExtraction, # type: ignore[import-untyped] + OVWeightQuantizationConfig, + OVConfig, + OVQuantizer, + ) + # try: + # from optimum.intel import ( + # OVModelForFeatureExtraction, # type: ignore[import-untyped] + # OVWeightQuantizationConfig, + # OVConfig, + # OVQuantizer, + # ) + + # except (ImportError, RuntimeError, Exception) as ex: + # CHECK_OPTIMUM_INTEL.mark_dirty(ex) + + if CHECK_TORCH.is_available: import torch @@ -51,6 +71,8 @@ def normalize(input_array, p=2, dim=1, eps=1e-12): def device_to_onnx(device: Device) -> str: if device == Device.cpu: return "CPUExecutionProvider" + elif device == Device.openvino: + return "OpenVINOExecutionProvider" elif device == Device.cuda: return "CUDAExecutionProvider" elif device == Device.mps: @@ -87,12 +109,7 @@ def optimize_model( revision (Optional[str], optional): The revision to use. Defaults to None. trust_remote_code (bool, optional): Whether to trust the remote code. Defaults to True. 
""" - CHECK_ONNXRUNTIME.mark_required() - path_folder = ( - Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path - ) - OPTIMIZED_SUFFIX = "_optimized.onnx" - files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}")) + if execution_provider == "TensorrtExecutionProvider": return model_class.from_pretrained( model_name_or_path, @@ -110,15 +127,59 @@ def optimize_model( # "trt_int8_enable": "quantize" in file_name, }, ) - if files_optimized: - file_optimized = files_optimized[0] + + file_optimized: Path | str = "" + + extra_args = {} + + logger.info(f"file_name: {file_name}") + + if execution_provider == "OpenVINOExecutionProvider": # Optimum Intel OpenVINO path + CHECK_OPTIMUM_INTEL.mark_required() + path_folder = ( + Path(HUGGINGFACE_HUB_CACHE) + / "infinity_openvino" + / execution_provider + / model_name_or_path + ) + OPTIMIZED_PREFIX = "openvino_model" + files_optimized = sorted(list(path_folder.glob(f"**/{OPTIMIZED_PREFIX}*"))) + if files_optimized: + file_optimized = files_optimized[-1] + if file_name: + file_optimized = file_name + + extra_args = {"ov_config": {"INFERENCE_PRECISION_HINT": "bf16"}} + + elif execution_provider == "CPUExecutionProvider": # Optimum onnx cpu path + CHECK_ONNXRUNTIME.mark_required() + path_folder = ( + Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path + ) + OPTIMIZED_SUFFIX = "_optimized.onnx" + files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}")) + if files_optimized: + file_optimized = files_optimized[0] + else: + raise ValueError( + f"Does not support {execution_provider}." + "Optimum engine only support `OpenVINOExecutionProvider` " + "and `CPUExecutionProvider`." 
+ ) + + if file_optimized: logger.info(f"Optimized model found at {file_optimized}, skipping optimization") return model_class.from_pretrained( - file_optimized.parent.as_posix(), + file_optimized.parent.as_posix() + if not isinstance(file_optimized, str) + else model_name_or_path, revision=revision, trust_remote_code=trust_remote_code, - provider=execution_provider, - file_name=file_optimized.name, + provider=execution_provider, # will be ignored by optimum intel + file_name=file_optimized.name + if not isinstance(file_optimized, str) + else file_optimized, + **extra_args, ) unoptimized_model = model_class.from_pretrained( @@ -132,33 +193,73 @@ def optimize_model( return unoptimized_model try: logger.info("Optimizing model") + if execution_provider == "OpenVINOExecutionProvider": + logger.info("Optimizing model OpenVINOExecutionProvider") + ov_model = OVModelForFeatureExtraction.from_pretrained( + model_name_or_path, + export=True, + # ov_config={"INFERENCE_PRECISION_HINT": "fp32"} # fp16 for now as it has better precision than bf16 + # ov_config={"INFERENCE_PRECISION_HINT": "fp16"} # fp16 for now as it has better precision than bf16 + ov_config={ + "INFERENCE_PRECISION_HINT": "bf16" + }, # fp16 for now as it has better precision than bf16 + ) + quantizer = OVQuantizer.from_pretrained(ov_model, task="feature-extraction", export=True) + ov_config = OVConfig( + quantization_config=OVWeightQuantizationConfig( + bits=4, + sym=False, + ratio=1.0, + group_size=128, + all_layers=None, + ) + ) + quantizer.quantize(ov_config=ov_config, save_directory=path_folder.as_posix()) + model = OVModelForFeatureExtraction.from_pretrained( + path_folder.as_posix(), + # ov_config={"INFERENCE_PRECISION_HINT": "fp32"} # fp16 for now as it has better precision than bf16 + # ov_config={"INFERENCE_PRECISION_HINT": "fp16"} # fp16 for now as it has better precision than bf16 + ov_config={ + "INFERENCE_PRECISION_HINT": "bf16" + }, # fp16 for now as it has better precision than bf16, + 
export=False, + ) + logger.info("Successfully load optimized model OpenVINOExecutionProvider") - optimizer = ORTOptimizer.from_pretrained(unoptimized_model) + elif execution_provider == "CPUExecutionProvider": # Optimum onnx cpu path + optimizer = ORTOptimizer.from_pretrained(unoptimized_model) - is_gpu = "cpu" not in execution_provider.lower() - optimization_config = OptimizationConfig( - optimization_level=99, - optimize_with_onnxruntime_only=False, - optimize_for_gpu=is_gpu, - fp16=is_gpu, - # enable_gelu_approximation=True, - # enable_gemm_fast_gelu_fusion=True, # might not work - ) + is_gpu = "cpu" not in execution_provider.lower() + optimization_config = OptimizationConfig( + optimization_level=99, + optimize_with_onnxruntime_only=False, + optimize_for_gpu=is_gpu, + fp16=is_gpu, + # enable_gelu_approximation=True, + # enable_gemm_fast_gelu_fusion=True, # might not work + ) - optimized_model_path = optimizer.optimize( - optimization_config=optimization_config, - save_dir=path_folder.as_posix(), - # if larger than 2gb use external data format - one_external_file=True, - ) + optimized_model_path = optimizer.optimize( + optimization_config=optimization_config, + save_dir=path_folder.as_posix(), + # if larger than 2gb use external data format + one_external_file=True, + ) + + model = model_class.from_pretrained( + optimized_model_path, + revision=revision, + trust_remote_code=trust_remote_code, + provider=execution_provider, + file_name=Path(file_name).name.replace(".onnx", OPTIMIZED_SUFFIX), + ) + else: + raise ValueError( + f"Does not support {execution_provider}." + "Optimum engine only support `OpenVINOExecutionProvider` " + "and `CPUExecutionProvider`." + ) - model = model_class.from_pretrained( - optimized_model_path, - revision=revision, - trust_remote_code=trust_remote_code, - provider=execution_provider, - file_name=Path(file_name).name.replace(".onnx", OPTIMIZED_SUFFIX), - ) except Exception as e: logger.warning(f"Optimization failed with {e}. 
Going to use the unoptimized model.") + model = unoptimized_model @@ -215,3 +316,31 @@ def get_onnx_files( return onnx_files[0] else: raise ValueError(f"No onnx files found for {model_name_or_path} and revision {revision}") + + +def get_openvino_files( + *, + model_name_or_path: str, + revision: Union[str, None] = None, + use_auth_token: Union[bool, str] = True, +) -> Path: + """gets the openvino files from the repo""" + repo_files = _list_all_repo_files( + model_name_or_path=model_name_or_path, + revision=revision, + use_auth_token=use_auth_token, + ) + pattern = "**openvino_model.*" + openvino_files = sorted([p for p in repo_files if p.match(pattern)]) + + if len(openvino_files) > 1: + logger.info(f"Found {len(openvino_files)} openvino files: {openvino_files}") + openvino_file = openvino_files[-1] + logger.info(f"Using {openvino_file} as the model") + return openvino_file + elif len(openvino_files) == 1: + return openvino_files[0] + else: + raise ValueError( + f"No openvino files found for {model_name_or_path} and revision {revision}" + ) diff --git a/libs/infinity_emb/tests/unit_test/transformer/embedder/test_optimum.py b/libs/infinity_emb/tests/unit_test/transformer/embedder/test_optimum.py index c612d305..1ccef7a8 100644 --- a/libs/infinity_emb/tests/unit_test/transformer/embedder/test_optimum.py +++ b/libs/infinity_emb/tests/unit_test/transformer/embedder/test_optimum.py @@ -25,3 +25,26 @@ def test_embedder_optimum(size="large"): cosine_sim = np.dot(r, e) / (np.linalg.norm(e) * np.linalg.norm(r)) assert cosine_sim > 0.94 np.testing.assert_allclose(embeds, embeds_orig, atol=0.25) + + +def test_embedder_optimum_openvino_cpu(size="large"): + model = OptimumEmbedder( + engine_args=EngineArgs(model_name_or_path=f"BAAI/bge-{size}-en-v1.5", device="openvino") + ) + st_model = SentenceTransformer(model_name_or_path=f"BAAI/bge-{size}-en-v1.5", device="cpu") + + sentences = ["This is awesome.", "I am depressed."] + + encode_pre = model.encode_pre(sentences) + encode_core = 
model.encode_core(encode_pre) + embeds = model.encode_post(encode_core) + + embeds_orig = st_model.encode(sentences) + + assert len(embeds) == len(sentences) + + for r, e in zip(embeds, embeds_orig): + cosine_sim = np.dot(r, e) / (np.linalg.norm(e) * np.linalg.norm(r)) + assert cosine_sim > 0.94 + np.testing.assert_allclose(embeds, embeds_orig, atol=0.25) +