In [ ]
%pip install llama-index-multi-modal-llms-openvino -q
In [ ]
%pip install llama-index llama-index-readers-file -q
In [ ]
from pathlib import Path

model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
model_path = Path(model_id.split("/")[-1]) / "FP16"

if not model_path.exists():
    !optimum-cli export openvino --model {model_id} --weight-format fp16 {model_path}
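The export writes the model as OpenVINO IR files (separate XML/BIN pairs for the vision and language components, which the compression step below relies on) plus the tokenizer and processor configs. As an optional sanity check, the exported FP16 model can be reloaded with optimum-intel; this is a minimal sketch, and `OVModelForVisualCausalLM` is assumed to be the matching class in your installed optimum-intel version:

from optimum.intel import OVModelForVisualCausalLM

# reload the exported FP16 IR to confirm it is usable (optional check)
ov_llava = OVModelForVisualCausalLM.from_pretrained(model_path)
print(type(ov_llava).__name__)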
In [ ]
import shutil
import nncf
import openvino as ov
import gc

core = ov.Core()

compression_config = {
    "mode": nncf.CompressWeightsMode.INT4_SYM,
    "group_size": 64,
    "ratio": 0.6,
}

compressed_model_path = model_path.parent / "INT4"

if not compressed_model_path.exists():
    # compress only the language model; the other IR files are reused as-is
    ov_model = core.read_model(model_path / "openvino_language_model.xml")
    compressed_ov_model = nncf.compress_weights(ov_model, **compression_config)
    ov.save_model(
        compressed_ov_model,
        compressed_model_path / "openvino_language_model.xml",
    )
    del compressed_ov_model
    del ov_model
    gc.collect()

    # copy the remaining files (everything except the language model IR)
    for file_name in model_path.glob("*"):
        if file_name.name in [
            "openvino_language_model.xml",
            "openvino_language_model.bin",
        ]:
            continue
        shutil.copy(file_name, compressed_model_path)
INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │
┝━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│ 8            │ 2% (1 / 225)              │ 0% (0 / 224)                         │
├──────────────┼───────────────────────────┼──────────────────────────────────────┤
│ 4            │ 98% (224 / 225)           │ 100% (224 / 224)                     │
┕━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙
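In `compression_config`, `mode` selects symmetric 4-bit weight quantization, `group_size` sets the group-wise quantization granularity, and `ratio` controls what fraction of the weights NNCF targets for 4-bit compression, with the remainder kept at 8 bits. If the compressed model loses too much accuracy, a more conservative configuration can be tried; the values below are illustrative assumptions, not from this notebook:

# illustrative alternative: asymmetric 4-bit quantization with larger groups
# often preserves accuracy better at a small size cost (assumed values)
conservative_config = {
    "mode": nncf.CompressWeightsMode.INT4_ASYM,
    "group_size": 128,
    "ratio": 0.8,
}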
Prepare input data
In [ ]
import os

import requests
from PIL import Image

os.makedirs("./input_images", exist_ok=True)

url = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
image
In [ ]
from llama_index.multi_modal_llms.openvino import OpenVINOMultiModal
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "llava-v1.6-mistral-7b-hf/INT4", trust_remote_code=True
)


def messages_to_prompt(messages, image_documents):
    """
    Prepares the input messages and images.
    """
    conversation = [{"type": "text", "text": messages[0].content}]
    images = []
    for img_doc in image_documents:
        images.append(img_doc)
        conversation.append({"type": "image"})  # one placeholder per image
    messages = [
        {"role": "user", "content": conversation}
    ]  # Wrap conversation in a user role
    print(messages)
    # Apply a chat template to format the message with the processor
    text_prompt = processor.apply_chat_template(
        messages, add_generation_prompt=True
    )
    # Prepare the model inputs (text + images) and convert to tensor
    inputs = processor(text=text_prompt, images=images, return_tensors="pt")
    return inputs
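Before wiring `messages_to_prompt` into the model wrapper, it can be exercised directly; a minimal sketch, reusing `image` from above and llama-index's `ChatMessage`:

from llama_index.core.llms import ChatMessage

# quick check: format one user message with one image placeholder
inputs = messages_to_prompt(
    [ChatMessage(role="user", content="Describe the images")], [image]
)
print(inputs["input_ids"].shape)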
In [ ]
vlm = OpenVINOMultiModal(
    model_id_or_path="llava-v1.6-mistral-7b-hf/INT4",
    device="cpu",
    messages_to_prompt=messages_to_prompt,
    generate_kwargs={"do_sample": False},
)
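`generate_kwargs` is forwarded to the underlying `generate` call, so the usual Hugging Face generation options apply. A sketch with sampling enabled instead of the greedy decoding used above (parameter values are assumptions for illustration):

vlm_sampling = OpenVINOMultiModal(
    model_id_or_path="llava-v1.6-mistral-7b-hf/INT4",
    device="cpu",
    messages_to_prompt=messages_to_prompt,
    # sample instead of decoding greedily; values chosen for illustration
    generate_kwargs={"do_sample": True, "temperature": 0.7, "max_new_tokens": 256},
)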
Inference with a local OpenVINO model
In [ ]
response = vlm.complete("Describe the images", image_documents=[image])
print(response.text)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
[{'role': 'user', 'content': [{'type': 'text', 'text': 'Describe the images'}, {'type': 'image'}]}]
The image shows a person and a dog on a sandy beach. The person is sitting on the sand, facing the camera, and appears to be smiling. They are wearing a plaid shirt and dark pants. The dog is standing next to the person, looking up at the person's hand, which is extended towards the dog. The dog is wearing a harness and has a collar with a tag. The background features the ocean with waves, and the sky is clear with a warm glow, suggesting either sunrise or sunset. The overall atmosphere of the image is peaceful and joyful, capturing a moment of interaction between the person and the dog.
Streaming
In [ ]
response = vlm.stream_complete("Describe the images", image_documents=[image])
for r in response:
    print(r.delta, end="")
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
[{'role': 'user', 'content': [{'type': 'text', 'text': 'Describe the images'}, {'type': 'image'}]}]
The image shows a person and a dog on a sandy beach. The person is sitting on the sand, facing the camera, and appears to be smiling. They are wearing a plaid shirt and dark pants. The dog is standing next to the person, looking up at the person's hand, which is extended towards the dog. The dog is wearing a harness and has a collar with a tag. The background features the ocean with waves, and the sky is clear with a warm glow, suggesting either sunrise or sunset. The overall atmosphere of the image is peaceful and joyful, capturing a moment of interaction between the person and the dog.
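Since `messages_to_prompt` appends one image placeholder per document, `image_documents` can carry several images in a single call. A minimal sketch reusing the same image twice:

response = vlm.complete(
    "Compare the two images", image_documents=[image, image]
)
print(response.text)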