vLLM is a framework purpose-built to accelerate large language model inference; it achieves near-zero waste of KV cache memory, addressing the memory-management bottleneck.
More vLLM documentation and tutorials in Chinese are available at → https://vllm.hyper.ai/
*Run the vLLM getting-started tutorial online: a step-by-step guide for beginners
Source: examples/offline_inference/encoder_decoder_multimodal.py
# SPDX-License-Identifier: Apache-2.0"""
此示例顯示了如何使用 vLLM 進行離線推理
文本生成的 enc-dec LMMS 上的顯式/隱式提示格式。
"""
import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple

from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: Sequence[PromptType]


def run_florence2():
    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {   # Implicit prompt with task token
            "prompt": "<DETAILED_CAPTION>",
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image
            },
        },
        {   # Explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "Describe in detail what is shown in the image.",
                "multi_modal_data": {
                    "image": ImageAsset("cherry_blossom").pil_image
                },
            },
            "decoder_prompt": "",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_mllama():
    engine_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {   # Implicit prompt
            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
        {   # Explicit prompt
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {
                    "image": ImageAsset("stop_sign").pil_image,
                },
            },
            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_whisper():
    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    prompts = [
        {   # Test implicit prompt
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
            },
        },
        {   # Test explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
                },
            },
            "decoder_prompt": "<|startoftranscript|>",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


model_example_map = {
    "florence2": run_florence2,
    "mllama": run_mllama,
    "whisper": run_whisper,
}


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    req_data = model_example_map[model]()

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    prompts = req_data.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
    )

    start = time.time()

    # Generate output tokens from the prompts. The output is a list of
    # objects that contain the prompt, the generated text, and other
    # information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, "
              f"Generated text: {generated_text!r}")

    duration = time.time() - start

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="mllama",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
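For reference, the two prompt formats exercised above can be reduced to the following minimal sketch. It is not part of the example script; it simply reuses the same Whisper model, audio assets, and API calls shown above, so it assumes the same vLLM installation and a GPU that can hold the model in half precision.

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

llm = LLM(
    model="openai/whisper-large-v3-turbo",
    max_model_len=448,
    limit_mm_per_prompt={"audio": 1},
    dtype="half",
)

audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

prompts = [
    # Implicit format: a single prompt string; vLLM derives the
    # encoder and decoder inputs from it.
    {
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {"audio": audio},
    },
    # Explicit format: the encoder and decoder prompts are passed
    # separately via "encoder_prompt" / "decoder_prompt".
    {
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {"audio": audio},
        },
        "decoder_prompt": "<|startoftranscript|>",
    },
]

for out in llm.generate(prompts, SamplingParams(temperature=0, max_tokens=64)):
    print(out.outputs[0].text)

To run the full example, pass one of the supported model types (the default is mllama), for instance:

python examples/offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0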