【vLLM 学习】Disaggregated Prefill Lmcache

vLLM 是一款专为大语言模型推理加速而设计的框架，实现了 KV 缓存内存几乎零浪费，解决了内存管理瓶颈问题。
更多 vLLM 中文文档及教程可访问 →vllm.hyper.ai/
源码 examples/offline_inference/disaggregated_prefill_lmcache.py
# SPDX-License-Identifier: Apache-2.0"""该文件演示了分解预填充的示例用法与 LMCache。我们将启动2个 vLLM 实例 (Preill 的 GPU 0和 Decode 的 GPU 1) ，并启动额外的 LMCache 服务器。KV 缓存以以下方式传输:vLLM 预填充节点 - > lmcache Server-> vllm 解码节点。请注意，运行此示例需要运行 `pip install lmcache`。在 https://github.com/LMCache/LMCache 中了解有关 LMCache 的更多信息。"""import osimport subprocessimport timefrom multiprocessing import Event, Processfrom lmcache.experimental.cache_engine import LMCacheEngineBuilderfrom lmcache.integration.vllm.utils import ENGINE_NAMEfrom vllm import LLM, SamplingParamsfrom vllm.config import KVTransferConfig# 与 LMCache 相关的环境变量# 要启动 LMCache 服务器的端口port = 8100# 在 LMCache 中使用实验功能os.environ["LMCache_USE_EXPERIMENTAL"] = "True"# lmcache 设置为每块使用256个 tokenos.environ["LMCache_CHUNK_SIZE"] = "256"# 禁用 LMCache 中的本地 CPU 后端os.environ["LMCache_LOCAL_CPU"] = "False"# 将本地 CPU 内存缓冲区限制设置为5.0 GBos.environ["LMCache_MAX_LOCAL_CPU_SIZE"] = "5.0"# 设置 LMCache 服务器的远程 URLos.environ["LMCache_REMOTE_URL"] = f"lm://localhost:{port}"# 在 vLLM 和 LMCache 服务器之间设置序列化器/求职者# `naive` 表示使用张量的原始字符而无需任何压缩os.environ["LMCache_REMOTE_SERDE"] = "naive"def run_prefill(prefill_done, prompts):    # 我们将 GPU 0 用于预填充节点。    os.environ["CUDA_VISIBLE_DEVICES"] = "0"    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)    ktc = KVTransferConfig.from_cli(        '{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'    )    # 将 GPU 内存利用设置为0.8，用于40GB 显存的 A40 GPU。    # 如果您的 GPU 的内存较少，则降低值。    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",              kv_transfer_config=ktc,              max_model_len=8000,              gpu_memory_utilization=0.8,              enforce_eager=True)    #llm.generate(prompts, sampling_params)    outputs = llm.generate(prompts, sampling_params)    for output in outputs:        generated_text = output.outputs[0].text        print(f"Generated text: {generated_text!r}")    print("Prefill node is finished.")    prefill_done.set()    # 清理 LMCache 后端    LMCacheEngineBuilder.destroy(ENGINE_NAME)def run_decode(prefill_done, prompts, timeout=1):    # 我们将 GPU 1 用于解码节点。    os.environ["CUDA_VISIBLE_DEVICES"] = "1"    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)    ktc = KVTransferConfig.from_cli(        '{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'    )    # 将 GPU 内存利用设置为0.8，用于40GB 显存的 A40 GPU。    # 如果您的 GPU 的内存较少，则降低值。    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",              kv_transfer_config=ktc,              max_model_len=8000,              gpu_memory_utilization=0.8,              enforce_eager=True)    print("Waiting for prefill node to finish...")    prefill_done.wait()    time.sleep(timeout)    outputs = llm.generate(prompts, sampling_params)    for output in outputs:        generated_text = output.outputs[0].text        print(f"Generated text: {generated_text!r}")    # 清理 LMCache 后端    LMCacheEngineBuilder.destroy(ENGINE_NAME)def run_lmcache_server(port):    server_proc = subprocess.Popen([        "python", "-m", "lmcache.experimental.server", "localhost",        str(port)    ])    return server_procif __name__ == "__main__":    prompts = [        "Hello, how are you?" * 1000,    ]    prefill_done = Event()    prefill_process = Process(target=run_prefill, args=(prefill_done, prompts))    decode_process = Process(target=run_decode, args=(prefill_done, prompts))    lmcache_server_process = run_lmcache_server(port)    # 开始预填充节点    prefill_process.start()    # 开始解码节点    decode_process.start()    # 清理过程    decode_process.join()    prefill_process.terminate()    lmcache_server_process.terminate()    lmcache_server_process.wait()
Fish AI Reader

FishAI

联系邮箱 441953276@qq.com

相关标签