vllm comps support openai API ChatCompletionRequest (#1032)

* vllm support openai API

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* test_llms_text-generation_vllm_langchain_on_intel_hpu.sh

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* fix time

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

---------

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: XinyaoWa
Date: 2024-12-13 17:56:24 +08:00
Committed by: GitHub
Parent: f5efaf1f18
Commit: 48ed589822
4 changed files with 121 additions and 41 deletions


@@ -223,29 +223,21 @@ User can set the following model parameters according to needs:
- streaming(true/false): return text response in streaming mode or non-streaming mode
```bash
-# 1. Non-streaming mode
+# stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-  -H 'Content-Type: application/json'
+  -X POST \
+  -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17}' \
+  -H 'Content-Type: application/json'

-# 2. Streaming mode
curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-  -H 'Content-Type: application/json'
+  -X POST \
+  -d '{"model": "${model_name}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+  -H 'Content-Type: application/json'

-# 3. Custom chat template with streaming mode
+# Non-stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
-  -H 'Content-Type: application/json'
+  -X POST \
+  -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
+  -H 'Content-Type: application/json'

-# 4. Chat with SearchedDoc (Retrieval context)
-curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
-  -H 'Content-Type: application/json'
```
For parameter details, refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html).
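Since the microservice now accepts the OpenAI chat-completions schema, it can presumably also be driven with the OpenAI Python SDK. A minimal sketch — the endpoint address and model name below are placeholders, not part of this commit:

```python
# Sketch only: assumes the microservice's /v1/chat/completions route is
# close enough to the OpenAI schema for the SDK to parse the response.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:9000/v1")  # placeholder host/port

completion = client.chat.completions.create(
    model="your_model_name",  # placeholder
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
    max_tokens=17,
    stream=False,
)
print(completion.choices[0].message.content)
```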


@@ -7,6 +7,7 @@ from typing import Union
from fastapi.responses import StreamingResponse
from langchain_community.llms import VLLMOpenAI
from langchain_core.prompts import PromptTemplate
+from openai import OpenAI
from template import ChatTemplate

from comps import (
@@ -194,6 +195,98 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc]):
            logger.info(response)
        return GeneratedDoc(text=response, prompt=input.query)
    else:
+        if logflag:
+            logger.info("[ ChatCompletionRequest ] input in opea format")
+
+        # Point the OpenAI client at the vLLM server's OpenAI-compatible API;
+        # vLLM does not validate the key, so a placeholder value is enough.
+        client = OpenAI(
+            api_key="EMPTY",
+            base_url=llm_endpoint + "/v1",
+        )
+
+        if isinstance(input.messages, str):
+            # A plain-string message goes to the legacy /v1/completions API.
+            prompt = input.messages
+            if prompt_template:
+                if sorted(input_variables) == ["context", "question"]:
+                    prompt = prompt_template.format(question=input.messages, context="\n".join(input.documents))
+                elif input_variables == ["question"]:
+                    prompt = prompt_template.format(question=input.messages)
+                else:
+                    logger.info(
+                        f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
+                    )
+            else:
+                if input.documents:
+                    # use rag default template
+                    prompt = ChatTemplate.generate_rag_prompt(input.messages, input.documents, input.model)
+
+            chat_completion = client.completions.create(
+                model=model_name,
+                prompt=prompt,
+                echo=input.echo,
+                frequency_penalty=input.frequency_penalty,
+                max_tokens=input.max_tokens,
+                n=input.n,
+                presence_penalty=input.presence_penalty,
+                seed=input.seed,
+                stop=input.stop,
+                stream=input.stream,
+                suffix=input.suffix,
+                temperature=input.temperature,
+                top_p=input.top_p,
+                user=input.user,
+            )
+        else:
+            # A message list goes to the /v1/chat/completions API.
+            if input.messages[0]["role"] == "system":
+                if "{context}" in input.messages[0]["content"]:
+                    # str.format returns a new string, so assign the result back.
+                    if input.documents is None or input.documents == []:
+                        input.messages[0]["content"] = input.messages[0]["content"].format(context="")
+                    else:
+                        input.messages[0]["content"] = input.messages[0]["content"].format(
+                            context="\n".join(input.documents)
+                        )
+            else:
+                if prompt_template:
+                    system_prompt = prompt_template
+                    if input_variables == ["context"]:
+                        system_prompt = prompt_template.format(context="\n".join(input.documents))
+                    else:
+                        logger.info(
+                            f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 1 input variable ['context']"
+                        )
+
+                    input.messages.insert(0, {"role": "system", "content": system_prompt})
+
+            chat_completion = client.chat.completions.create(
+                model=model_name,
+                messages=input.messages,
+                frequency_penalty=input.frequency_penalty,
+                max_tokens=input.max_tokens,
+                n=input.n,
+                presence_penalty=input.presence_penalty,
+                response_format=input.response_format,
+                seed=input.seed,
+                stop=input.stop,
+                stream=input.stream,
+                stream_options=input.stream_options,
+                temperature=input.temperature,
+                top_p=input.top_p,
+                user=input.user,
+            )
+
+        if input.stream:
+
+            def stream_generator():
+                for c in chat_completion:
+                    if logflag:
+                        logger.info(c)
+                    chunk = c.model_dump_json()
+                    if chunk not in ["<|im_end|>", "<|endoftext|>"]:
+                        yield f"data: {chunk}\n\n"
+                yield "data: [DONE]\n\n"
+
+            return StreamingResponse(stream_generator(), media_type="text/event-stream")
+        else:
+            if logflag:
+                logger.info(chat_completion)
+            return chat_completion


if __name__ == "__main__":
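The streaming branch above emits standard server-sent events terminated by `data: [DONE]`. A hypothetical consumer of that stream — host, port, and model name are placeholders, and this snippet is not part of the commit:

```python
# Hypothetical client reading the SSE stream produced by stream_generator().
import json

import requests

payload = {
    "model": "your_model_name",  # placeholder
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 17,
    "stream": True,
}

with requests.post("http://localhost:9000/v1/chat/completions", json=payload, stream=True) as resp:
    for raw in resp.iter_lines():
        if not raw:
            continue
        data = raw.decode("utf-8").removeprefix("data: ")
        if data == "[DONE]":
            break
        # Each event is a serialized OpenAI ChatCompletionChunk.
        delta = json.loads(data)["choices"][0]["delta"].get("content")
        if delta:
            print(delta, end="", flush=True)
print()
```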


@@ -1,20 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-your_ip="0.0.0.0"
-model=$(curl http://localhost:8008/v1/models -s|jq -r '.data[].id')
-
-curl http://${your_ip}:8008/v1/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-    "model": "'$model'",
-    "prompt": "What is Deep Learning?",
-    "max_tokens": 32,
-    "temperature": 0
-    }'
-
-##query microservice
-curl http://${your_ip}:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-    -H 'Content-Type: application/json'
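The deleted script still used the old `query`/`streaming` payload. A rough Python re-creation of the same smoke test against the new schema — same ports as the script above, illustrative only:

```python
# Hypothetical re-creation of the deleted smoke test using the new
# OpenAI-style request schema (ports as in the original script).
import requests

# Ask the vLLM server which model it is serving (jq '.data[].id' equivalent).
model = requests.get("http://localhost:8008/v1/models").json()["data"][0]["id"]

# Query the vLLM backend directly ...
backend = requests.post(
    "http://localhost:8008/v1/completions",
    json={"model": model, "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0},
)
print(backend.json())

# ... then query the microservice with the new OpenAI-style payload.
micro = requests.post(
    "http://localhost:9000/v1/chat/completions",
    json={"model": model, "messages": "What is Deep Learning?", "max_tokens": 17, "stream": False},
)
print(micro.json())
```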