vllm comps support openai API ChatCompletionRequest (#1032)
* vllm support openai API
* fix bug
* fix bug
* test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
* fix time
* fix bug
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* fix bug

---------

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
@@ -223,29 +223,21 @@ User can set the following model parameters according to needs:

- streaming (true/false): return the text response in streaming or non-streaming mode

Each example below shows the request body before this change (OPEA-native `query`/`streaming` fields) and after it (OpenAI-compatible `model`/`messages` fields):

```bash
# 1. Non-streaming mode
# before: OPEA-native request
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
  -H 'Content-Type: application/json'
# after: OpenAI-compatible request
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17}' \
  -H 'Content-Type: application/json'

# 2. Streaming mode
# before: OPEA-native request
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
  -H 'Content-Type: application/json'
# after: OpenAI-compatible request
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"model": "${model_name}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
  -H 'Content-Type: application/json'

# 3. Custom chat template
# before: OPEA-native request
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
  -H 'Content-Type: application/json'
# after: OpenAI-compatible request
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
  -H 'Content-Type: application/json'

# 4. Chat with SearchedDoc (Retrieval context)
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
  -H 'Content-Type: application/json'
```
For parameter details, refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html).
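The same OpenAI-compatible request can also be sent from Python. A minimal sketch using `requests` (the host, port, model name, and timeout are assumptions mirroring the curl examples above, not part of this commit):

```python
# Sketch: POST an OpenAI-style ChatCompletionRequest to the LLM microservice.
# "localhost:9000" and the model name are placeholders; substitute the values
# from your own deployment.
import requests

url = "http://localhost:9000/v1/chat/completions"
payload = {
    "model": "your-served-model-name",
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 17,
}

resp = requests.post(url, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())
```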
@@ -7,6 +7,7 @@ from typing import Union

from fastapi.responses import StreamingResponse
from langchain_community.llms import VLLMOpenAI
from langchain_core.prompts import PromptTemplate
from openai import OpenAI
from template import ChatTemplate

from comps import (
@@ -194,6 +195,98 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc]):

            logger.info(response)

        return GeneratedDoc(text=response, prompt=input.query)
    else:
        if logflag:
            logger.info("[ ChatCompletionRequest ] input in opea format")
        client = OpenAI(
            api_key="EMPTY",
            base_url=llm_endpoint + "/v1",
        )

        if isinstance(input.messages, str):
            # A plain string is sent through the legacy completions endpoint.
            prompt = input.messages
            if prompt_template:
                if sorted(input_variables) == ["context", "question"]:
                    prompt = prompt_template.format(question=input.messages, context="\n".join(input.documents))
                elif input_variables == ["question"]:
                    prompt = prompt_template.format(question=input.messages)
                else:
                    logger.info(
                        f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
                    )
            else:
                if input.documents:
                    # use rag default template
                    prompt = ChatTemplate.generate_rag_prompt(input.messages, input.documents, input.model)

            chat_completion = client.completions.create(
                model=model_name,
                prompt=prompt,
                echo=input.echo,
                frequency_penalty=input.frequency_penalty,
                max_tokens=input.max_tokens,
                n=input.n,
                presence_penalty=input.presence_penalty,
                seed=input.seed,
                stop=input.stop,
                stream=input.stream,
                suffix=input.suffix,
                temperature=input.temperature,
                top_p=input.top_p,
                user=input.user,
            )
        else:
            # A message list is sent through the chat completions endpoint.
            if input.messages[0]["role"] == "system":
                if "{context}" in input.messages[0]["content"]:
                    if input.documents is None or input.documents == []:
                        input.messages[0]["content"] = input.messages[0]["content"].format(context="")
                    else:
                        input.messages[0]["content"] = input.messages[0]["content"].format(context="\n".join(input.documents))
            else:
                if prompt_template:
                    system_prompt = prompt_template
                    if input_variables == ["context"]:
                        system_prompt = prompt_template.format(context="\n".join(input.documents))
                    else:
                        logger.info(
                            f"[ ChatCompletionRequest ] {prompt_template} not used, only support 1 input variables ['context']"
                        )

                    input.messages.insert(0, {"role": "system", "content": system_prompt})

            chat_completion = client.chat.completions.create(
                model=model_name,
                messages=input.messages,
                frequency_penalty=input.frequency_penalty,
                max_tokens=input.max_tokens,
                n=input.n,
                presence_penalty=input.presence_penalty,
                response_format=input.response_format,
                seed=input.seed,
                stop=input.stop,
                stream=input.stream,
                stream_options=input.stream_options,
                temperature=input.temperature,
                top_p=input.top_p,
                user=input.user,
            )

        if input.stream:

            def stream_generator():
                for c in chat_completion:
                    if logflag:
                        logger.info(c)
                    chunk = c.model_dump_json()
                    if chunk not in ["<|im_end|>", "<|endoftext|>"]:
                        yield f"data: {chunk}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(stream_generator(), media_type="text/event-stream")
        else:
            if logflag:
                logger.info(chat_completion)
            return chat_completion


if __name__ == "__main__":
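When `stream` is true, the handler above returns Server-Sent Events: one `data:` line per chunk followed by a `data: [DONE]` sentinel. A minimal client sketch for consuming that stream (the endpoint, port, model name, and payload values are assumptions taken from the README examples, not part of this commit):

```python
# Sketch: consume the text/event-stream response produced by stream_generator().
# Each SSE line carries one model_dump_json() chunk; "[DONE]" ends the stream.
import json

import requests

payload = {
    "model": "your-served-model-name",  # placeholder
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 17,
    "stream": True,
}

with requests.post(
    "http://localhost:9000/v1/chat/completions", json=payload, stream=True, timeout=60
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        print(chunk)  # chat chunks carry incremental text under choices[0]["delta"]
```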
@@ -1,20 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

your_ip="0.0.0.0"
model=$(curl http://localhost:8008/v1/models -s | jq -r '.data[].id')

curl http://${your_ip}:8008/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'$model'",
    "prompt": "What is Deep Learning?",
    "max_tokens": 32,
    "temperature": 0
  }'

##query microservice
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
  -H 'Content-Type: application/json'
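The removed script exercised the old `query`/`streaming` payload. For reference, a hypothetical equivalent against the new OpenAI-compatible interface, written in Python (not part of this commit; the 8008/9000 ports and prompt follow the removed script above):

```python
# Hypothetical replacement for the removed query script, updated to the
# OpenAI-compatible ChatCompletionRequest format added by this commit.
# Ports 8008 (vLLM backend) and 9000 (microservice) follow the removed script.
import requests

# Discover the served model name (same role as the removed `jq` call).
model = requests.get("http://localhost:8008/v1/models", timeout=30).json()["data"][0]["id"]

# Query the vLLM backend directly.
backend = requests.post(
    "http://localhost:8008/v1/completions",
    json={"model": model, "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0},
    timeout=60,
)
print(backend.json())

# Query the microservice with the new OpenAI-compatible payload.
micro = requests.post(
    "http://localhost:9000/v1/chat/completions",
    json={
        "model": model,
        "messages": [{"role": "user", "content": "What is Deep Learning?"}],
        "max_tokens": 17,
    },
    timeout=60,
)
print(micro.json())
```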