vllm comps support openai API ChatCompletionRequest (#1032)

* vllm support openai API

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* test_llms_text-generation_vllm_langchain_on_intel_hpu.sh

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* fix time

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

---------

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: XinyaoWa
Date: 2024-12-13 17:56:24 +08:00
Committed by: GitHub
Parent: f5efaf1f18
Commit: 48ed589822
4 changed files with 121 additions and 41 deletions


@@ -223,29 +223,21 @@ User can set the following model parameters according to needs:
- streaming(true/false): return text response in streaming mode or non-streaming mode
```bash
-# 1. Non-streaming mode
+# stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-  -H 'Content-Type: application/json'
+  -X POST \
+  -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17}' \
+  -H 'Content-Type: application/json'

-# 2. Streaming mode
curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-  -H 'Content-Type: application/json'
+  -X POST \
+  -d '{"model": "${model_name}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+  -H 'Content-Type: application/json'

-# 3. Custom chat template with streaming mode
+# Non-stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
-  -H 'Content-Type: application/json'
+  -X POST \
+  -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
+  -H 'Content-Type: application/json'

-# 4. Chat with SearchedDoc (Retrieval context)
-curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
-  -H 'Content-Type: application/json'
```
For parameter details, refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html).
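Since the microservice now accepts the OpenAI chat-completions schema, it can presumably also be driven with the OpenAI Python SDK. A minimal sketch — the endpoint address and model name below are placeholders, not part of this commit:

```python
# Sketch only: assumes the microservice's /v1/chat/completions route is
# close enough to the OpenAI schema for the SDK to parse the response.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:9000/v1")  # placeholder host/port

completion = client.chat.completions.create(
    model="your_model_name",  # placeholder
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
    max_tokens=17,
    stream=False,
)
print(completion.choices[0].message.content)
```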


@@ -7,6 +7,7 @@ from typing import Union
from fastapi.responses import StreamingResponse
from langchain_community.llms import VLLMOpenAI
from langchain_core.prompts import PromptTemplate
+from openai import OpenAI
from template import ChatTemplate

from comps import (
@@ -194,6 +195,98 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc]):
            logger.info(response)
        return GeneratedDoc(text=response, prompt=input.query)
    else:
+        if logflag:
+            logger.info("[ ChatCompletionRequest ] input in opea format")
+
+        # Point the OpenAI client at the vLLM server's OpenAI-compatible API;
+        # vLLM does not validate the key, so a placeholder value is enough.
+        client = OpenAI(
+            api_key="EMPTY",
+            base_url=llm_endpoint + "/v1",
+        )
+
+        if isinstance(input.messages, str):
+            # A plain-string message goes to the legacy /v1/completions API.
+            prompt = input.messages
+            if prompt_template:
+                if sorted(input_variables) == ["context", "question"]:
+                    prompt = prompt_template.format(question=input.messages, context="\n".join(input.documents))
+                elif input_variables == ["question"]:
+                    prompt = prompt_template.format(question=input.messages)
+                else:
+                    logger.info(
+                        f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
+                    )
+            else:
+                if input.documents:
+                    # use rag default template
+                    prompt = ChatTemplate.generate_rag_prompt(input.messages, input.documents, input.model)
+
+            chat_completion = client.completions.create(
+                model=model_name,
+                prompt=prompt,
+                echo=input.echo,
+                frequency_penalty=input.frequency_penalty,
+                max_tokens=input.max_tokens,
+                n=input.n,
+                presence_penalty=input.presence_penalty,
+                seed=input.seed,
+                stop=input.stop,
+                stream=input.stream,
+                suffix=input.suffix,
+                temperature=input.temperature,
+                top_p=input.top_p,
+                user=input.user,
+            )
+        else:
+            # A message list goes to the /v1/chat/completions API.
+            if input.messages[0]["role"] == "system":
+                if "{context}" in input.messages[0]["content"]:
+                    # str.format returns a new string, so assign the result back.
+                    if input.documents is None or input.documents == []:
+                        input.messages[0]["content"] = input.messages[0]["content"].format(context="")
+                    else:
+                        input.messages[0]["content"] = input.messages[0]["content"].format(
+                            context="\n".join(input.documents)
+                        )
+            else:
+                if prompt_template:
+                    system_prompt = prompt_template
+                    if input_variables == ["context"]:
+                        system_prompt = prompt_template.format(context="\n".join(input.documents))
+                    else:
+                        logger.info(
+                            f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 1 input variable ['context']"
+                        )
+
+                    input.messages.insert(0, {"role": "system", "content": system_prompt})
+
+            chat_completion = client.chat.completions.create(
+                model=model_name,
+                messages=input.messages,
+                frequency_penalty=input.frequency_penalty,
+                max_tokens=input.max_tokens,
+                n=input.n,
+                presence_penalty=input.presence_penalty,
+                response_format=input.response_format,
+                seed=input.seed,
+                stop=input.stop,
+                stream=input.stream,
+                stream_options=input.stream_options,
+                temperature=input.temperature,
+                top_p=input.top_p,
+                user=input.user,
+            )
+
+        if input.stream:
+
+            def stream_generator():
+                for c in chat_completion:
+                    if logflag:
+                        logger.info(c)
+                    chunk = c.model_dump_json()
+                    if chunk not in ["<|im_end|>", "<|endoftext|>"]:
+                        yield f"data: {chunk}\n\n"
+                yield "data: [DONE]\n\n"
+
+            return StreamingResponse(stream_generator(), media_type="text/event-stream")
+        else:
+            if logflag:
+                logger.info(chat_completion)
+            return chat_completion


if __name__ == "__main__":
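The streaming branch above emits standard server-sent events terminated by `data: [DONE]`. A hypothetical consumer of that stream — host, port, and model name are placeholders, and this snippet is not part of the commit:

```python
# Hypothetical client reading the SSE stream produced by stream_generator().
import json

import requests

payload = {
    "model": "your_model_name",  # placeholder
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 17,
    "stream": True,
}

with requests.post("http://localhost:9000/v1/chat/completions", json=payload, stream=True) as resp:
    for raw in resp.iter_lines():
        if not raw:
            continue
        data = raw.decode("utf-8").removeprefix("data: ")
        if data == "[DONE]":
            break
        # Each event is a serialized OpenAI ChatCompletionChunk.
        delta = json.loads(data)["choices"][0]["delta"].get("content")
        if delta:
            print(delta, end="", flush=True)
print()
```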


@@ -1,20 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-your_ip="0.0.0.0"
-model=$(curl http://localhost:8008/v1/models -s|jq -r '.data[].id')
-
-curl http://${your_ip}:8008/v1/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-    "model": "'$model'",
-    "prompt": "What is Deep Learning?",
-    "max_tokens": 32,
-    "temperature": 0
-    }'
-
-##query microservice
-curl http://${your_ip}:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-    -H 'Content-Type: application/json'
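The deleted script still used the old `query`/`streaming` payload. A rough Python re-creation of the same smoke test against the new schema — same ports as the script above, illustrative only:

```python
# Hypothetical re-creation of the deleted smoke test using the new
# OpenAI-style request schema (ports as in the original script).
import requests

# Ask the vLLM server which model it is serving (jq '.data[].id' equivalent).
model = requests.get("http://localhost:8008/v1/models").json()["data"][0]["id"]

# Query the vLLM backend directly ...
backend = requests.post(
    "http://localhost:8008/v1/completions",
    json={"model": model, "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0},
)
print(backend.json())

# ... then query the microservice with the new OpenAI-style payload.
micro = requests.post(
    "http://localhost:9000/v1/chat/completions",
    json={"model": model, "messages": "What is Deep Learning?", "max_tokens": 17, "stream": False},
)
print(micro.json())
```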