Rename streaming to stream to align with OpenAI API (#1332)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
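In practice, request payloads and service parameters now use the OpenAI-style `stream` flag instead of the OPEA-specific `streaming`. A minimal before/after sketch of a client call, adapted from the curl examples updated in this diff (host, port, and payload are illustrative):

```bash
# Before: OPEA-specific parameter name
curl http://${host_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?","max_tokens":17,"streaming":true}' \
  -H 'Content-Type: application/json'

# After: aligned with the OpenAI Chat Completions API
curl http://${host_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?","max_tokens":17,"stream":true}' \
  -H 'Content-Type: application/json'
```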
@@ -49,7 +49,7 @@ services:
  model: ${LLM_MODEL_ID}
  temperature: ${temperature}
  max_new_tokens: ${max_new_tokens}
- streaming: false
+ stream: false
  tools: /home/user/tools/worker_agent_tools.yaml
  require_human_feedback: false
  RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}

@@ -83,7 +83,7 @@ services:
  model: ${LLM_MODEL_ID}
  temperature: ${temperature}
  max_new_tokens: ${max_new_tokens}
- streaming: false
+ stream: false
  tools: /home/user/tools/supervisor_agent_tools.yaml
  require_human_feedback: false
  no_proxy: ${no_proxy}

@@ -19,7 +19,7 @@ services:
  model: ${model}
  temperature: ${temperature}
  max_new_tokens: ${max_new_tokens}
- streaming: false
+ stream: false
  tools: /home/user/tools/worker_agent_tools.yaml
  require_human_feedback: false
  RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}

@@ -51,7 +51,7 @@ services:
  model: ${model}
  temperature: ${temperature}
  max_new_tokens: ${max_new_tokens}
- streaming: false
+ stream: false
  tools: /home/user/tools/supervisor_agent_tools.yaml
  require_human_feedback: false
  no_proxy: ${no_proxy}

@@ -21,7 +21,7 @@ services:
  model: ${LLM_MODEL_ID}
  temperature: ${temperature}
  max_new_tokens: ${max_new_tokens}
- streaming: false
+ stream: false
  tools: /home/user/tools/worker_agent_tools.yaml
  require_human_feedback: false
  RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}

@@ -55,7 +55,7 @@ services:
  model: ${LLM_MODEL_ID}
  temperature: ${temperature}
  max_new_tokens: ${max_new_tokens}
- streaming: false
+ stream: false
  tools: /home/user/tools/supervisor_agent_tools.yaml
  require_human_feedback: false
  no_proxy: ${no_proxy}
@@ -7,6 +7,7 @@ WORKPATH=$(dirname "$PWD")
|
||||
export WORKDIR=$WORKPATH/../../
|
||||
echo "WORKDIR=${WORKDIR}"
|
||||
export ip_address=$(hostname -I | awk '{print $1}')
|
||||
export host_ip=${ip_address}
|
||||
|
||||
export HF_CACHE_DIR=$WORKDIR/hf_cache
|
||||
if [ ! -d "$HF_CACHE_DIR" ]; then
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -e
|
||||
set -xe
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
export WORKDIR=$WORKPATH/../../
|
||||
@@ -82,4 +82,4 @@ echo "=================== #5 Agent and API server stopped===================="
|
||||
|
||||
echo y | docker system prune
|
||||
|
||||
echo "ALL DONE!"
|
||||
echo "ALL DONE!!"
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# Copyright (C) 2024 Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -e
|
||||
set -xe
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
export WORKDIR=$WORKPATH/../../
|
||||
@@ -72,4 +72,4 @@ echo "=================== #5 Agent and API server stopped===================="
|
||||
|
||||
echo y | docker system prune
|
||||
|
||||
echo "ALL DONE!"
|
||||
echo "ALL DONE!!"
|
||||
|
||||
@@ -26,7 +26,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
  next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
  next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
  next_inputs["top_p"] = llm_parameters_dict["top_p"]
- next_inputs["stream"] = inputs["streaming"] # False as default
+ next_inputs["stream"] = inputs["stream"] # False as default
  next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
  # next_inputs["presence_penalty"] = inputs["presence_penalty"]
  # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]

@@ -91,7 +91,7 @@ class AudioQnAService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=False, # TODO add streaming LLM output as input to TTS
+ stream=False, # TODO add stream LLM output as input to TTS
  )
  result_dict, runtime_graph = await self.megaservice.schedule(
  initial_inputs={"audio": chat_request.audio},

@@ -28,7 +28,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
  next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
  next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
  next_inputs["top_p"] = llm_parameters_dict["top_p"]
- next_inputs["stream"] = inputs["streaming"] # False as default
+ next_inputs["stream"] = inputs["stream"] # False as default
  next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
  # next_inputs["presence_penalty"] = inputs["presence_penalty"]
  # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]

@@ -103,7 +103,7 @@ class AudioQnAService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=False, # TODO add streaming LLM output as input to TTS
+ stream=False, # TODO add stream LLM output as input to TTS
  )
  result_dict, runtime_graph = await self.megaservice.schedule(
  initial_inputs={"audio": chat_request.audio}, llm_parameters=parameters
@@ -40,7 +40,7 @@ test_cases:
  top_k: 10
  top_p: 0.95
  repetition_penalty: 1.03
- streaming: true
+ stream: true
  llmserve:
  run_test: true
  service_name: "llm-svc" # Replace with your service name

@@ -53,7 +53,7 @@ services:
  ipc: host
  audioqna-backend-server:
  image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
- container_name: audioqna-xeon-backend-server
+ container_name: audioqna-rocm-backend-server
  depends_on:
  - whisper-service
  - tgi-service

@@ -66,7 +66,7 @@ This involves deploying the AudioQnA custom resource. You can use audioQnA_xeon.
  ```sh
  export CLIENT_POD=$(kubectl get pod -n audioqa -l app=client-test -o jsonpath={.items..metadata.name})
  export accessUrl=$(kubectl get gmc -n audioqa -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
- kubectl exec "$CLIENT_POD" -n audioqa -- curl -s --no-buffer $accessUrl -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json'
+ kubectl exec "$CLIENT_POD" -n audioqa -- curl -s --no-buffer $accessUrl -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json'
  ```

  > [NOTE]
@@ -44,6 +44,7 @@ function start_services() {
  # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

  # Start Docker Containers
+ sed -i "s|container_name: audioqna-gaudi-backend-server|container_name: audioqna-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
  n=0
  until [[ "$n" -ge 200 ]]; do

@@ -46,6 +46,7 @@ function start_services() {
  # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

  # Start Docker Containers
+ sed -i "s|container_name: audioqna-rocm-backend-server|container_name: audioqna-rocm-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
  n=0
  until [[ "$n" -ge 200 ]]; do

@@ -63,7 +64,7 @@ function validate_megaservice() {
  docker logs whisper-service > $LOG_PATH/whisper-service.log
  docker logs speecht5-service > $LOG_PATH/tts-service.log
  docker logs tgi-service > $LOG_PATH/tgi-service.log
- docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
+ docker logs audioqna-rocm-backend-server > $LOG_PATH/audioqna-rocm-backend-server.log
  echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3

  if [[ $(file speech.mp3) == *"RIFF"* ]]; then

@@ -45,6 +45,7 @@ function start_services() {
  # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

  # Start Docker Containers
+ sed -i "s|container_name: audioqna-xeon-backend-server|container_name: audioqna-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
  n=0
  until [[ "$n" -ge 200 ]]; do

@@ -34,7 +34,7 @@ function validate_audioqa() {
  export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
  echo "$CLIENT_POD"
  accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
- byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
+ byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json' | jq .byte_str)
  echo "$byte_str" > $LOG_PATH/curl_audioqa.log
  if [ -z "$byte_str" ]; then
  echo "audioqa failed, please check the logs in ${LOG_PATH}!"

@@ -34,7 +34,7 @@ function validate_audioqa() {
  export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
  echo "$CLIENT_POD"
  accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
- byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
+ byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json' | jq .byte_str)
  echo "$byte_str" > $LOG_PATH/curl_audioqa.log
  if [ -z "$byte_str" ]; then
  echo "audioqa failed, please check the logs in ${LOG_PATH}!"
@@ -29,7 +29,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
  next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
  next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
  next_inputs["top_p"] = llm_parameters_dict["top_p"]
- next_inputs["stream"] = inputs["streaming"] # False as default
+ next_inputs["stream"] = inputs["stream"] # False as default
  next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
  # next_inputs["presence_penalty"] = inputs["presence_penalty"]
  # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]

@@ -112,7 +112,7 @@ class AvatarChatbotService:
  top_p=chat_request.top_p if chat_request.top_p else 0.95,
  temperature=chat_request.temperature if chat_request.temperature else 0.01,
  repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
- streaming=False, # TODO add streaming LLM output as input to TTS
+ stream=False, # TODO add stream LLM output as input to TTS
  )
  # print(parameters)

@@ -71,6 +71,7 @@ function start_services() {
  export FPS=10

  # Start Docker Containers
+ sed -i "s|container_name: avatarchatbot-gaudi-backend-server|container_name: avatarchatbot-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
  n=0
  until [[ "$n" -ge 200 ]]; do

@@ -71,6 +71,7 @@ function start_services() {
  export FPS=10

  # Start Docker Containers
+ sed -i "s|container_name: avatarchatbot-xeon-backend-server|container_name: avatarchatbot-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose up -d
  n=0
  until [[ "$n" -ge 100 ]]; do

@@ -58,7 +58,7 @@ test_cases:
  top_k: 10
  top_p: 0.95
  repetition_penalty: 1.03
- streaming: true
+ stream: true
  llmserve:
  run_test: false
  service_name: "chatqna-tgi" # Replace with your service name
@@ -76,7 +76,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
  next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
  next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
  next_inputs["top_p"] = llm_parameters_dict["top_p"]
- next_inputs["stream"] = inputs["streaming"]
+ next_inputs["stream"] = inputs["stream"]
  next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
  # next_inputs["presence_penalty"] = inputs["presence_penalty"]
  # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]

@@ -158,7 +158,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di

  next_data["inputs"] = prompt

- elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["streaming"]:
+ elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["stream"]:
  next_data["text"] = data["choices"][0]["message"]["content"]
  else:
  next_data = data

@@ -342,7 +342,7 @@ class ChatQnAService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  chat_template=chat_request.chat_template if chat_request.chat_template else None,
  )
  retriever_parameters = RetrieverParms(

@@ -86,7 +86,7 @@ class ChatQnAService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  chat_template=chat_request.chat_template if chat_request.chat_template else None,
  )
  retriever_parameters = RetrieverParms(
@@ -38,6 +38,7 @@ function start_services() {
  export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}

  # Start Docker Containers
+ sed -i "s|container_name: chatqna-gaudi-backend-server|container_name: chatqna-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log

  n=0

@@ -65,6 +65,7 @@ function start_services() {
  cd "$WORKPATH"/docker_compose/amd/gpu/rocm

  # Start Docker Containers
+ sed -i "s|container_name: chatqna-backend-server|container_name: chatqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose -f compose.yaml up -d > "${LOG_PATH}"/start_services_with_compose.log

  n=0

@@ -38,6 +38,7 @@ function start_services() {
  export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}

  # Start Docker Containers
+ sed -i "s|container_name: chatqna-xeon-backend-server|container_name: chatqna-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log

  n=0

@@ -38,7 +38,7 @@ test_cases:
  top_k: 10
  top_p: 0.95
  repetition_penalty: 1.03
- streaming: true
+ stream: true
  llmserve:
  run_test: true
  service_name: "llm-svc" # Replace with your service name
@@ -53,7 +53,7 @@ class CodeGenService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  )
  result_dict, runtime_graph = await self.megaservice.schedule(
  initial_inputs={"query": prompt}, llm_parameters=parameters

@@ -113,7 +113,7 @@ curl http://${HOST_IP}:${CODEGEN_TGI_SERVICE_PORT}/generate \
  ```bash
  curl http://${HOST_IP}:${CODEGEN_LLM_SERVICE_PORT}/v1/chat/completions\
  -X POST \
- -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+ -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
  -H 'Content-Type: application/json'
  ```

@@ -138,7 +138,7 @@ docker compose up -d
  ```bash
  curl http://${host_ip}:9000/v1/chat/completions\
  -X POST \
- -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+ -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
  -H 'Content-Type: application/json'
  ```

@@ -250,7 +250,7 @@ There are 4 areas worth noting as shown in the screenshot above:

  1. Enter and submit your question
  2. Your previous questions
- 3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support streaming output)
+ 3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support stream output)
  4. Copy or replace code with one click (Note that you need to select the code in the editor first and then click "replace", otherwise the code will be inserted)

  You can also select the code in the editor and ask the AI assistant questions about the code directly.

@@ -119,7 +119,7 @@ docker compose up -d
  ```bash
  curl http://${host_ip}:9000/v1/chat/completions\
  -X POST \
- -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+ -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
  -H 'Content-Type: application/json'
  ```

@@ -227,7 +227,7 @@ There are 4 areas worth noting as shown in the screenshot above:

  1. Enter and submit your question
  2. Your previous questions
- 3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support streaming output)
+ 3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support stream output)
  4. Copy or replace code with one click (Note that you need to select the code in the editor first and then click "replace", otherwise the code will be inserted)

  You can also select the code in the editor and ask the AI assistant questions about the code directly.
@@ -38,7 +38,7 @@ test_cases:
  top_k: 10
  top_p: 0.95
  repetition_penalty: 1.03
- streaming: true
+ stream: true
  llmserve:
  run_test: true
  service_name: "codetrans-llm-svc" # Replace with your service name

@@ -289,7 +289,7 @@ You will have the following Docker Images:

  **summary_type=map_reduce**

- Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+ Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.

  In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

@@ -280,7 +280,7 @@ You will have the following Docker Images:

  **summary_type=map_reduce**

- Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+ Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.

  In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

@@ -231,7 +231,7 @@ class DocSumService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  model=chat_request.model if chat_request.model else None,
  language=chat_request.language if chat_request.language else "auto",
  summary_type=summary_type,

@@ -52,7 +52,7 @@ class EdgeCraftRagService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  chat_template=chat_request.chat_template if chat_request.chat_template else None,
  )
  result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input, llm_parameters=parameters)
@@ -175,7 +175,7 @@ def build_app(cfg, args):
  }
  server_addr = f"http://{MEGA_SERVICE_HOST_IP}:{MEGA_SERVICE_PORT}"

- # Async for streaming response
+ # Async for stream response
  partial_text = ""
  link_urls = []
  image_paths = []

@@ -38,7 +38,7 @@ test_cases:
  top_k: 10
  top_p: 0.95
  repetition_penalty: 1.03
- streaming: true
+ stream: true
  llmserve:
  run_test: false
  service_name: "faq-micro-svc" # Replace with your service name

@@ -124,7 +124,7 @@ docker compose up -d
  ```

  ```bash
- ##enable streaming
+ ##enable stream
  curl http://${host_ip}:8888/v1/faqgen \
  -H "Content-Type: multipart/form-data" \
  -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \

@@ -109,7 +109,7 @@ class FaqGenService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  model=chat_request.model if chat_request.model else None,
  )
  result_dict, runtime_graph = await self.megaservice.schedule(

@@ -64,7 +64,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
  next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
  next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
  next_inputs["top_p"] = llm_parameters_dict["top_p"]
- next_inputs["stream"] = inputs["streaming"]
+ next_inputs["stream"] = inputs["stream"]
  next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
  # next_inputs["presence_penalty"] = inputs["presence_penalty"]
  # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]

@@ -191,7 +191,7 @@ class GraphRAGService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  chat_template=chat_request.chat_template if chat_request.chat_template else None,
  )
  retriever_parameters = RetrieverParms(
@@ -40,6 +40,7 @@ function start_services() {
  export host_ip=${ip_address}

  # Start Docker Containers
+ sed -i "s|container_name: graphrag-gaudi-backend-server|container_name: graphrag-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log

  n=0

@@ -220,7 +220,7 @@ class MultimodalQnAService:
  data = await request.json()
  stream_opt = bool(data.get("stream", False))
  if stream_opt:
- print("[ MultimodalQnAService ] stream=True not used, this has not support streaming yet!")
+ print("[ MultimodalQnAService ] stream=True not used, this has not support stream yet!")
  stream_opt = False
  chat_request = ChatCompletionRequest.model_validate(data)
  # Multimodal RAG QnA With Videos has not yet accepts image as input during QnA.

@@ -263,7 +263,7 @@ class MultimodalQnAService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  chat_template=chat_request.chat_template if chat_request.chat_template else None,
  )
  result_dict, runtime_graph = await cur_megaservice.schedule(

@@ -272,8 +272,8 @@ class MultimodalQnAService:
  for node, response in result_dict.items():
  # the last microservice in this megaservice is LVM.
  # checking if LVM returns StreamingResponse
- # Currently, LVM with LLAVA has not yet supported streaming.
- # @TODO: Will need to test this once LVM with LLAVA supports streaming
+ # Currently, LVM with LLAVA has not yet supported stream.
+ # @TODO: Will need to test this once LVM with LLAVA supports stream
  if (
  isinstance(response, StreamingResponse)
  and node == runtime_graph.all_leaves()[-1]

@@ -62,6 +62,7 @@ function start_services() {
  cd $WORKPATH/docker_compose/intel/hpu/gaudi

  # Start Docker Containers
+ sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
  sleep 2m
  }

@@ -68,6 +68,7 @@ function start_services() {

  # Start Docker Containers
+ sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
  sleep 2m
  }

@@ -61,6 +61,7 @@ function start_services() {

  # Start Docker Containers
+ sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
  docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
  sleep 2m
  }
@@ -277,7 +277,7 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
  ```bash
  curl http://${host_ip}:9000/v1/chat/completions\
  -X POST \
- -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+ -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
  -H 'Content-Type: application/json'
  ```

@@ -140,7 +140,7 @@ curl http://${host_ip}:3006/generate \
  # llm microservice
  curl http://${host_ip}:3007/v1/chat/completions\
  -X POST \
- -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+ -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
  -H 'Content-Type: application/json'

  ```

@@ -138,7 +138,7 @@ curl http://${host_ip}:3006/generate \
  # llm microservice
  curl http://${host_ip}:3007/v1/chat/completions\
  -X POST \
- -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+ -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
  -H 'Content-Type: application/json'

  ```

@@ -96,7 +96,7 @@ class SearchQnAService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  )
  result_dict, runtime_graph = await self.megaservice.schedule(
  initial_inputs={"input": prompt}, llm_parameters=parameters
@@ -85,7 +85,7 @@ class VideoQnAService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  )
  result_dict, runtime_graph = await self.megaservice.schedule(
  initial_inputs={"text": prompt}, llm_parameters=parameters

@@ -38,7 +38,7 @@ test_cases:
  top_k: 10
  top_p: 0.95
  repetition_penalty: 1.03
- streaming: true
+ stream: true
  lvmserve:
  run_test: true
  service_name: "lvm-serving-svc" # Replace with your service name

@@ -52,7 +52,7 @@ class VisualQnAService:
  frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
  presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
  repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
- streaming=stream_opt,
+ stream=stream_opt,
  )
  result_dict, runtime_graph = await self.megaservice.schedule(
  initial_inputs={"prompt": prompt, "image": images[0]}, llm_parameters=parameters

@@ -20,7 +20,7 @@ services:
  model: ${model}
  temperature: ${temperature}
  max_new_tokens: ${max_new_tokens}
- streaming: false
+ stream: false
  tools: /home/user/tools/tools.yaml
  no_proxy: ${no_proxy}
  http_proxy: ${http_proxy}