Rename `streaming` to `stream` to align with the OpenAI API (#1332)

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Author: XinyaoWa
Date: 2025-01-06 13:25:55 +08:00
Committed by: GitHub
Parent: 1f29eca288
Commit: 464e2d3125
53 changed files with 70 additions and 57 deletions
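The change is a one-for-one key rename: every request payload, compose/YAML config field, and `LLMParams`-style argument that previously used `streaming` now uses `stream`, the parameter name the OpenAI Chat Completions API defines. As a minimal client-side sketch of the effect (the host, port, and prompt below are illustrative, not taken from this commit):

```python
import requests

# Illustrative endpoint; substitute your deployed megaservice host and port.
url = "http://localhost:8888/v1/chat/completions"

payload = {
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 17,
    "stream": True,  # before this commit the key was "streaming"
}

# stream=True tells requests not to buffer the whole response, so the
# server-sent chunks can be printed as they arrive.
with requests.post(url, json=payload, stream=True) as response:
    for line in response.iter_lines():
        if line:
            print(line.decode())
```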

View File

@@ -49,7 +49,7 @@ services:
model: ${LLM_MODEL_ID}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
-streaming: false
+stream: false
tools: /home/user/tools/worker_agent_tools.yaml
require_human_feedback: false
RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}
@@ -83,7 +83,7 @@ services:
model: ${LLM_MODEL_ID}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
-streaming: false
+stream: false
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}

View File

@@ -19,7 +19,7 @@ services:
model: ${model}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
-streaming: false
+stream: false
tools: /home/user/tools/worker_agent_tools.yaml
require_human_feedback: false
RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}
@@ -51,7 +51,7 @@ services:
model: ${model}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
-streaming: false
+stream: false
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}

View File

@@ -21,7 +21,7 @@ services:
model: ${LLM_MODEL_ID}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
-streaming: false
+stream: false
tools: /home/user/tools/worker_agent_tools.yaml
require_human_feedback: false
RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}
@@ -55,7 +55,7 @@ services:
model: ${LLM_MODEL_ID}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
-streaming: false
+stream: false
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}

View File

@@ -7,6 +7,7 @@ WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
+export host_ip=${ip_address}
export HF_CACHE_DIR=$WORKDIR/hf_cache
if [ ! -d "$HF_CACHE_DIR" ]; then

View File

@@ -2,7 +2,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-set -e
+set -xe
WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
@@ -82,4 +82,4 @@ echo "=================== #5 Agent and API server stopped===================="
echo y | docker system prune
echo "ALL DONE!"
echo "ALL DONE!!"

View File

@@ -2,7 +2,7 @@
# Copyright (C) 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: Apache-2.0
-set -e
+set -xe
WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
@@ -72,4 +72,4 @@ echo "=================== #5 Agent and API server stopped===================="
echo y | docker system prune
echo "ALL DONE!"
echo "ALL DONE!!"

View File

@@ -26,7 +26,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]
next_inputs["stream"] = inputs["streaming"] # False as default
next_inputs["stream"] = inputs["stream"] # False as default
next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
# next_inputs["presence_penalty"] = inputs["presence_penalty"]
# next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -91,7 +91,7 @@ class AudioQnAService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=False, # TODO add streaming LLM output as input to TTS
+stream=False, # TODO add stream LLM output as input to TTS
)
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs={"audio": chat_request.audio},

View File

@@ -28,7 +28,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]
next_inputs["stream"] = inputs["streaming"] # False as default
next_inputs["stream"] = inputs["stream"] # False as default
next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
# next_inputs["presence_penalty"] = inputs["presence_penalty"]
# next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -103,7 +103,7 @@ class AudioQnAService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=False, # TODO add streaming LLM output as input to TTS
+stream=False, # TODO add stream LLM output as input to TTS
)
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs={"audio": chat_request.audio}, llm_parameters=parameters

View File

@@ -40,7 +40,7 @@ test_cases:
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
-streaming: true
+stream: true
llmserve:
run_test: true
service_name: "llm-svc" # Replace with your service name

View File

@@ -53,7 +53,7 @@ services:
ipc: host
audioqna-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
-container_name: audioqna-xeon-backend-server
+container_name: audioqna-rocm-backend-server
depends_on:
- whisper-service
- tgi-service

View File

@@ -66,7 +66,7 @@ This involves deploying the AudioQnA custom resource. You can use audioQnA_xeon.
```sh
export CLIENT_POD=$(kubectl get pod -n audioqa -l app=client-test -o jsonpath={.items..metadata.name})
export accessUrl=$(kubectl get gmc -n audioqa -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
kubectl exec "$CLIENT_POD" -n audioqa -- curl -s --no-buffer $accessUrl -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json'
kubectl exec "$CLIENT_POD" -n audioqa -- curl -s --no-buffer $accessUrl -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json'
```
> [NOTE]

View File

@@ -44,6 +44,7 @@ function start_services() {
# sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
sed -i "s|container_name: audioqna-gaudi-backend-server|container_name: audioqna-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do

View File

@@ -46,6 +46,7 @@ function start_services() {
# sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
sed -i "s|container_name: audioqna-rocm-backend-server|container_name: audioqna-rocm-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
@@ -63,7 +64,7 @@ function validate_megaservice() {
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-service > $LOG_PATH/tgi-service.log
-docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
+docker logs audioqna-rocm-backend-server > $LOG_PATH/audioqna-rocm-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
if [[ $(file speech.mp3) == *"RIFF"* ]]; then

View File

@@ -45,6 +45,7 @@ function start_services() {
# sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
sed -i "s|container_name: audioqna-xeon-backend-server|container_name: audioqna-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do

View File

@@ -34,7 +34,7 @@ function validate_audioqa() {
export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
echo "$CLIENT_POD"
accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
-byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
+byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json' | jq .byte_str)
echo "$byte_str" > $LOG_PATH/curl_audioqa.log
if [ -z "$byte_str" ]; then
echo "audioqa failed, please check the logs in ${LOG_PATH}!"

View File

@@ -34,7 +34,7 @@ function validate_audioqa() {
export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
echo "$CLIENT_POD"
accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
-byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
+byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json' | jq .byte_str)
echo "$byte_str" > $LOG_PATH/curl_audioqa.log
if [ -z "$byte_str" ]; then
echo "audioqa failed, please check the logs in ${LOG_PATH}!"

View File

@@ -29,7 +29,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]
next_inputs["stream"] = inputs["streaming"] # False as default
next_inputs["stream"] = inputs["stream"] # False as default
next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
# next_inputs["presence_penalty"] = inputs["presence_penalty"]
# next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -112,7 +112,7 @@ class AvatarChatbotService:
top_p=chat_request.top_p if chat_request.top_p else 0.95,
temperature=chat_request.temperature if chat_request.temperature else 0.01,
repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
-streaming=False, # TODO add streaming LLM output as input to TTS
+stream=False, # TODO add stream LLM output as input to TTS
)
# print(parameters)

View File

@@ -71,6 +71,7 @@ function start_services() {
export FPS=10
# Start Docker Containers
sed -i "s|container_name: avatarchatbot-gaudi-backend-server|container_name: avatarchatbot-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do

View File

@@ -71,6 +71,7 @@ function start_services() {
export FPS=10
# Start Docker Containers
sed -i "s|container_name: avatarchatbot-xeon-backend-server|container_name: avatarchatbot-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose up -d
n=0
until [[ "$n" -ge 100 ]]; do

View File

@@ -58,7 +58,7 @@ test_cases:
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
-streaming: true
+stream: true
llmserve:
run_test: false
service_name: "chatqna-tgi" # Replace with your service name

View File

@@ -76,7 +76,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]
next_inputs["stream"] = inputs["streaming"]
next_inputs["stream"] = inputs["stream"]
next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
# next_inputs["presence_penalty"] = inputs["presence_penalty"]
# next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -158,7 +158,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
next_data["inputs"] = prompt
-elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["streaming"]:
+elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["stream"]:
next_data["text"] = data["choices"][0]["message"]["content"]
else:
next_data = data
@@ -342,7 +342,7 @@ class ChatQnAService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
chat_template=chat_request.chat_template if chat_request.chat_template else None,
)
retriever_parameters = RetrieverParms(

View File

@@ -86,7 +86,7 @@ class ChatQnAService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
chat_template=chat_request.chat_template if chat_request.chat_template else None,
)
retriever_parameters = RetrieverParms(

View File

@@ -38,6 +38,7 @@ function start_services() {
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# Start Docker Containers
sed -i "s|container_name: chatqna-gaudi-backend-server|container_name: chatqna-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0

View File

@@ -65,6 +65,7 @@ function start_services() {
cd "$WORKPATH"/docker_compose/amd/gpu/rocm
# Start Docker Containers
sed -i "s|container_name: chatqna-backend-server|container_name: chatqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose -f compose.yaml up -d > "${LOG_PATH}"/start_services_with_compose.log
n=0

View File

@@ -38,6 +38,7 @@ function start_services() {
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# Start Docker Containers
sed -i "s|container_name: chatqna-xeon-backend-server|container_name: chatqna-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0

View File

@@ -38,7 +38,7 @@ test_cases:
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
-streaming: true
+stream: true
llmserve:
run_test: true
service_name: "llm-svc" # Replace with your service name

View File

@@ -53,7 +53,7 @@ class CodeGenService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
)
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs={"query": prompt}, llm_parameters=parameters

View File

@@ -113,7 +113,7 @@ curl http://${HOST_IP}:${CODEGEN_TGI_SERVICE_PORT}/generate \
```bash
curl http://${HOST_IP}:${CODEGEN_LLM_SERVICE_PORT}/v1/chat/completions\
-X POST \
-d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
-H 'Content-Type: application/json'
```

View File

@@ -138,7 +138,7 @@ docker compose up -d
```bash
curl http://${host_ip}:9000/v1/chat/completions\
-X POST \
-d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
-H 'Content-Type: application/json'
```
@@ -250,7 +250,7 @@ There are 4 areas worth noting as shown in the screenshot above:
1. Enter and submit your question
2. Your previous questions
-3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support streaming output)
+3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support stream output)
4. Copy or replace code with one click (Note that you need to select the code in the editor first and then click "replace", otherwise the code will be inserted)
You can also select the code in the editor and ask the AI assistant questions about the code directly.

View File

@@ -119,7 +119,7 @@ docker compose up -d
```bash
curl http://${host_ip}:9000/v1/chat/completions\
-X POST \
-d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
-H 'Content-Type: application/json'
```
@@ -227,7 +227,7 @@ There are 4 areas worth noting as shown in the screenshot above:
1. Enter and submit your question
2. Your previous questions
-3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support streaming output)
+3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support stream output)
4. Copy or replace code with one click (Note that you need to select the code in the editor first and then click "replace", otherwise the code will be inserted)
You can also select the code in the editor and ask the AI assistant questions about the code directly.

View File

@@ -38,7 +38,7 @@ test_cases:
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
-streaming: true
+stream: true
llmserve:
run_test: true
service_name: "codetrans-llm-svc" # Replace with your service name

View File

@@ -289,7 +289,7 @@ You will have the following Docker Images:
**summary_type=map_reduce**
-Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
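The default `chunk_size` formula above is simple enough to sanity-check directly. A small sketch with illustrative token budgets (the upper-case names stand in for the corresponding environment variables; the numbers are examples, not defaults from this repository):

```python
# Illustrative values; in a deployment these come from the
# MAX_TOTAL_TOKENS and MAX_INPUT_TOKENS environment variables.
MAX_TOTAL_TOKENS = 4096   # model's total context budget
MAX_INPUT_TOKENS = 3072   # largest input the serving engine accepts
max_tokens = 512          # input.max_tokens: requested summary length

# Default chunk_size for map_reduce mode: leave room for the generated
# summary plus a 50-token margin, and never exceed the input window.
chunk_size = min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)
print(chunk_size)  # min(3534, 3072) -> 3072
```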

View File

@@ -280,7 +280,7 @@ You will have the following Docker Images:
**summary_type=map_reduce**
-Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

View File

@@ -231,7 +231,7 @@ class DocSumService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
model=chat_request.model if chat_request.model else None,
language=chat_request.language if chat_request.language else "auto",
summary_type=summary_type,

View File

@@ -52,7 +52,7 @@ class EdgeCraftRagService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
chat_template=chat_request.chat_template if chat_request.chat_template else None,
)
result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input, llm_parameters=parameters)

View File

@@ -175,7 +175,7 @@ def build_app(cfg, args):
}
server_addr = f"http://{MEGA_SERVICE_HOST_IP}:{MEGA_SERVICE_PORT}"
-# Async for streaming response
+# Async for stream response
partial_text = ""
link_urls = []
image_paths = []

View File

@@ -38,7 +38,7 @@ test_cases:
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
-streaming: true
+stream: true
llmserve:
run_test: false
service_name: "faq-micro-svc" # Replace with your service name

View File

@@ -124,7 +124,7 @@ docker compose up -d
```
```bash
-##enable streaming
+##enable stream
curl http://${host_ip}:8888/v1/faqgen \
-H "Content-Type: multipart/form-data" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \

View File

@@ -109,7 +109,7 @@ class FaqGenService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
model=chat_request.model if chat_request.model else None,
)
result_dict, runtime_graph = await self.megaservice.schedule(

View File

@@ -64,7 +64,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]
next_inputs["stream"] = inputs["streaming"]
next_inputs["stream"] = inputs["stream"]
next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
# next_inputs["presence_penalty"] = inputs["presence_penalty"]
# next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -191,7 +191,7 @@ class GraphRAGService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
chat_template=chat_request.chat_template if chat_request.chat_template else None,
)
retriever_parameters = RetrieverParms(

View File

@@ -40,6 +40,7 @@ function start_services() {
export host_ip=${ip_address}
# Start Docker Containers
sed -i "s|container_name: graphrag-gaudi-backend-server|container_name: graphrag-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0

View File

@@ -220,7 +220,7 @@ class MultimodalQnAService:
data = await request.json()
stream_opt = bool(data.get("stream", False))
if stream_opt:
print("[ MultimodalQnAService ] stream=True not used, this has not support streaming yet!")
print("[ MultimodalQnAService ] stream=True not used, this has not support stream yet!")
stream_opt = False
chat_request = ChatCompletionRequest.model_validate(data)
# Multimodal RAG QnA With Videos has not yet accepts image as input during QnA.
@@ -263,7 +263,7 @@ class MultimodalQnAService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
chat_template=chat_request.chat_template if chat_request.chat_template else None,
)
result_dict, runtime_graph = await cur_megaservice.schedule(
@@ -272,8 +272,8 @@ class MultimodalQnAService:
for node, response in result_dict.items():
# the last microservice in this megaservice is LVM.
# checking if LVM returns StreamingResponse
-# Currently, LVM with LLAVA has not yet supported streaming.
-# @TODO: Will need to test this once LVM with LLAVA supports streaming
+# Currently, LVM with LLAVA has not yet supported stream.
+# @TODO: Will need to test this once LVM with LLAVA supports stream
if (
isinstance(response, StreamingResponse)
and node == runtime_graph.all_leaves()[-1]

View File

@@ -62,6 +62,7 @@ function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
# Start Docker Containers
sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
sleep 2m
}

View File

@@ -68,6 +68,7 @@ function start_services() {
# Start Docker Containers
sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
sleep 2m
}

View File

@@ -61,6 +61,7 @@ function start_services() {
# Start Docker Containers
sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
sleep 2m
}

View File

@@ -277,7 +277,7 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
```bash
curl http://${host_ip}:9000/v1/chat/completions\
-X POST \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
-H 'Content-Type: application/json'
```

View File

@@ -140,7 +140,7 @@ curl http://${host_ip}:3006/generate \
# llm microservice
curl http://${host_ip}:3007/v1/chat/completions\
-X POST \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
-H 'Content-Type: application/json'
```

View File

@@ -138,7 +138,7 @@ curl http://${host_ip}:3006/generate \
# llm microservice
curl http://${host_ip}:3007/v1/chat/completions\
-X POST \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
-H 'Content-Type: application/json'
```

View File

@@ -96,7 +96,7 @@ class SearchQnAService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
)
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs={"input": prompt}, llm_parameters=parameters

View File

@@ -85,7 +85,7 @@ class VideoQnAService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
)
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs={"text": prompt}, llm_parameters=parameters

View File

@@ -38,7 +38,7 @@ test_cases:
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
-streaming: true
+stream: true
lvmserve:
run_test: true
service_name: "lvm-serving-svc" # Replace with your service name

View File

@@ -52,7 +52,7 @@ class VisualQnAService:
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-streaming=stream_opt,
+stream=stream_opt,
)
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs={"prompt": prompt, "image": images[0]}, llm_parameters=parameters

View File

@@ -20,7 +20,7 @@ services:
model: ${model}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
-streaming: false
+stream: false
tools: /home/user/tools/tools.yaml
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}