Compare commits

6 Commits

Author SHA1 Message Date
Ying Hu
b10456e42a Update test_compose_vllm_on_xeon.sh
move the vllm-service
2025-03-04 19:11:12 +08:00
pre-commit-ci[bot]
3fb6cb590c [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-03-04 09:06:37 +00:00
Ying Hu
75ee579021 Create test_compose_vllm_on_xeon.sh for vLLM
Create test_compose_vllm_on_xeon.sh for vLLM
2025-03-04 17:05:30 +08:00
Ying Hu
768f1a45e2 Create compose_vllm.yaml for vLLM
Create compose_vllm.yaml for vLLM
2025-03-04 17:01:41 +08:00
pre-commit-ci[bot]
c4dffdad80 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-03-04 08:59:57 +00:00
Ying Hu
2a0c4ccb81 Support vLLM for DBQnA
Support vLLM for DBQnA
1. update Readme
2025-03-04 16:57:49 +08:00
89 changed files with 1185 additions and 1449 deletions

View File

@@ -12,7 +12,6 @@ run_matrix="{\"include\":["
examples=$(printf '%s\n' "${changed_files[@]}" | grep '/' | cut -d'/' -f1 | sort -u)
for example in ${examples}; do
if [[ ! -d $WORKSPACE/$example ]]; then continue; fi
cd $WORKSPACE/$example
if [[ ! $(find . -type f | grep ${test_mode}) ]]; then continue; fi
cd tests

View File

@@ -8,7 +8,7 @@ services:
ports:
- "${AGENTQNA_TGI_SERVICE_PORT-8085}:80"
volumes:
- ${HF_CACHE_DIR:-/var/opea/agent-service/}:/data
- /var/opea/agent-service/:/data
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -9,7 +9,7 @@ echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export host_ip=${ip_address}
export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
export HF_CACHE_DIR=$WORKDIR/hf_cache
if [ ! -d "$HF_CACHE_DIR" ]; then
echo "Creating HF_CACHE directory"
mkdir -p "$HF_CACHE_DIR"

View File

@@ -13,7 +13,7 @@ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
model="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
export HF_CACHE_DIR=${model_cache:-"/data2/huggingface"}
export HF_CACHE_DIR=/data2/huggingface
if [ ! -d "$HF_CACHE_DIR" ]; then
HF_CACHE_DIR=$WORKDIR/hf_cache
mkdir -p "$HF_CACHE_DIR"

View File

@@ -11,7 +11,7 @@ export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
export HF_CACHE_DIR=$WORKDIR/hf_cache
if [ ! -d "$HF_CACHE_DIR" ]; then
mkdir -p "$HF_CACHE_DIR"
fi

View File

@@ -1,8 +1,48 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
COPY ./audioqna.py $HOME/audioqna.py

View File

@@ -1,8 +1,48 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
COPY ./audioqna_multilang.py $HOME/audioqna_multilang.py

View File

@@ -16,14 +16,13 @@ SPEECHT5_SERVER_HOST_IP = os.getenv("SPEECHT5_SERVER_HOST_IP", "0.0.0.0")
SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
if self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
next_inputs["model"] = LLM_MODEL_ID
next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]

View File

@@ -17,7 +17,6 @@ GPT_SOVITS_SERVER_HOST_IP = os.getenv("GPT_SOVITS_SERVER_HOST_IP", "0.0.0.0")
GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -25,7 +24,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
if self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
next_inputs["model"] = LLM_MODEL_ID
next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]

View File

@@ -69,7 +69,6 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -30,7 +30,7 @@ services:
ports:
- "3006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -61,7 +61,6 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -31,7 +31,7 @@ services:
ports:
- "3006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -40,7 +40,7 @@ services:
ports:
- "3006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -82,7 +82,6 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -30,7 +30,7 @@ services:
ports:
- "${CHATQNA_TEI_EMBEDDING_PORT}:80"
volumes:
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
- "/var/opea/chatqna-service/data:/data"
shm_size: 1g
ipc: host
environment:
@@ -72,7 +72,7 @@ services:
ports:
- "${CHATQNA_TEI_RERANKING_PORT}:80"
volumes:
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
- "/var/opea/chatqna-service/data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -104,7 +104,7 @@ services:
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
volumes:
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
- "/var/opea/chatqna-service/data:/data"
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd

View File

@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}

View File

@@ -28,7 +28,7 @@ services:
ports:
- "6006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -59,7 +59,7 @@ services:
ports:
- "8808:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -75,7 +75,7 @@ services:
ports:
- "9009:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}

View File

@@ -32,7 +32,7 @@ services:
ports:
- "6040:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "6041:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "6042:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "9009:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -62,7 +62,7 @@ services:
ports:
- "8808:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -83,7 +83,7 @@ services:
ports:
- "8007:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "8088:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -70,7 +70,7 @@ services:
ports:
- "8090:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -103,7 +103,7 @@ services:
ports:
- "8808:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -124,7 +124,7 @@ services:
ports:
- "8008:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -85,7 +85,7 @@ services:
ports:
- "8005:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8007:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"/var/opea/chatqna-service/data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -8,7 +8,7 @@ services:
ports:
- "8028:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -8,7 +8,7 @@ services:
ports:
- "8028:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -2,8 +2,6 @@
This document outlines the deployment process for a CodeTrans application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution using the `llm` microservice. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; see the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page and the quick preview below.
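As a quick preview of the commands detailed later in this guide, choosing the serving backend comes down to which compose file you bring up:
```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/cpu/xeon
# vLLM backend (the default)
docker compose -f compose.yaml up -d
# TGI backend
docker compose -f compose_tgi.yaml up -d
```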
## 🚀 Create an AWS Xeon Instance
To run the example on an AWS Xeon instance, start by creating an AWS account if you don't already have one. Then get started with the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home). AWS EC2 M7i, C7i, C7i-flex, and M7i-flex instances, built on Intel Xeon Scalable processors (code-named Sapphire Rapids), are suitable for this task.
@@ -65,37 +63,6 @@ By default, the LLM model is set to a default value as listed below:
Change `LLM_MODEL_ID` below to suit your needs.
Users in China who cannot download models directly from Hugging Face can use [ModelScope](https://www.modelscope.cn/models) or a Hugging Face mirror instead. vLLM/TGI can load models either online or offline, as described below:
1. Online
```bash
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="mistralai/Mistral-7B-Instruct-v0.3"
# Start vLLM LLM Service
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```
2. Offline
- Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for model `mistralai/Mistral-7B-Instruct-v0.3`.
- Click the `Download this model` button and choose a method to download the model to your local path `/path/to/model`.
- Run the following command to start the LLM service.
```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```
### Setup Environment Variables
1. Set the required environment variables:
@@ -128,47 +95,15 @@ For users in China who are unable to download models directly from Huggingface,
```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/cpu/xeon
```
If using vLLM as the LLM serving backend:
```bash
docker compose -f compose.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
docker compose up -d
```
### Validate Microservices
1. LLM backend Service
On its first startup, this service takes extra time to download, load, and warm up the model. Once that finishes, the service is ready.
Use the commands below to check whether the LLM serving backend is ready.
1. TGI Service
```bash
# vLLM service
docker logs codetrans-xeon-vllm-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs codetrans-xeon-tgi-service | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
Then try the `cURL` command below to validate services.
```bash
# either vLLM or TGI service
curl http://${host_ip}:8008/v1/chat/completions \
curl http://${host_ip}:8008/generate \
-X POST \
-d '{"inputs":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'

View File

@@ -2,32 +2,31 @@
# SPDX-License-Identifier: Apache-2.0
services:
vllm-service:
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
container_name: codetrans-xeon-vllm-service
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: codetrans-tgi-service
ports:
- "8008:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: codetrans-xeon-llm-server
container_name: llm-textgen-server
depends_on:
vllm-service:
tgi-service:
condition: service_healthy
ports:
- "9000:9000"
@@ -36,19 +35,18 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-xeon-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-xeon-backend-server
depends_on:
- vllm-service
- tgi-service
- llm
ports:
- "${BACKEND_SERVICE_PORT:-7777}:7777"
- "7777:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -63,7 +61,7 @@ services:
depends_on:
- codetrans-xeon-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
- "5173:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}

View File

@@ -1,95 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: codetrans-xeon-tgi-service
ports:
- "8008:80"
volumes:
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
host_ip: ${host_ip}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: codetrans-xeon-llm-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-xeon-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-xeon-backend-server
depends_on:
- tgi-service
- llm
ports:
- "${BACKEND_SERVICE_PORT:-7777}:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
ipc: host
restart: always
codetrans-xeon-ui-server:
image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
container_name: codetrans-xeon-ui-server
depends_on:
- codetrans-xeon-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
codetrans-xeon-nginx-server:
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
container_name: codetrans-xeon-nginx-server
depends_on:
- codetrans-xeon-backend-server
- codetrans-xeon-ui-server
ports:
- "${NGINX_PORT:-80}:80"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP}
- FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT}
- BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME}
- BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP}
- BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -2,8 +2,6 @@
This document outlines the deployment process for a CodeTrans application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution using the `llm` microservice. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; see the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page and the quick preview below.
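The same choice applies on Gaudi; as shown later in this guide, each backend has its own compose file:
```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/hpu/gaudi
# vLLM backend (the default)
docker compose -f compose.yaml up -d
# TGI backend
docker compose -f compose_tgi.yaml up -d
```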
## 🚀 Build Docker Images
First of all, you need to build the Docker images locally and install the corresponding Python package. This step can be skipped once the Docker images are published to Docker Hub.
@@ -57,37 +55,6 @@ By default, the LLM model is set to a default value as listed below:
Change `LLM_MODEL_ID` below to suit your needs.
Users in China who cannot download models directly from Hugging Face can use [ModelScope](https://www.modelscope.cn/models) or a Hugging Face mirror instead. vLLM/TGI can load models either online or offline, as described below:
1. Online
```bash
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="mistralai/Mistral-7B-Instruct-v0.3"
# Start vLLM LLM Service
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```
2. Offline
- Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for model `mistralai/Mistral-7B-Instruct-v0.3`.
- Click the `Download this model` button and choose a method to download the model to your local path `/path/to/model`.
- Run the following command to start the LLM service.
```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```
### Setup Environment Variables
1. Set the required environment variables:
@@ -120,43 +87,12 @@ For users in China who are unable to download models directly from Huggingface,
```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/hpu/gaudi
```
If using vLLM as the LLM serving backend:
```bash
docker compose -f compose.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
docker compose up -d
```
### Validate Microservices
1. LLM backend Service
On its first startup, this service takes extra time to download, load, and warm up the model. Once that finishes, the service is ready.
Use the commands below to check whether the LLM serving backend is ready.
```bash
# vLLM service
docker logs codetrans-gaudi-vllm-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs codetrans-gaudi-tgi-service | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
Then try the `cURL` command below to validate services.
1. TGI Service
```bash
curl http://${host_ip}:8008/generate \

View File

@@ -2,38 +2,39 @@
# SPDX-License-Identifier: Apache-2.0
services:
vllm-service:
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
container_name: codetrans-gaudi-vllm-service
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: codetrans-tgi-service
ports:
- "8008:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
NUM_CARDS: ${NUM_CARDS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
test: ["CMD-SHELL", "sleep 500 && exit 0"]
interval: 1s
timeout: 505s
retries: 1
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: codetrans-xeon-llm-server
container_name: llm-textgen-gaudi-server
depends_on:
vllm-service:
tgi-service:
condition: service_healthy
ports:
- "9000:9000"
@@ -42,19 +43,18 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-gaudi-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-gaudi-backend-server
depends_on:
- vllm-service
- tgi-service
- llm
ports:
- "${BACKEND_SERVICE_PORT:-7777}:7777"
- "7777:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -69,7 +69,7 @@ services:
depends_on:
- codetrans-gaudi-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
- "5173:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}

View File

@@ -1,99 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: codetrans-gaudi-tgi-service
ports:
- "8008:80"
volumes:
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: codetrans-gaudi-llm-server
depends_on:
- tgi-service
ports:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-gaudi-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-gaudi-backend-server
depends_on:
- tgi-service
- llm
ports:
- "${BACKEND_SERVICE_PORT:-7777}:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
ipc: host
restart: always
codetrans-gaudi-ui-server:
image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
container_name: codetrans-gaudi-ui-server
depends_on:
- codetrans-gaudi-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
codetrans-gaudi-nginx-server:
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
container_name: codetrans-gaudi-nginx-server
depends_on:
- codetrans-gaudi-backend-server
- codetrans-gaudi-ui-server
ports:
- "${NGINX_PORT:-80}:80"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP}
- FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT}
- BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME}
- BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP}
- BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -8,12 +8,7 @@ popd > /dev/null
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7777/v1/codetrans"

View File

@@ -23,18 +23,6 @@ services:
dockerfile: comps/llms/src/text-generation/Dockerfile
extends: codetrans
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
vllm:
build:
context: vllm
dockerfile: Dockerfile.cpu
extends: codetrans
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
vllm-gaudi:
build:
context: vllm-fork
dockerfile: Dockerfile.hpu
extends: codetrans
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
nginx:
build:
context: GenAIComps

View File

@@ -30,12 +30,12 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen vllm-gaudi nginx"
service_list="codetrans codetrans-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
@@ -45,12 +45,7 @@ function start_services() {
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
@@ -70,15 +65,13 @@ function start_services() {
n=0
until [[ "$n" -ge 100 ]]; do
docker logs codetrans-gaudi-vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
docker logs codetrans-tgi-service > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
sleep 1m
}
function validate_services() {
@@ -110,19 +103,27 @@ function validate_services() {
}
function validate_microservices() {
# tgi llm serving backend
validate_services \
"${ip_address}:8008/generate" \
"generated_text" \
"tgi" \
"codetrans-tgi-service" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
"codetrans-xeon-llm-server" \
"llm-textgen-gaudi-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"${ip_address}:7777/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-gaudi-backend-server" \
@@ -130,7 +131,7 @@ function validate_megaservice() {
# test the megaservice via nginx
validate_services \
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
"${ip_address}:80/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-gaudi-nginx-server" \
@@ -169,7 +170,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
docker compose -f compose.yaml stop && docker compose rm -f
docker compose stop && docker compose rm -f
}
function main() {

View File

@@ -30,16 +30,12 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone https://github.com/vllm-project/vllm.git && cd vllm
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen vllm nginx"
service_list="codetrans codetrans-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
docker images && sleep 1s
}
@@ -48,8 +44,7 @@ function start_services() {
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
@@ -65,19 +60,17 @@ function start_services() {
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
docker logs codetrans-xeon-vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
docker logs codetrans-tgi-service > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
sleep 1m
}
function validate_services() {
@@ -109,12 +102,20 @@ function validate_services() {
}
function validate_microservices() {
# tgi llm serving backend
validate_services \
"${ip_address}:8008/generate" \
"generated_text" \
"tgi" \
"codetrans-tgi-service" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
"codetrans-xeon-llm-server" \
"llm-textgen-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
@@ -122,7 +123,7 @@ function validate_microservices() {
function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"${ip_address}:7777/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-xeon-backend-server" \
@@ -130,7 +131,7 @@ function validate_megaservice() {
# test the megaservice via nginx
validate_services \
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
"${ip_address}:80/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-xeon-nginx-server" \
@@ -168,7 +169,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml stop && docker compose rm -f
docker compose stop && docker compose rm -f
}
function main() {

View File

@@ -1,194 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi/
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7777/v1/codetrans"
export FRONTEND_SERVICE_IP=${ip_address}
export FRONTEND_SERVICE_PORT=5173
export BACKEND_SERVICE_NAME=codetrans
export BACKEND_SERVICE_IP=${ip_address}
export BACKEND_SERVICE_PORT=7777
export NGINX_PORT=80
export host_ip=${ip_address}
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
docker logs codetrans-gaudi-tgi-service > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
sleep 1m
}
function validate_services() {
local URL="$1"
local EXPECTED_RESULT="$2"
local SERVICE_NAME="$3"
local DOCKER_NAME="$4"
local INPUT_DATA="$5"
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
echo "[ $SERVICE_NAME ] Content is as expected."
else
echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
fi
else
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
fi
sleep 5s
}
function validate_microservices() {
# tgi llm serving backend
validate_services \
"${ip_address}:8008/generate" \
"generated_text" \
"tgi" \
"codetrans-gaudi-tgi-service" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
"codetrans-gaudi-llm-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-gaudi-backend-server" \
'{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
# test the megaservice via nginx
validate_services \
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-gaudi-nginx-server" \
'{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
}
function validate_frontend() {
cd $WORKPATH/ui/svelte
local conda_env_name="OPEA_e2e"
export PATH=${HOME}/miniforge3/bin/:$PATH
if conda info --envs | grep -q "$conda_env_name"; then
echo "$conda_env_name exist!"
else
conda create -n ${conda_env_name} python=3.12 -y
fi
source activate ${conda_env_name}
sed -i "s/localhost/$ip_address/g" playwright.config.ts
conda install -c conda-forge nodejs=22.6.0 -y
npm install && npm ci && npx playwright install --with-deps
node -v && npm -v && pip list
exit_status=0
npx playwright test || exit_status=$?
if [ $exit_status -ne 0 ]; then
echo "[TEST INFO]: ---------frontend test failed---------"
exit $exit_status
else
echo "[TEST INFO]: ---------frontend test passed---------"
fi
}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi/
docker compose -f compose_tgi.yaml stop && docker compose rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_microservices
validate_megaservice
validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -1,194 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7777/v1/codetrans"
export FRONTEND_SERVICE_IP=${ip_address}
export FRONTEND_SERVICE_PORT=5173
export BACKEND_SERVICE_NAME=codetrans
export BACKEND_SERVICE_IP=${ip_address}
export BACKEND_SERVICE_PORT=7777
export NGINX_PORT=80
export host_ip=${ip_address}
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
docker logs codetrans-xeon-tgi-service > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
sleep 1m
}
function validate_services() {
local URL="$1"
local EXPECTED_RESULT="$2"
local SERVICE_NAME="$3"
local DOCKER_NAME="$4"
local INPUT_DATA="$5"
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
echo "[ $SERVICE_NAME ] Content is as expected."
else
echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
fi
else
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
fi
sleep 5s
}
function validate_microservices() {
# tgi llm serving backend
validate_services \
"${ip_address}:8008/generate" \
"generated_text" \
"tgi" \
"codetrans-xeon-tgi-service" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
"codetrans-xeon-llm-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-xeon-backend-server" \
'{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
# test the megaservice via nginx
validate_services \
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-xeon-nginx-server" \
'{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
}
function validate_frontend() {
cd $WORKPATH/ui/svelte
local conda_env_name="OPEA_e2e"
export PATH=${HOME}/miniforge3/bin/:$PATH
if conda info --envs | grep -q "$conda_env_name"; then
echo "$conda_env_name exist!"
else
conda create -n ${conda_env_name} python=3.12 -y
fi
source activate ${conda_env_name}
sed -i "s/localhost/$ip_address/g" playwright.config.ts
conda install -c conda-forge nodejs=22.6.0 -y
npm install && npm ci && npx playwright install --with-deps
node -v && npm -v && pip list
exit_status=0
npx playwright test || exit_status=$?
if [ $exit_status -ne 0 ]; then
echo "[TEST INFO]: ---------frontend test failed---------"
exit $exit_status
else
echo "[TEST INFO]: ---------frontend test passed---------"
fi
}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose -f compose_tgi.yaml stop && docker compose rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_microservices
validate_megaservice
validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -51,16 +51,20 @@ Since the `compose.yaml` will consume some environment variables, you need to se
export host_ip=$(hostname -I | awk '{print $1}')
# Example: no_proxy="localhost,127.0.0.1,192.168.1.1"
export no_proxy=${no_proxy},${host_ip}
export no_proxy=${no_proxy},${host_ip},dbqna-xeon-react-ui-server,text2sql-service,vllm-service,tgi-service
# If you are in a proxy environment, also set the proxy-related environment variables:
export http_proxy=${http_proxy}
export https_proxy=${https_proxy}
# Set other required variables
#TGI Service
export TGI_PORT=8008
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_PORT}
#vLLM Service
export LLM_ENDPOINT_PORT=8008
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export POSTGRES_USER=postgres
@@ -89,6 +93,13 @@ cd GenAIExamples/DBQnA/docker_compose/intel/cpu/xeon
docker compose up -d
```
Or use the vLLM service:
```bash
cd GenAIExamples/DBQnA/docker_compose/intel/cpu/xeon
docker compose -f compose_vllm.yaml up -d
```
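Before validating, you can confirm the vLLM backend has finished starting up. A minimal check, mirroring the readiness probe used in the CI test script:
```bash
# the container is named vllm-service in compose_vllm.yaml
docker logs vllm-service 2>&1 | grep complete
# expected once the service is ready:
# INFO:     Application startup complete.
```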
#### 2.2.2 Alternatively, start the microservices by running individual Docker services
**NOTE:** Make sure all of the individual Docker services are down before starting them (see the sketch below).
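A minimal sketch of how to check for and clear leftover containers, assuming the `test-text2sql-*` and `test-dbqna-*` container names used in the commands below:
```bash
# list any containers left over from a previous run
docker ps -a --filter "name=test-text2sql" --filter "name=test-dbqna"
# stop and remove them before starting fresh
docker stop test-text2sql-postgres test-text2sql-tgi-endpoint test-dbqna-react-ui-server 2>/dev/null || true
docker rm test-text2sql-postgres test-text2sql-tgi-endpoint test-dbqna-react-ui-server 2>/dev/null || true
```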
@@ -108,7 +119,7 @@ docker run --name test-text2sql-postgres --ipc=host -e POSTGRES_USER=${POSTGRES_
```bash
docker run -d --name="test-text2sql-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HF_TOKEN} -e model=${model} ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id $model
docker run -d --name="test-text2sql-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HF_TOKEN} -e model=${model} ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model
```
- Start Text-to-SQL Service
@@ -127,7 +138,9 @@ docker run -d --name="test-dbqna-react-ui-server" --ipc=host -p 5174:80 -e no_pr
## 🚀 Validate Microservices
### 3.1 TGI Service
### 3.1 TGI Service or vLLM Service
TGI Service
```bash
@@ -137,6 +150,13 @@ curl http://${host_ip}:$TGI_PORT/generate \
-H 'Content-Type: application/json'
```
vLLM Service
```bash
curl http://${host_ip}:8008/v1/chat/completions \
-X POST \
-d '{"model":"mistralai/Mistral-7B-Instruct-v0.3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-H 'Content-Type: application/json'
```
### 3.2 Postgres Microservice
Once the Text-to-SQL microservice is started, you can use the command below.
@@ -147,7 +167,7 @@ Once Text-to-SQL microservice is started, user can use below command
curl --location http://${host_ip}:9090/v1/postgres/health \
--header 'Content-Type: application/json' \
--data '{"user": "'${POSTGRES_USER}'","password": "'${POSTGRES_PASSWORD}'","host": "'${host_ip}'", "port": "5442", "database": "'${POSTGRES_DB}'"}'
```
#### 3.2.2 Invoke the microservice.

View File

@@ -8,7 +8,7 @@ services:
ports:
- "8008:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -0,0 +1,67 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
vllm-service:
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
container_name: vllm-service
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "${DATA_PATH:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "${VLLM_TORCH_PROFILER_DIR:-/mnt}"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
postgres:
image: postgres:latest
container_name: postgres-container
restart: always
environment:
- POSTGRES_USER=${POSTGRES_USER}
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
- POSTGRES_DB=${POSTGRES_DB}
ports:
- '5442:5432'
volumes:
- ./chinook.sql:/docker-entrypoint-initdb.d/chinook.sql
text2sql-service:
image: ${REGISTRY:-opea}/text2sql:${TAG:-latest}
container_name: text2sql-service
ports:
- "9090:8080"
environment:
- TGI_LLM_ENDPOINT=${LLM_ENDPOINT}
dbqna-xeon-react-ui-server:
image: ${REGISTRY:-opea}/text2sql-react-ui:${TAG:-latest}
container_name: dbqna-xeon-react-ui-server
depends_on:
- text2sql-service
ports:
- "5174:80"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
ipc: host
restart: always
networks:
default:
driver: bridge
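A minimal launch sequence for this file might look as follows (illustrative values; see the README for the full variable list):
```bash
export host_ip=$(hostname -I | awk '{print $1}')
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export HUGGINGFACEHUB_API_TOKEN=<your_hf_token>
export POSTGRES_USER=postgres POSTGRES_PASSWORD=testpwd POSTGRES_DB=chinook
export LLM_ENDPOINT_PORT=8008
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
docker compose -f compose_vllm.yaml up -d
```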

View File

@@ -0,0 +1,137 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH
git clone https://github.com/vllm-project/vllm.git
cd ./vllm/
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
docker build --no-cache -f Dockerfile.cpu -t ${REGISTRY:-opea}/vllm:${TAG:-latest} --shm-size=128g .
if [ $? -ne 0 ]; then
echo "opea/vllm built fail"
exit 1
else
echo "opea/vllm built successful"
fi
cd $WORKPATH/docker_image_build
git clone --single-branch --branch "${opea_branch:-"main"}" https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details... #vllm-service"
service_list="text2sql text2sql-react-ui"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
}
function start_service() {
cd $WORKPATH/docker_compose/intel/cpu/xeon
export model="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_MODEL_ID=${model}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export POSTGRES_USER=postgres
export POSTGRES_PASSWORD=testpwd
export POSTGRES_DB=chinook
export TEXT2SQL_PORT=9090
export LLM_ENDPOINT_PORT=8008
export LLM_ENDPOINT="http://${ip_address}:${LLM_ENDPOINT_PORT}"
# Start Docker Containers
docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
# check whether vLLM is fully ready.
n=0
until [[ "$n" -ge 100 ]]; do
docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
}
function validate_microservice() {
result=$(http_proxy="" curl --connect-timeout 5 --max-time 120000 http://${ip_address}:$TEXT2SQL_PORT/v1/text2sql\
-X POST \
-d '{"input_text": "Find the total number of Albums.","conn_str": {"user": "'${POSTGRES_USER}'","password": "'${POSTGRES_PASSWORD}'","host": "'${ip_address}'", "port": "5442", "database": "'${POSTGRES_DB}'" }}' \
-H 'Content-Type: application/json')
if [[ $result == *"output"* ]]; then
echo $result
echo "Result correct."
else
echo "Result wrong. Received was $result"
docker logs text2sql-service > ${LOG_PATH}/text2sql.log
docker logs vllm-service > ${LOG_PATH}/vllm.log
exit 1
fi
}
function validate_frontend() {
echo "[ TEST INFO ]: --------- frontend test started ---------"
cd $WORKPATH/ui/react
local conda_env_name="OPEA_e2e"
export PATH=${HOME}/miniforge3/bin/:$PATH
if conda info --envs | grep -q "$conda_env_name"; then
echo "$conda_env_name exist!"
else
conda create -n ${conda_env_name} python=3.12 -y
fi
source activate ${conda_env_name}
echo "[ TEST INFO ]: --------- conda env activated ---------"
conda install -c conda-forge nodejs=22.6.0 -y
npm install && npm ci
node -v && npm -v && pip list
exit_status=0
npm run test || exit_status=$?
if [ $exit_status -ne 0 ]; then
echo "[TEST INFO]: ---------frontend test failed---------"
exit $exit_status
else
echo "[TEST INFO]: ---------frontend test passed---------"
fi
}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon
docker compose -f compose_vllm.yaml stop && docker compose -f compose_vllm.yaml rm -f
}
function main() {
stop_docker
build_docker_images
start_service
validate_microservice
validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -1,8 +1,48 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
COPY ./retrieval_tool.py $HOME/retrieval_tool.py

View File

@@ -38,7 +38,7 @@ services:
ports:
- "6006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -96,7 +96,7 @@ services:
ports:
- "8808:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -34,7 +34,7 @@ services:
ports:
- "8090:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -95,7 +95,7 @@ services:
ports:
- "8808:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -21,7 +21,7 @@ services:
timeout: 10s
retries: 100
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}

View File

@@ -1,8 +1,48 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
COPY ./chatqna.py $HOME/chatqna.py

View File

@@ -37,8 +37,7 @@ RUN mkdir -p /home/user/gradio_cache
ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
WORKDIR /home/user/edgecraftrag
RUN pip install --no-cache-dir --upgrade pip setuptools==70.0.0 && \
pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
WORKDIR /home/user/

View File

@@ -1,6 +1,6 @@
docx2txt
faiss-cpu>=1.8.0.post1
langchain-core>=0.2.29
langchain-core==0.2.29
llama-index>=0.11.0
llama-index-embeddings-openvino>=0.4.0
llama-index-llms-openai-like>=0.2.0
@@ -9,7 +9,7 @@ llama-index-postprocessor-openvino-rerank>=0.3.0
llama-index-readers-file>=0.4.0
llama-index-retrievers-bm25>=0.3.0
llama-index-vector-stores-faiss>=0.2.1
opea-comps>=1.2
opea-comps>=0.9
pillow>=10.4.0
python-docx==1.1.2
unstructured==0.16.11

View File

@@ -15,7 +15,7 @@ RUN mkdir -p /home/user/gradio_cache
ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
WORKDIR /home/user/ui
RUN pip install --no-cache-dir --upgrade pip setuptools==70.0.0 && \
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r requirements.txt
USER user

View File

@@ -1,8 +1,48 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
COPY ./faqgen.py $HOME/faqgen.py

View File

@@ -8,7 +8,7 @@ services:
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

File diff suppressed because one or more lines are too long

View File

@@ -72,7 +72,7 @@ Here is an example of `Nike 2023` pdf.
# download pdf file
wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
# upload pdf file with dataprep
curl -X POST "http://${host_ip}:11103/v1/dataprep/ingest" \
curl -X POST "http://${host_ip}:6004/v1/dataprep/ingest" \
-H "Content-Type: multipart/form-data" \
-F "files=@./nke-10k-2023.pdf"
```
@@ -80,7 +80,8 @@ curl -X POST "http://${host_ip}:11103/v1/dataprep/ingest" \
```bash
curl http://${host_ip}:8888/v1/graphrag \
-H "Content-Type: application/json" \
-d '{"messages": [{"role": "user","content": "where do Nike subsidiaries operate?
-d '{
"model": "gpt-4o-mini","messages": [{"role": "user","content": "What is the revenue of Nike in 2023?
"}]}'
```
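The megaservice streams its answer as server-sent events (`data: ` lines), so disabling curl's output buffering shows tokens as they arrive (a sketch):
```bash
curl -N http://${host_ip}:8888/v1/graphrag \
-H "Content-Type: application/json" \
-d '{"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "What is the revenue of Nike in 2023?"}]}'
```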

View File

@@ -5,65 +5,52 @@ services:
neo4j-apoc:
image: neo4j:latest
container_name: neo4j-apoc
ports:
- "${NEO4J_PORT1:-7474}:7474"
- "${NEO4J_PORT2:-7687}:7687"
volumes:
- ./data/neo4j/logs:/logs
- ./data/neo4j/config:/config
- ./data/neo4j/data:/data
- ./data/neo4j/plugins:/plugins
- /$HOME/neo4j/logs:/logs
- /$HOME/neo4j/config:/config
- /$HOME/neo4j/data:/data
- /$HOME/neo4j/plugins:/plugins
ipc: host
environment:
- no_proxy=${no_proxy}
- http_proxy=${http_proxy}
- https_proxy=${https_proxy}
- NEO4J_AUTH=${NEO4J_USERNAME}/${NEO4J_PASSWORD}
- NEO4J_PLUGINS=["apoc"]
- NEO4J_apoc_export_file_enabled=true
- NEO4J_apoc_import_file_enabled=true
- NEO4J_apoc_import_file_use__neo4j__config=true
- NEO4J_dbms_security_procedures_unrestricted=apoc.\*
- NEO4J_server_bolt_advertised__address=localhost:${NEO4J_PORT2}
restart: always
healthcheck:
test: wget http://localhost:7474 || exit 1
interval: 5s
timeout: 10s
retries: 20
start_period: 3s
tei-embedding-serving:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
container_name: tei-embedding-serving
entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
ports:
- "${TEI_EMBEDDER_PORT:-12000}:80"
- "7474:7474"
- "7687:7687"
restart: always
tei-embedding-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
container_name: tei-embedding-server
ports:
- "6006:80"
volumes:
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
NO_PROXY: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
host_ip: ${host_ip}
HF_TOKEN: ${HF_TOKEN}
healthcheck:
test: ["CMD", "curl", "-f", "http://${host_ip}:${TEI_EMBEDDER_PORT}/health"]
interval: 10s
timeout: 6s
retries: 48
tgi-gaudi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ipc: host
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
tgi-gaudi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: tgi-gaudi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
- "6005:80"
volumes:
- "${DATA_PATH:-./data}:/data"
- "./data:/data"
environment:
no_proxy: ${no_proxy}
NO_PROXY: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HF_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
@@ -73,44 +60,63 @@ services:
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS:-2048}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS:-4096}
TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN: false
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --max-input-length 6000 --max-total-tokens 8192
dataprep-neo4j-llamaindex:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-neo4j-llamaindex
container_name: dataprep-neo4j-server
depends_on:
neo4j-apoc:
condition: service_healthy
tgi-gaudi-server:
condition: service_healthy
tei-embedding-serving:
condition: service_healthy
- neo4j-apoc
- tgi-gaudi-service
- tei-embedding-service
ports:
- "${DATAPREP_PORT:-11103}:5000"
- "6004:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
host_ip: ${host_ip}
DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_NEO4J_LLAMAINDEX"
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HF_TOKEN}
NEO4J_URL: ${NEO4J_URL}
NEO4J_USERNAME: ${NEO4J_USERNAME}
NEO4J_PASSWORD: ${NEO4J_PASSWORD}
DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_NEO4J_LLAMAINDEX"
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
OPENAI_API_KEY: ${OPENAI_API_KEY}
OPENAI_EMBEDDING_MODEL: ${OPENAI_EMBEDDING_MODEL}
OPENAI_LLM_MODEL: ${OPENAI_LLM_MODEL}
EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
LLM_MODEL_ID: ${LLM_MODEL_ID}
MAX_OUTPUT_TOKENS: ${MAX_OUTPUT_TOKENS}
LOGFLAG: ${LOGFLAG}
restart: unless-stopped
retriever-neo4j-llamaindex:
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
container_name: retriever-neo4j-server
depends_on:
- neo4j-apoc
- tgi-gaudi-service
- tei-embedding-service
ports:
- "7000:7000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
host_ip: ${host_ip}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HF_TOKEN}
NEO4J_URI: ${NEO4J_URL}
NEO4J_USERNAME: ${NEO4J_USERNAME}
NEO4J_PASSWORD: ${NEO4J_PASSWORD}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -118,61 +124,29 @@ services:
OPENAI_LLM_MODEL: ${OPENAI_LLM_MODEL}
EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
LLM_MODEL_ID: ${LLM_MODEL_ID}
MAX_OUTPUT_TOKENS: ${MAX_OUTPUT_TOKENS}
LOGFLAG: ${LOGFLAG}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
HF_TOKEN: ${HF_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS:-4096}
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_NEO4J"
restart: unless-stopped
retriever-neo4j:
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
container_name: retriever-neo4j
ports:
- "${RETRIEVER_PORT:-7000}:7000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
LOGFLAG: ${LOGFLAG:-False}
RETRIEVER_COMPONENT_NAME: ${RETRIEVER_COMPONENT_NAME:-OPEA_RETRIEVER_NEO4J}
TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
LLM_MODEL_ID: ${LLM_MODEL_ID}
NEO4J_URI: ${NEO4J_URI}
NEO4J_URL: ${NEO4J_URI}
NEO4J_USERNAME: ${NEO4J_USERNAME}
NEO4J_PASSWORD: ${NEO4J_PASSWORD}
VDMS_USE_CLIP: 0
host_ip: ${host_ip}
depends_on:
neo4j-apoc:
condition: service_healthy
tei-embedding-serving:
condition: service_healthy
tgi-gaudi-server:
condition: service_healthy
graphrag-gaudi-backend-server:
image: ${REGISTRY:-opea}/graphrag:${TAG:-latest}
container_name: graphrag-gaudi-backend-server
depends_on:
- neo4j-apoc
- tei-embedding-serving
- retriever-neo4j
- tgi-gaudi-server
- tei-embedding-service
- retriever-neo4j-llamaindex
- tgi-gaudi-service
ports:
- "8888:8888"
- "${MEGA_SERVICE_PORT:-8888}:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=graphrag-gaudi-backend-server
- RETRIEVER_SERVICE_HOST_IP=retriever-neo4j
- RETRIEVER_SERVICE_HOST_IP=retriever-neo4j-llamaindex
- RETRIEVER_SERVICE_PORT=7000
- LLM_SERVER_HOST_IP=tgi-gaudi-server
- LLM_SERVER_PORT=80
- LLM_SERVER_HOST_IP=tgi-gaudi-service
- LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- LOGFLAG=${LOGFLAG}
ipc: host

View File

@@ -10,25 +10,16 @@ pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
export TEI_EMBEDDER_PORT=11633
export LLM_ENDPOINT_PORT=11634
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export OPENAI_EMBEDDING_MODEL="text-embedding-3-small"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
export OPENAI_LLM_MODEL="gpt-4o"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
export TGI_LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export NEO4J_PORT1=11631
export NEO4J_PORT2=11632
export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
export NEO4J_USERNAME="neo4j"
export NEO4J_PASSWORD="neo4jtest"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
export TGI_LLM_ENDPOINT="http://${host_ip}:6005"
export NEO4J_URL="bolt://${host_ip}:7687"
export NEO4J_USERNAME=neo4j
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/ingest"
export LOGFLAG=True
export MAX_INPUT_TOKENS=4096
export MAX_TOTAL_TOKENS=8192
export DATA_PATH="/mnt/nvme2n1/hf_cache"
export DATAPREP_PORT=11103
export RETRIEVER_PORT=11635
export RETRIEVER_SERVICE_PORT=80
export LLM_SERVER_PORT=80
export MAX_OUTPUT_TOKENS=1024
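With these values exported, the stack can be brought up from the same directory (a sketch, assuming this script is saved as `set_env.sh` next to the compose file):
```bash
source ./set_env.sh
docker compose up -d
```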

View File

@@ -12,7 +12,7 @@ export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
export host_ip=$(hostname -I | awk '{print $1}')
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
@@ -33,38 +33,25 @@ function build_docker_images() {
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export TEI_EMBEDDER_PORT=11633
export LLM_ENDPOINT_PORT=11634
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export OPENAI_EMBEDDING_MODEL="text-embedding-3-small"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
export OPENAI_LLM_MODEL="gpt-4o"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
export TGI_LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export NEO4J_PORT1=11631
export NEO4J_PORT2=11632
export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
export NEO4J_USERNAME="neo4j"
export NEO4J_PASSWORD="neo4jtest"
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/ingest"
export LOGFLAG=True
export MAX_INPUT_TOKENS=4096
export MAX_TOTAL_TOKENS=8192
export DATAPREP_PORT=11103
export RETRIEVER_PORT=11635
export MEGA_SERVICE_PORT=8888
export NEO4J_URL="bolt://${ip_address}:7687"
export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6006"
export TGI_LLM_ENDPOINT="http://${ip_address}:6005"
export host_ip=${ip_address}
export LOGFLAG=true
export MAX_OUTPUT_TOKENS="1024"
unset OPENAI_API_KEY
# Start Docker Containers
@@ -129,7 +116,7 @@ function validate_microservices() {
# validate neo4j-apoc
validate_service \
"${host_ip}:${NEO4J_PORT1}" \
"${ip_address}:7474" \
"200 OK" \
"neo4j-apoc" \
"neo4j-apoc" \
@@ -137,46 +124,45 @@ function validate_microservices() {
# tei for embedding service
validate_service \
"${host_ip}:${TEI_EMBEDDER_PORT}/embed" \
"${ip_address}:6006/embed" \
"[[" \
"tei-embedding-service" \
"tei-embedding-serving" \
"tei-embedding-server" \
'{"inputs":"What is Deep Learning?"}'
sleep 1m # the retriever cannot be queried immediately; wait a little longer
# tgi for llm service
validate_service \
"${host_ip}:${LLM_ENDPOINT_PORT}/generate" \
"generated_text" \
"tgi-gaudi-service" \
"tgi-gaudi-server" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# test /v1/dataprep/ingest graph extraction
echo "Like many companies in the O&G sector, the stock of Chevron (NYSE:CVX) has declined about 10% over the past 90-days despite the fact that Q2 consensus earnings estimates have risen sharply (~25%) during that same time frame. Over the years, Chevron has kept a very strong balance sheet. FirstEnergy (NYSE:FE Get Rating) posted its earnings results on Tuesday. The utilities provider reported $0.53 earnings per share for the quarter, topping the consensus estimate of $0.52 by $0.01, RTT News reports. FirstEnergy had a net margin of 10.85% and a return on equity of 17.17%. The Dáil was almost suspended on Thursday afternoon after Sinn Féin TD John Brady walked across the chamber and placed an on-call pager in front of the Minister for Housing Darragh OBrien during a debate on retained firefighters. Mr OBrien said Mr Brady had taken part in an act of theatre that was obviously choreographed.Around 2,000 retained firefighters around the country staged a second day of industrial action on Tuesday and are due to start all out-strike action from next Tuesday. The mostly part-time workers, who keep the services going outside of Irelands larger urban centres, are taking industrial action in a dispute over pay and working conditions. Speaking in the Dáil, Sinn Féin deputy leader Pearse Doherty said firefighters had marched on Leinster House today and were very angry at the fact the Government will not intervene. Reintroduction of tax relief on mortgages needs to be considered, OBrien says. Martin withdraws comment after saying People Before Profit would put the jackboot on people Taoiseach propagated fears farmers forced to rewet land due to nature restoration law Cairns An intervention is required now. Im asking you to make an improved offer in relation to pay for retained firefighters, Mr Doherty told the housing minister.Im also asking you, and challenging you, to go outside after this Order of Business and meet with the firefighters because they are just fed up to the hilt in relation to what you said.Some of them have handed in their pagers to members of the Opposition and have challenged you to wear the pager for the next number of weeks, put up with an €8,600 retainer and not leave your community for the two and a half kilometres and see how you can stand over those type of pay and conditions. At this point, Mr Brady got up from his seat, walked across the chamber and placed the pager on the desk in front of Mr OBrien. Ceann Comhairle Seán Ó Fearghaíl said the Sinn Féin TD was completely out of order and told him not to carry out a charade in this House, adding it was absolutely outrageous behaviour and not to be encouraged.Mr OBrien said Mr Brady had engaged in an act of theatre here today which was obviously choreographed and was then interrupted with shouts from the Opposition benches. Mr Ó Fearghaíl said he would suspend the House if this racket continues.Mr OBrien later said he said he was confident the dispute could be resolved and he had immense regard for firefighters. The minister said he would encourage the unions to re-engage with the States industrial relations process while also accusing Sinn Féin of using the issue for their own political gain." > $LOG_PATH/dataprep_file.txt
validate_service \
"http://${host_ip}:${DATAPREP_PORT}/v1/dataprep/ingest" \
"http://${ip_address}:6004/v1/dataprep/ingest" \
"Data preparation succeeded" \
"extract_graph_neo4j" \
"dataprep-neo4j-llamaindex"
"dataprep-neo4j-server"
sleep 2m
# retrieval microservice
validate_service \
"${host_ip}:${RETRIEVER_PORT}/v1/retrieval" \
"documents" \
"${ip_address}:7000/v1/retrieval" \
"retrieved_docs" \
"retriever_community_answers_neo4j" \
"retriever-neo4j" \
"{\"messages\": [{\"role\": \"user\",\"content\": \"Who is John Brady and has he had any confrontations?\"}]}"
"retriever-neo4j-server" \
"{\"model\": \"gpt-4o-mini\",\"messages\": [{\"role\": \"user\",\"content\": \"Who is John Brady and has he had any confrontations?\"}]}"
}
# tgi for llm service
validate_service \
"${ip_address}:6005/generate" \
"generated_text" \
"tgi-gaudi-service" \
"tgi-gaudi-server" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${host_ip}:${MEGA_SERVICE_PORT}/v1/graphrag" \
"${ip_address}:8888/v1/graphrag" \
"data: " \
"graphrag-megaservice" \
"graphrag-gaudi-backend-server" \
@@ -195,7 +181,7 @@ function validate_frontend() {
fi
source activate ${conda_env_name}
sed -i "s/localhost/$host_ip/g" playwright.config.ts
sed -i "s/localhost/$ip_address/g" playwright.config.ts
conda install -c conda-forge nodejs=22.6.0 -y
npm install && npm ci && npx playwright install --with-deps

View File

@@ -1,23 +1,21 @@
# Finetuning
# Instruction Tuning
This example includes instruction tuning and rerank model finetuning. Instruction tuning is the process of further training LLMs on a dataset consisting of (instruction, output) pairs in a supervised fashion, which bridges the gap between the next-word prediction objective of LLMs and the users' objective of having LLMs adhere to human instructions. Rerank model finetuning is the process of further training a rerank model on a dataset to improve its capability in a specific field. The implementation of this example deploys a Ray cluster for the task.
Instruction tuning is the process of further training LLMs on a dataset consisting of (instruction, output) pairs in a supervised fashion, which bridges the gap between the next-word prediction objective of LLMs and the users' objective of having LLMs adhere to human instructions. This implementation deploys a Ray cluster for the task.
## Deploy Finetuning Service
## Deploy Instruction Tuning Service
### Deploy Finetuning Service on Xeon
### Deploy Instruction Tuning Service on Xeon
Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for detail.
### Deploy Finetuning Service on Gaudi
### Deploy Instruction Tuning Service on Gaudi
Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) for detail.
## Consume Finetuning Service
## Consume Instruction Tuning Service
### 1. Upload a training file
#### Instruction tuning dataset example
Download a training file `alpaca_data.json` and upload it to the server with the command below; the file can be downloaded from [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json):
```bash
@@ -25,19 +23,8 @@ Download a training file `alpaca_data.json` and upload it to the server with bel
curl http://${your_ip}:8015/v1/files -X POST -H "Content-Type: multipart/form-data" -F "file=@./alpaca_data.json" -F purpose="fine-tune"
```
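For reference, each record in an Alpaca-style dataset is an instruction/input/output triple, roughly like the following (illustrative values, not taken from the actual file):
```bash
# print one alpaca-style record (illustrative)
cat <<'EOF'
{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1. Eat a balanced diet. 2. Exercise regularly. 3. Get enough sleep."}
EOF
```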
#### Rerank model finetuning dataset example
Download a toy example training file `toy_finetune_data.jsonl` and upload it to the server with the command below; the file can be downloaded from [here](https://github.com/FlagOpen/FlagEmbedding/blob/JUNJIE99-patch-1/examples/finetune/toy_finetune_data.jsonl):
```bash
# upload a training file
curl http://${your_ip}:8015/v1/files -X POST -H "Content-Type: multipart/form-data" -F "file=@./toy_finetune_data.jsonl" -F purpose="fine-tune"
```
### 2. Create fine-tuning job
#### Instruction tuning
After a training file like `alpaca_data.json` is uploaded, use the following command to launch a finetuning job using `meta-llama/Llama-2-7b-chat-hf` as base model:
```bash
@@ -53,25 +40,6 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \
The outputs of the finetune job (adapter_model.safetensors, adapter_config.json, ...) are stored in `/home/user/comps/finetuning/src/output` and other execution logs are stored in `/home/user/ray_results`.
#### Rerank model finetuning
After a training file `toy_finetune_data.jsonl` is uploaded, use the following command to launch a finetuning job using `BAAI/bge-reranker-large` as base model:
```bash
# create a finetuning job
curl http://${your_ip}:8015/v1/fine_tuning/jobs \
-X POST \
-H "Content-Type: application/json" \
-d '{
"training_file": "toy_finetune_data.jsonl",
"model": "BAAI/bge-reranker-large",
"General":{
"task":"rerank",
"lora_config":null
}
}'
```
### 3. Manage fine-tuning job
The commands below show how to list finetuning jobs, retrieve or cancel a specific job, and list a job's checkpoints.

View File

@@ -1,6 +1,6 @@
# Deploy Finetuning Service on Xeon
# Deploy Instruction Tuning Service on Xeon
This document outlines the deployment process for a finetuning service utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice on an Intel Xeon server. The steps include Docker image creation and container deployment. We will publish the Docker images to Docker Hub to simplify the deployment process for this service.
This document outlines the deployment process for an Instruction Tuning Service utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice on an Intel Xeon server. The steps include Docker image creation and container deployment. We will publish the Docker images to Docker Hub to simplify the deployment process for this service.
## 🚀 Build Docker Images

View File

@@ -1,6 +1,6 @@
# Deploy Finetuning Service on Gaudi
# Deploy Instruction Tuning Service on Gaudi
This document outlines the deployment process for a finetuning service utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice on an Intel Gaudi server. The steps include Docker image creation and container deployment. We will publish the Docker images to Docker Hub to simplify the deployment process for this service.
This document outlines the deployment process for an Instruction Tuning Service utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice on an Intel Gaudi server. The steps include Docker image creation and container deployment. We will publish the Docker images to Docker Hub to simplify the deployment process for this service.
## 🚀 Build Docker Images

View File

@@ -0,0 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
finetuning:
build:
args:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ${no_proxy}
context: GenAIComps
dockerfile: comps/finetuning/src/Dockerfile
image: ${REGISTRY:-opea}/finetuning:${TAG:-latest}
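This file is consumed the same way the CI test scripts use it (a sketch; run from the `docker_image_build` directory, which must contain a `GenAIComps` checkout):
```bash
git clone --depth 1 https://github.com/opea-project/GenAIComps.git
docker compose -f build.yaml build finetuning --no-cache
```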

View File

@@ -2,13 +2,13 @@
## Introduction
GenAIExamples are designed to give developers an easy entry into generative AI, featuring microservice-based samples that simplify the processes of deploying, testing, and scaling GenAI applications. All examples are fully compatible with both Docker and Kubernetes, supporting a wide range of hardware platforms such as Gaudi, Xeon, NVIDIA GPUs, and other hardware including AMD GPUs, ensuring flexibility and efficiency for your GenAI adoption.
GenAIExamples are designed to give developers an easy entry into generative AI, featuring microservice-based samples that simplify the processes of deploying, testing, and scaling GenAI applications. All examples are fully compatible with Docker and Kubernetes, supporting a wide range of hardware platforms such as Gaudi, Xeon, and NVIDIA GPUs, and other hardware, ensuring flexibility and efficiency for your GenAI adoption.
## Architecture
[GenAIComps](https://github.com/opea-project/GenAIComps) is a service-based tool that includes microservice components such as llm, embedding, reranking, and so on. Using these components, various examples in GenAIExample can be constructed including ChatQnA, DocSum, etc.
[GenAIComps](https://github.com/opea-project/GenAIComps) is a service-based tool that includes microservice components such as llm, embedding, reranking, and so on. Using these components, various examples in GenAIExample can be constructed, including ChatQnA, DocSum, etc.
[GenAIInfra](https://github.com/opea-project/GenAIInfra) is part of the OPEA containerization and cloud-native suite and enables quick and efficient deployment of GenAIExamples in the cloud.
[GenAIInfra](https://github.com/opea-project/GenAIInfra), part of the OPEA containerization and cloud-native suite, enables quick and efficient deployment of GenAIExamples in the cloud.
[GenAIEval](https://github.com/opea-project/GenAIEval) measures service performance metrics such as throughput, latency, and accuracy for GenAIExamples. This feature helps users compare performance across various hardware configurations easily.
@@ -18,18 +18,18 @@ The GenAIExamples [documentation](https://opea-project.github.io/latest/examples
## Getting Started
GenAIExamples offers flexible deployment options that cater to different user needs, enabling efficient use and deployment in various environments. Three primary methods are presently used to do this: Python startup, Docker Compose, and Kubernetes.
GenAIExamples offers flexible deployment options that cater to different user needs, enabling efficient use and deployment in various environments. Here's a brief overview of the three primary methods: Python startup, Docker Compose, and Kubernetes.
Users can choose the most suitable approach based on ease of setup, scalability needs, and the environment in which they are operating.
### Deployment Guide
Deployment is based on released docker images by default - check [docker image list](./docker_images_list.md) for detailed information. You can also build your own images following instructions.
Deployment is based on released docker images by default, check [docker image list](./docker_images_list.md) for detailed information. You can also build your own images following instructions.
#### Prerequisite
- For Docker Compose-based deployment, you should have docker compose installed. Refer to [docker compose install](https://docs.docker.com/compose/install/) for more information.
- For Kubernetes-based deployment, you can use [Helm](https://helm.sh) or [GMC](https://github.com/opea-project/GenAIInfra/tree/main/microservices-connector/README.md)-based deployment.
- For Docker Compose based deployment, you should have docker compose installed. Refer to [docker compose install](https://docs.docker.com/compose/install/).
- For Kubernetes based deployment, you can use [Helm](https://helm.sh) or [GMC](https://github.com/opea-project/GenAIInfra/tree/main/microservices-connector/README.md) based deployment.
- You should have a kubernetes cluster ready for use. If not, you can refer to [k8s install](https://github.com/opea-project/docs/tree/main/guide/installation/k8s_install/README.md) to deploy one.
- (Optional) You should have Helm (version >= 3.15) installed if you want to deploy with Helm Charts. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
@@ -37,7 +37,7 @@ Deployment is based on released docker images by default - check [docker image l
- Recommended Hardware Reference
Based on different deployment model sizes and performance requirements, you may choose different hardware platforms or cloud instances. Here are some of the reference platforms:
Based on different deployment model sizes and performance requirements, you may choose different hardware platforms or cloud instances. Here are some reference platforms:
| Use Case | Deployment model | Reference Configuration | Hardware access/instances |
| -------- | ------------------------- | -------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
@@ -47,7 +47,7 @@ Deployment is based on released docker images by default - check [docker image l
#### Deploy Examples
> **Note**: Check for [sample guides](https://opea-project.github.io/latest/examples/index.html) first for your use case. If it is not available, then refer to the table below:
> **Note**: Check for [sample guides](https://opea-project.github.io/latest/examples/index.html) first for your use case. If it is not available, then refer to the table below.
| Use Case | Docker Compose<br/>Deployment on Xeon | Docker Compose<br/>Deployment on Gaudi | Docker Compose<br/>Deployment on ROCm | Kubernetes with Helm Charts | Kubernetes with GMC |
| ----------------- | ------------------------------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------------------------------- | ------------------------------------------------------------ |

View File

@@ -0,0 +1,61 @@
# Rerank Model Finetuning
Rerank model finetuning is the process of further training a rerank model on a dataset to improve its capability in a specific field.
## Deploy Rerank Model Finetuning Service
### Deploy Rerank Model Finetuning Service on Xeon
Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for detail.
### Deploy Rerank Model Finetuning Service on Gaudi
Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) for detail.
## Consume Rerank Model Finetuning Service
### 1. Upload a training file
Download a toy example training file `toy_finetune_data.jsonl` and upload it to the server with the command below; the file can be downloaded from [here](https://github.com/FlagOpen/FlagEmbedding/blob/master/examples/finetune/toy_finetune_data.jsonl):
```bash
# upload a training file
curl http://${your_ip}:8015/v1/files -X POST -H "Content-Type: multipart/form-data" -F "file=@./toy_finetune_data.jsonl" -F purpose="fine-tune"
```
### 2. Create fine-tuning job
After a training file `toy_finetune_data.jsonl` is uploaded, use the following command to launch a finetuning job using `BAAI/bge-reranker-large` as base model:
```bash
# create a finetuning job
curl http://${your_ip}:8015/v1/fine_tuning/jobs \
-X POST \
-H "Content-Type: application/json" \
-d '{
"training_file": "toy_finetune_data.jsonl",
"model": "BAAI/bge-reranker-large",
"General":{
"task":"rerank",
"lora_config":null
}
}'
```
### 3. Manage fine-tuning job
The commands below show how to list finetuning jobs, retrieve or cancel a specific job, and list a job's checkpoints.
```bash
# list finetuning jobs
curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET
# retrieve one finetuning job
curl http://${your_ip}:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'${fine_tuning_job_id}'"}'
# cancel one finetuning job
curl http://${your_ip}:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'${fine_tuning_job_id}'"}'
# list checkpoints of a finetuning job
curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'${fine_tuning_job_id}'"}'
```
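Each of these calls needs the `fine_tuning_job_id` returned when the job was created; it can be captured with `jq` (a sketch, assuming `jq` is installed):
```bash
# create a job and keep its id for the management commands above
fine_tuning_job_id=$(curl -s http://${your_ip}:8015/v1/fine_tuning/jobs \
-X POST \
-H "Content-Type: application/json" \
-d '{"training_file": "toy_finetune_data.jsonl", "model": "BAAI/bge-reranker-large", "General": {"task": "rerank", "lora_config": null}}' | jq -r '.id')
```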

View File

@@ -0,0 +1,26 @@
# Deploy Rerank Model Finetuning Service on Xeon
This document outlines the deployment process for a rerank model finetuning service utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice on an Intel Xeon server. The steps include Docker image creation and container deployment. We will publish the Docker images to Docker Hub to simplify the deployment process for this service.
## 🚀 Build Docker Images
First of all, you need to build the Docker images locally. This step can be skipped once the Docker images are published to Docker Hub.
### 1. Build Docker Image
Build the Docker image with the command below:
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
export HF_TOKEN=${your_huggingface_token}
docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning/src/Dockerfile .
```
### 2. Run Docker with CLI
Start the Docker container with the command below:
```bash
docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
```
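Once the container is running, the service can be smoke-tested by listing finetuning jobs (initially an empty list):
```bash
curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET
```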

View File

@@ -0,0 +1,26 @@
# Deploy Rerank Model Finetuning Service on Gaudi
This document outlines the deployment process for a rerank model finetuning service utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice on an Intel Gaudi server. The steps include Docker image creation and container deployment. We will publish the Docker images to Docker Hub to simplify the deployment process for this service.
## 🚀 Build Docker Images
First of all, you need to build the Docker images locally. This step can be skipped once the Docker images are published to Docker Hub.
### 1. Build Docker Image
Build the Docker image with the command below:
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
docker build -t opea/finetuning-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/finetuning/src/Dockerfile.intel_hpu .
```
### 2. Run Docker with CLI
Start the Docker container with the command below:
```bash
export HF_TOKEN=${your_huggingface_token}
docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
```

View File

@@ -0,0 +1,131 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -x
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
finetuning_service_port=8015
ray_port=8265
service_name=finetuning-gaudi
function build_docker_images() {
cd $WORKPATH/docker_image_build
if [ ! -d "GenAIComps" ] ; then
git clone --depth 1 --branch ${opea_branch:-"main"} https://github.com/opea-project/GenAIComps.git
fi
docker compose -f build.yaml build ${service_name} --no-cache > ${LOG_PATH}/docker_image_build.log
}
function start_service() {
export no_proxy="localhost,127.0.0.1,"${ip_address}
docker run -d --name="finetuning-server" -p $finetuning_service_port:$finetuning_service_port -p $ray_port:$ray_port --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy ${IMAGE_REPO}/finetuning-gaudi:${IMAGE_TAG}
sleep 1m
}
function validate_microservice() {
cd $LOG_PATH
export no_proxy="localhost,127.0.0.1,"${ip_address}
# test /v1/dataprep upload file
URL="http://${ip_address}:$finetuning_service_port/v1/files"
cat <<EOF > test_data.json
{"query": "Five women walk along a beach wearing flip-flops.", "pos": ["Some women with flip-flops on, are walking along the beach"], "neg": ["The 4 women are sitting on the beach.", "There was a reform in 1996.", "She's not going to court to clear her record.", "The man is talking about hawaii.", "A woman is standing outside.", "The battle was over. ", "A group of people plays volleyball."]}
{"query": "A woman standing on a high cliff on one leg looking over a river.", "pos": ["A woman is standing on a cliff."], "neg": ["A woman sits on a chair.", "George Bush told the Republicans there was no way he would let them even consider this foolish idea, against his top advisors advice.", "The family was falling apart.", "no one showed up to the meeting", "A boy is sitting outside playing in the sand.", "Ended as soon as I received the wire.", "A child is reading in her bedroom."]}
{"query": "Two woman are playing instruments; one a clarinet, the other a violin.", "pos": ["Some people are playing a tune."], "neg": ["Two women are playing a guitar and drums.", "A man is skiing down a mountain.", "The fatal dose was not taken when the murderer thought it would be.", "Person on bike", "The girl is standing, leaning against the archway.", "A group of women watch soap operas.", "No matter how old people get they never forget. "]}
{"query": "A girl with a blue tank top sitting watching three dogs.", "pos": ["A girl is wearing blue."], "neg": ["A girl is with three cats.", "The people are watching a funeral procession.", "The child is wearing black.", "Financing is an issue for us in public schools.", "Kids at a pool.", "It is calming to be assaulted.", "I face a serious problem at eighteen years old. "]}
{"query": "A yellow dog running along a forest path.", "pos": ["a dog is running"], "neg": ["a cat is running", "Steele did not keep her original story.", "The rule discourages people to pay their child support.", "A man in a vest sits in a car.", "Person in black clothing, with white bandanna and sunglasses waits at a bus stop.", "Neither the Globe or Mail had comments on the current state of Canada's road system. ", "The Spring Creek facility is old and outdated."]}
{"query": "It sets out essential activities in each phase along with critical factors related to those activities.", "pos": ["Critical factors for essential activities are set out."], "neg": ["It lays out critical activities but makes no provision for critical factors related to those activities.", "People are assembled in protest.", "The state would prefer for you to do that.", "A girl sits beside a boy.", "Two males are performing.", "Nobody is jumping", "Conrad was being plotted against, to be hit on the head."]}
EOF
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'file=@./test_data.json' -F purpose="fine-tune" -H 'Content-Type: multipart/form-data' "$URL")
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
SERVICE_NAME="finetuning-server - upload - file"
# Parse the JSON response
purpose=$(echo "$RESPONSE_BODY" | jq -r '.purpose')
filename=$(echo "$RESPONSE_BODY" | jq -r '.filename')
# Define expected values
expected_purpose="fine-tune"
expected_filename="test_data.json"
if [ "$HTTP_STATUS" -ne "200" ]; then
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_upload_file.log
exit 1
else
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
fi
# Check if the parsed values match the expected values
if [[ "$purpose" != "$expected_purpose" || "$filename" != "$expected_filename" ]]; then
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_upload_file.log
exit 1
else
echo "[ $SERVICE_NAME ] Content is as expected."
fi
# test /v1/fine_tuning/jobs
URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs"
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_data.json","model": "BAAI/bge-reranker-base","General":{"task":"rerank","lora_config":null}}' "$URL")
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
SERVICE_NAME="finetuning-server - create finetuning job"
if [ "$HTTP_STATUS" -ne "200" ]; then
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
exit 1
else
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
fi
if [[ "$RESPONSE_BODY" != *'{"id":"ft-job'* ]]; then
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
exit 1
else
echo "[ $SERVICE_NAME ] Content is as expected."
fi
sleep 3m
docker logs finetuning-server 2>&1 | tee ${LOG_PATH}/finetuning-server_create.log
FINETUNING_LOG=$(grep "succeeded" ${LOG_PATH}/finetuning-server_create.log)
if [[ "$FINETUNING_LOG" != *'succeeded'* ]]; then
echo "Finetuning failed."
RAY_JOBID=$(grep "Submitted Ray job" ${LOG_PATH}/finetuning-server_create.log | sed 's/.*raysubmit/raysubmit/' | cut -d' ' -f 1)
docker exec finetuning-server python -c "import os;os.environ['RAY_ADDRESS']='http://localhost:8265';from ray.job_submission import JobSubmissionClient;client = JobSubmissionClient();print(client.get_job_logs('${RAY_JOBID}'))" 2>&1 | tee ${LOG_PATH}/finetuning.log
exit 1
else
echo "Finetuning succeeded."
fi
}
function stop_docker() {
cid=$(docker ps -aq --filter "name=finetuning-server*")
if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}
function main() {
stop_docker
build_docker_images
start_service
validate_microservice
stop_docker
echo y | docker system prune
}
main

View File

@@ -0,0 +1,131 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -x
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
finetuning_service_port=8015
ray_port=8265
service_name=finetuning
function build_docker_images() {
cd $WORKPATH/docker_image_build
if [ ! -d "GenAIComps" ] ; then
git clone --depth 1 --branch ${opea_branch:-"main"} https://github.com/opea-project/GenAIComps.git
fi
docker compose -f build.yaml build ${service_name} --no-cache > ${LOG_PATH}/docker_image_build.log
}
function start_service() {
export no_proxy="localhost,127.0.0.1,"${ip_address}
docker run -d --name="finetuning-server" -p $finetuning_service_port:$finetuning_service_port -p $ray_port:$ray_port --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy ${IMAGE_REPO}/finetuning:${IMAGE_TAG}
sleep 1m
}
function validate_microservice() {
cd $LOG_PATH
export no_proxy="localhost,127.0.0.1,"${ip_address}
# test /v1/dataprep upload file
URL="http://${ip_address}:$finetuning_service_port/v1/files"
cat <<EOF > test_data.json
{"query": "Five women walk along a beach wearing flip-flops.", "pos": ["Some women with flip-flops on, are walking along the beach"], "neg": ["The 4 women are sitting on the beach.", "There was a reform in 1996.", "She's not going to court to clear her record.", "The man is talking about hawaii.", "A woman is standing outside.", "The battle was over. ", "A group of people plays volleyball."]}
{"query": "A woman standing on a high cliff on one leg looking over a river.", "pos": ["A woman is standing on a cliff."], "neg": ["A woman sits on a chair.", "George Bush told the Republicans there was no way he would let them even consider this foolish idea, against his top advisors advice.", "The family was falling apart.", "no one showed up to the meeting", "A boy is sitting outside playing in the sand.", "Ended as soon as I received the wire.", "A child is reading in her bedroom."]}
{"query": "Two woman are playing instruments; one a clarinet, the other a violin.", "pos": ["Some people are playing a tune."], "neg": ["Two women are playing a guitar and drums.", "A man is skiing down a mountain.", "The fatal dose was not taken when the murderer thought it would be.", "Person on bike", "The girl is standing, leaning against the archway.", "A group of women watch soap operas.", "No matter how old people get they never forget. "]}
{"query": "A girl with a blue tank top sitting watching three dogs.", "pos": ["A girl is wearing blue."], "neg": ["A girl is with three cats.", "The people are watching a funeral procession.", "The child is wearing black.", "Financing is an issue for us in public schools.", "Kids at a pool.", "It is calming to be assaulted.", "I face a serious problem at eighteen years old. "]}
{"query": "A yellow dog running along a forest path.", "pos": ["a dog is running"], "neg": ["a cat is running", "Steele did not keep her original story.", "The rule discourages people to pay their child support.", "A man in a vest sits in a car.", "Person in black clothing, with white bandanna and sunglasses waits at a bus stop.", "Neither the Globe or Mail had comments on the current state of Canada's road system. ", "The Spring Creek facility is old and outdated."]}
{"query": "It sets out essential activities in each phase along with critical factors related to those activities.", "pos": ["Critical factors for essential activities are set out."], "neg": ["It lays out critical activities but makes no provision for critical factors related to those activities.", "People are assembled in protest.", "The state would prefer for you to do that.", "A girl sits beside a boy.", "Two males are performing.", "Nobody is jumping", "Conrad was being plotted against, to be hit on the head."]}
EOF
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'file=@./test_data.json' -F purpose="fine-tune" -H 'Content-Type: multipart/form-data' "$URL")
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
SERVICE_NAME="finetuning-server - upload - file"
# Parse the JSON response
purpose=$(echo "$RESPONSE_BODY" | jq -r '.purpose')
filename=$(echo "$RESPONSE_BODY" | jq -r '.filename')
# Define expected values
expected_purpose="fine-tune"
expected_filename="test_data.json"
if [ "$HTTP_STATUS" -ne "200" ]; then
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_upload_file.log
exit 1
else
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
fi
# Check if the parsed values match the expected values
if [[ "$purpose" != "$expected_purpose" || "$filename" != "$expected_filename" ]]; then
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_upload_file.log
exit 1
else
echo "[ $SERVICE_NAME ] Content is as expected."
fi
# test /v1/fine_tuning/jobs
URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs"
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_data.json","model": "BAAI/bge-reranker-base","General":{"task":"rerank","lora_config":null}}' "$URL")
HTTP_STATUS=$(echo "$HTTP_RESPONSE" | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
RESPONSE_BODY=$(echo "$HTTP_RESPONSE" | sed -e 's/HTTPSTATUS:.*//g')
SERVICE_NAME="finetuning-server - create finetuning job"
if [ "$HTTP_STATUS" -ne "200" ]; then
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
exit 1
else
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
fi
if [[ "$RESPONSE_BODY" != *'{"id":"ft-job'* ]]; then
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
exit 1
else
echo "[ $SERVICE_NAME ] Content is as expected."
fi
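# A more robust alternative to the fixed sleep below would be to poll the job
# status. A minimal sketch (assumes the server implements the OpenAI-style
# retrieve endpoint and that the job id has been parsed from RESPONSE_BODY);
# it is defined here but not wired into the flow:
wait_for_finetuning_job() {
    local job_id=$1 deadline=$((SECONDS + 600))
    while [ "$SECONDS" -lt "$deadline" ]; do
        local status
        status=$(curl --silent "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs/${job_id}" | jq -r '.status')
        [ "$status" = "succeeded" ] && return 0
        [ "$status" = "failed" ] && return 1
        sleep 10
    done
    return 1
}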
sleep 3m
docker logs finetuning-server 2>&1 | tee ${LOG_PATH}/finetuning-server_create.log
if ! grep -q "succeeded" ${LOG_PATH}/finetuning-server_create.log; then
    echo "Finetuning failed."
    RAY_JOBID=$(grep "Submitted Ray job" ${LOG_PATH}/finetuning-server_create.log | sed 's/.*raysubmit/raysubmit/' | cut -d' ' -f 1)
    docker exec finetuning-server python -c "import os;os.environ['RAY_ADDRESS']='http://localhost:8265';from ray.job_submission import JobSubmissionClient;client = JobSubmissionClient();print(client.get_job_logs('${RAY_JOBID}'))" 2>&1 | tee ${LOG_PATH}/finetuning.log
    exit 1
else
    echo "Finetuning succeeded."
fi
}
function stop_docker() {
    # The name filter does substring matching, so no glob is needed.
    cid=$(docker ps -aq --filter "name=finetuning-server")
    if [[ -n "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}
function main() {
    stop_docker
    build_docker_images
    start_service
    validate_microservice
    stop_docker
    echo y | docker system prune
}
main

View File

@@ -10,7 +10,7 @@ services:
ports:
- "3001:80"
volumes:
- "${MODEL_PATH:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -56,7 +56,7 @@ services:
ports:
- "3004:80"
volumes:
- "${MODEL_PATH:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -86,7 +86,7 @@ services:
ports:
- "3006:80"
volumes:
- "${MODEL_PATH:-./data}:/data"
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -9,7 +9,7 @@ services:
ports:
- "3001:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -60,7 +60,7 @@ services:
ports:
- "3004:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -96,7 +96,7 @@ services:
ports:
- "3006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -9,7 +9,7 @@ services:
ports:
- "3001:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -67,7 +67,7 @@ services:
ports:
- "3004:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -103,7 +103,7 @@ services:
ports:
- "3006:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_PATH=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -21,7 +21,7 @@ services:
timeout: 10s
retries: 100
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
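
Note: with the ./data fallback removed in this hunk, MODEL_CACHE must be
exported before the stack is brought up; a minimal sketch (the path is
illustrative):

export MODEL_CACHE=$HOME/.cache/huggingface/hub
docker compose up -d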

View File

@@ -30,7 +30,7 @@ services:
- SYS_NICE
ipc: host
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}

View File

@@ -1,8 +1,48 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
COPY ./videoqna.py $HOME/videoqna.py
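
The hunk above replaces the published opea/comps-base base image with inlined
build stages, so the service image no longer depends on BASE_TAG. A minimal
build sketch (the image tag is illustrative):

docker build -t opea/videoqna:latest .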

View File

@@ -8,7 +8,7 @@ services:
ports:
- "8399:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -8,7 +8,7 @@ services:
ports:
- "8399:80"
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}