Compare commits

4 Commits

Author SHA1 Message Date
Letong Han
3c3d0b4d36 [ProductivitySuite] Fix CD Issue (#858)
Signed-off-by: letonghan <letong.han@intel.com>
(cherry picked from commit d55a33dda1)
2024-09-20 16:32:05 +08:00
XinyaoWa
c9001a3912 Fix SearchQnA tests bug (#857)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
(cherry picked from commit daf2a4fad7)
2024-09-20 16:31:49 +08:00
chen, suyue
08fa591ebd print image build test commit (#856)
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
(cherry picked from commit 3ce395582b)
2024-09-20 16:31:46 +08:00
Letong Han
6d4b3d6b0b [Doc] Refine ChatQnA README (#855)
Signed-off-by: letonghan <letong.han@intel.com>
(cherry picked from commit 7eaab93d0b)
2024-09-20 16:31:44 +08:00
101 changed files with 355 additions and 8416 deletions

View File

@@ -64,6 +64,10 @@ jobs:
run: |
cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
if [[ $(grep -c "tei-gaudi:" ${docker_compose_path}) != 0 ]]; then
git clone https://github.com/huggingface/tei-gaudi.git
cd tei-gaudi && git rev-parse HEAD && cd ../
fi
if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
git clone https://github.com/vllm-project/vllm.git
cd vllm && git rev-parse HEAD && cd ../
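
The step above greps build.yaml for optional dependencies (tei-gaudi, vllm) and, when one is referenced, clones the repo and prints its HEAD commit so the build log records exactly which dependency revision went into the image (per commit #856, "print image build test commit"). A minimal standalone sketch of that pattern — the loop form and local paths are assumptions, not the workflow's literal code:

#!/usr/bin/env bash
# Sketch of the commit-logging step above: if build.yaml references an
# optional dependency, clone it and print its HEAD commit for the log.
docker_compose_path=./build.yaml   # assumed; the workflow derives this from inputs
for repo in https://github.com/huggingface/tei-gaudi.git \
            https://github.com/vllm-project/vllm.git; do
  name=$(basename "${repo}" .git)
  if grep -q "${name}:" "${docker_compose_path}"; then
    git clone "${repo}"
    (cd "${name}" && git rev-parse HEAD)   # record the built revision
  fi
done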

View File

@@ -3,7 +3,7 @@
services:
tgi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.4
container_name: tgi-server
ports:
- "8085:80"
@@ -13,16 +13,12 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE
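
One side of this hunk uses HUGGING_FACE_HUB_TOKEN and the other HF_TOKEN, but both are fed from the same host-side HUGGINGFACEHUB_API_TOKEN, so the compose invocation is identical either way. A minimal sketch (service name taken from the file above; the token value is a placeholder):

export HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxx   # placeholder token
docker compose up -d tgi-server               # compose substitutes the token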

View File

@@ -51,7 +51,7 @@ services:
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "3006:80"
@@ -61,15 +61,11 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -25,7 +25,7 @@ The AudioQnA uses the below prebuilt images if you choose a Xeon deployment
Should you want to use the Gaudi accelerator, alternate images are used for the LLM and speech services (see the pull sketch after the list below).
For Gaudi:
- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.5
- tgi-service: ghcr.io/huggingface/tgi-gaudi:1.2.1
- whisper-gaudi: opea/whisper-gaudi:latest
- speecht5-gaudi: opea/speecht5-gaudi:latest
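
For reference, a pre-pull sketch for the Gaudi images named above (a convenience example, not part of the diff; use the tgi-gaudi tag that matches the revision you deploy):

docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5   # or 2.0.1, per this hunk
docker pull opea/whisper-gaudi:latest
docker pull opea/speecht5-gaudi:latest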

View File

@@ -247,7 +247,7 @@ spec:
- envFrom:
- configMapRef:
name: audio-qna-config
image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu"
image: ghcr.io/huggingface/text-generation-inference:2.2.0
name: llm-dependency-deploy-demo
securityContext:
capabilities:

View File

@@ -271,7 +271,7 @@ spec:
- envFrom:
- configMapRef:
name: audio-qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
name: llm-dependency-deploy-demo
securityContext:
capabilities:
@@ -303,14 +303,6 @@ spec:
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
@@ -323,7 +315,7 @@ spec:
volumes:
- name: model-volume
hostPath:
path: /mnt/models
path: /home/sdp/cesg
type: Directory
- name: shm
emptyDir:

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="audioqna whisper-gaudi asr llm-tgi speecht5-gaudi tts"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker images && sleep 1s
}

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="audioqna whisper asr llm-tgi speecht5 tts"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker images && sleep 1s
}

View File

@@ -72,7 +72,7 @@ docker pull opea/chatqna-ui:latest
In the following cases, you can build the Docker image from source yourself (see the sketch after this list).
- Failed to download the docker image.
- Failed to download the docker image. (The essential Docker image `opea/nginx` has not yet been released; users need to build this image first.)
- If you want to use a specific version of the Docker image.
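
A minimal build-from-source sketch for the unreleased opea/nginx image mentioned above (GenAIComps is OPEA's component repository; the Dockerfile path here is an assumption, check the component's README for the exact location):

git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
docker build -t opea/nginx:latest -f comps/nginx/Dockerfile .   # path assumed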

View File

@@ -1,653 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
EMBEDDING_SERVICE_HOST_IP: embedding-svc
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
INDEX_NAME: rag-redis
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_SERVICE_HOST_IP: llm-svc
NODE_SELECTOR: chatqna-opea
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
RERANK_MODEL_ID: BAAI/bge-reranker-base
RERANK_SERVICE_HOST_IP: reranking-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
kind: ConfigMap
metadata:
name: qna-config
namespace: default
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
ports:
- containerPort: 8888
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
ports:
- name: service
nodePort: 30888
port: 8888
targetPort: 8888
selector:
app: chatqna-backend-server-deploy
type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
ports:
- containerPort: 6007
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: dataprep-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: dataprep-svc
namespace: default
spec:
ports:
- name: port1
port: 6007
targetPort: 6007
selector:
app: dataprep-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
imagePullPolicy: IfNotPresent
name: embedding-dependency-deploy
ports:
- containerPort: 80
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: embedding-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 6006
targetPort: 80
selector:
app: embedding-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
ports:
- containerPort: 6000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: embedding-svc
namespace: default
spec:
ports:
- name: service
port: 6000
targetPort: 6000
selector:
app: embedding-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 31
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '2048'
- --max-total-tokens
- '4096'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
securityContext:
capabilities:
add:
- SYS_NICE
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: llm-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 9009
targetPort: 80
selector:
app: llm-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
ports:
- containerPort: 9000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: llm-svc
namespace: default
spec:
ports:
- name: service
port: 9000
targetPort: 9000
selector:
app: llm-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(RERANK_MODEL_ID)
- --auto-truncate
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HF_TOKEN
value: ${HF_TOKEN}
- name: MAX_WARMUP_SEQUENCE_LENGTH
value: '512'
envFrom:
- configMapRef:
name: qna-config
image: opea/tei-gaudi:latest
imagePullPolicy: IfNotPresent
name: reranking-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: reranking-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 8808
targetPort: 80
selector:
app: reranking-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/reranking-tei:latest
imagePullPolicy: IfNotPresent
name: reranking-deploy
ports:
- containerPort: 8000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: reranking-svc
namespace: default
spec:
ports:
- name: service
port: 8000
targetPort: 8000
selector:
app: reranking-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
ports:
- containerPort: 7000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: retriever-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: retriever-svc
namespace: default
spec:
ports:
- name: service
port: 7000
targetPort: 7000
selector:
app: retriever-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: vector-db
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: redis/redis-stack:7.2.0-v9
imagePullPolicy: IfNotPresent
name: vector-db
ports:
- containerPort: 6379
- containerPort: 8001
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: vector-db
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
selector:
app: vector-db
type: ClusterIP
---
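
Since the ConfigMap above interpolates ${HF_TOKEN}, applying the manifest verbatim would submit the literal placeholder; one way to deploy a file like this is to substitute the variable first (a sketch, with a hypothetical filename):

export HF_TOKEN=hf_xxxxxxxx   # placeholder token
# "chatqna-manifest.yaml" is an assumed filename for this sketch
envsubst '${HF_TOKEN}' < chatqna-manifest.yaml | kubectl apply -f -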

View File

@@ -1,653 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
EMBEDDING_SERVICE_HOST_IP: embedding-svc
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
INDEX_NAME: rag-redis
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_SERVICE_HOST_IP: llm-svc
NODE_SELECTOR: chatqna-opea
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
RERANK_MODEL_ID: BAAI/bge-reranker-base
RERANK_SERVICE_HOST_IP: reranking-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
kind: ConfigMap
metadata:
name: qna-config
namespace: default
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
ports:
- containerPort: 8888
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
ports:
- name: service
nodePort: 30888
port: 8888
targetPort: 8888
selector:
app: chatqna-backend-server-deploy
type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
ports:
- containerPort: 6007
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: dataprep-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: dataprep-svc
namespace: default
spec:
ports:
- name: port1
port: 6007
targetPort: 6007
selector:
app: dataprep-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
imagePullPolicy: IfNotPresent
name: embedding-dependency-deploy
ports:
- containerPort: 80
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: embedding-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 6006
targetPort: 80
selector:
app: embedding-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
ports:
- containerPort: 6000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: embedding-svc
namespace: default
spec:
ports:
- name: service
port: 6000
targetPort: 6000
selector:
app: embedding-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 7
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '2048'
- --max-total-tokens
- '4096'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
securityContext:
capabilities:
add:
- SYS_NICE
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: llm-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 9009
targetPort: 80
selector:
app: llm-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
ports:
- containerPort: 9000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: llm-svc
namespace: default
spec:
ports:
- name: service
port: 9000
targetPort: 9000
selector:
app: llm-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(RERANK_MODEL_ID)
- --auto-truncate
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HF_TOKEN
value: ${HF_TOKEN}
- name: MAX_WARMUP_SEQUENCE_LENGTH
value: '512'
envFrom:
- configMapRef:
name: qna-config
image: opea/tei-gaudi:latest
imagePullPolicy: IfNotPresent
name: reranking-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: reranking-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 8808
targetPort: 80
selector:
app: reranking-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/reranking-tei:latest
imagePullPolicy: IfNotPresent
name: reranking-deploy
ports:
- containerPort: 8000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: reranking-svc
namespace: default
spec:
ports:
- name: service
port: 8000
targetPort: 8000
selector:
app: reranking-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
ports:
- containerPort: 7000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: retriever-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: retriever-svc
namespace: default
spec:
ports:
- name: service
port: 7000
targetPort: 7000
selector:
app: retriever-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: vector-db
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: redis/redis-stack:7.2.0-v9
imagePullPolicy: IfNotPresent
name: vector-db
ports:
- containerPort: 6379
- containerPort: 8001
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: vector-db
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
selector:
app: vector-db
type: ClusterIP
---

View File

@@ -1,653 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
EMBEDDING_SERVICE_HOST_IP: embedding-svc
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
INDEX_NAME: rag-redis
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_SERVICE_HOST_IP: llm-svc
NODE_SELECTOR: chatqna-opea
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
RERANK_MODEL_ID: BAAI/bge-reranker-base
RERANK_SERVICE_HOST_IP: reranking-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
kind: ConfigMap
metadata:
name: qna-config
namespace: default
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
ports:
- containerPort: 8888
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
ports:
- name: service
nodePort: 30888
port: 8888
targetPort: 8888
selector:
app: chatqna-backend-server-deploy
type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
ports:
- containerPort: 6007
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: dataprep-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: dataprep-svc
namespace: default
spec:
ports:
- name: port1
port: 6007
targetPort: 6007
selector:
app: dataprep-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
imagePullPolicy: IfNotPresent
name: embedding-dependency-deploy
ports:
- containerPort: 80
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: embedding-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 6006
targetPort: 80
selector:
app: embedding-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
ports:
- containerPort: 6000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: embedding-svc
namespace: default
spec:
ports:
- name: service
port: 6000
targetPort: 6000
selector:
app: embedding-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 15
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '2048'
- --max-total-tokens
- '4096'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
securityContext:
capabilities:
add:
- SYS_NICE
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: llm-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 9009
targetPort: 80
selector:
app: llm-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
ports:
- containerPort: 9000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: llm-svc
namespace: default
spec:
ports:
- name: service
port: 9000
targetPort: 9000
selector:
app: llm-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(RERANK_MODEL_ID)
- --auto-truncate
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HF_TOKEN
value: ${HF_TOKEN}
- name: MAX_WARMUP_SEQUENCE_LENGTH
value: '512'
envFrom:
- configMapRef:
name: qna-config
image: opea/tei-gaudi:latest
imagePullPolicy: IfNotPresent
name: reranking-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: reranking-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 8808
targetPort: 80
selector:
app: reranking-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/reranking-tei:latest
imagePullPolicy: IfNotPresent
name: reranking-deploy
ports:
- containerPort: 8000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: reranking-svc
namespace: default
spec:
ports:
- name: service
port: 8000
targetPort: 8000
selector:
app: reranking-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
ports:
- containerPort: 7000
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: retriever-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: retriever-svc
namespace: default
spec:
ports:
- name: service
port: 7000
targetPort: 7000
selector:
app: retriever-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: vector-db
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: redis/redis-stack:7.2.0-v9
imagePullPolicy: IfNotPresent
name: vector-db
ports:
- containerPort: 6379
- containerPort: 8001
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: vector-db
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
selector:
app: vector-db
type: ClusterIP
---

View File

@@ -1,742 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: qna-config
namespace: default
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
RERANK_MODEL_ID: BAAI/bge-reranker-base
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
INDEX_NAME: rag-redis
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
EMBEDDING_SERVICE_HOST_IP: embedding-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
RERANK_SERVICE_HOST_IP: reranking-svc
NODE_SELECTOR: chatqna-opea
LLM_SERVICE_HOST_IP: llm-svc
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna-without-rerank:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
args: null
ports:
- containerPort: 8888
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
type: NodePort
selector:
app: chatqna-backend-server-deploy
ports:
- name: service
port: 8888
targetPort: 8888
nodePort: 30888
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: dataprep-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
args: null
ports:
- containerPort: 6007
- containerPort: 6008
- containerPort: 6009
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: dataprep-svc
namespace: default
spec:
type: ClusterIP
selector:
app: dataprep-deploy
ports:
- name: port1
port: 6007
targetPort: 6007
- name: port2
port: 6008
targetPort: 6008
- name: port3
port: 6009
targetPort: 6009
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
name: embedding-dependency-deploy
args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: embedding-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-dependency-deploy
ports:
- name: service
port: 6006
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: embedding-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
args: null
ports:
- containerPort: 6000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: embedding-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-deploy
ports:
- name: service
port: 6000
targetPort: 6000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 32
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
name: llm-dependency-deploy-demo
securityContext:
capabilities:
add:
- SYS_NICE
args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '2048'
- --max-total-tokens
- '4096'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: llm-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-dependency-deploy
ports:
- name: service
port: 9009
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: llm-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
args: null
ports:
- containerPort: 9000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: llm-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-deploy
ports:
- name: service
port: 9000
targetPort: 9000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: reranking-dependency-deploy
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tei-gaudi:latest
name: reranking-dependency-deploy
args:
- --model-id
- $(RERANK_MODEL_ID)
- --auto-truncate
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HF_TOKEN
value: ${HF_TOKEN}
- name: MAX_WARMUP_SEQUENCE_LENGTH
value: '512'
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: reranking-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: reranking-dependency-deploy
ports:
- name: service
port: 8808
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: reranking-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/reranking-tei:latest
imagePullPolicy: IfNotPresent
name: reranking-deploy
args: null
ports:
- containerPort: 8000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: reranking-svc
namespace: default
spec:
type: ClusterIP
selector:
app: reranking-deploy
ports:
- name: service
port: 8000
targetPort: 8000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: retriever-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_EMBEDDING_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: HUGGINGFACEHUB_API_TOKEN
valueFrom:
configMapKeyRef:
name: qna-config
key: HUGGINGFACEHUB_API_TOKEN
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
args: null
ports:
- containerPort: 7000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: retriever-svc
namespace: default
spec:
type: ClusterIP
selector:
app: retriever-deploy
ports:
- name: service
port: 7000
targetPort: 7000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
labels:
app: vector-db
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: vector-db
containers:
- name: vector-db
image: redis/redis-stack:7.2.0-v9
ports:
- containerPort: 6379
- containerPort: 8001
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
type: ClusterIP
selector:
app: vector-db
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
---

View File

@@ -1,591 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: qna-config
namespace: default
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
RERANK_MODEL_ID: BAAI/bge-reranker-base
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
INDEX_NAME: rag-redis
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
EMBEDDING_SERVICE_HOST_IP: embedding-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
RERANK_SERVICE_HOST_IP: reranking-svc
NODE_SELECTOR: chatqna-opea
LLM_SERVICE_HOST_IP: llm-svc
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna-without-rerank:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
args: null
ports:
- containerPort: 8888
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
type: NodePort
selector:
app: chatqna-backend-server-deploy
ports:
- name: service
port: 8888
targetPort: 8888
nodePort: 30888
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: dataprep-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
args: null
ports:
- containerPort: 6007
- containerPort: 6008
- containerPort: 6009
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: dataprep-svc
namespace: default
spec:
type: ClusterIP
selector:
app: dataprep-deploy
ports:
- name: port1
port: 6007
targetPort: 6007
- name: port2
port: 6008
targetPort: 6008
- name: port3
port: 6009
targetPort: 6009
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
name: embedding-dependency-deploy
args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: embedding-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-dependency-deploy
ports:
- name: service
port: 6006
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: embedding-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
args: null
ports:
- containerPort: 6000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: embedding-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-deploy
ports:
- name: service
port: 6000
targetPort: 6000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 8
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
name: llm-dependency-deploy-demo
securityContext:
capabilities:
add:
- SYS_NICE
args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '2048'
- --max-total-tokens
- '4096'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: llm-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-dependency-deploy
ports:
- name: service
port: 9009
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: llm-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
args: null
ports:
- containerPort: 9000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: llm-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-deploy
ports:
- name: service
port: 9000
targetPort: 9000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: retriever-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_EMBEDDING_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: HUGGINGFACEHUB_API_TOKEN
valueFrom:
configMapKeyRef:
name: qna-config
key: HUGGINGFACEHUB_API_TOKEN
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
args: null
ports:
- containerPort: 7000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: retriever-svc
namespace: default
spec:
type: ClusterIP
selector:
app: retriever-deploy
ports:
- name: service
port: 7000
targetPort: 7000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
labels:
app: vector-db
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: vector-db
containers:
- name: vector-db
image: redis/redis-stack:7.2.0-v9
ports:
- containerPort: 6379
- containerPort: 8001
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
type: ClusterIP
selector:
app: vector-db
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
---

View File

@@ -1,591 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: qna-config
namespace: default
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
RERANK_MODEL_ID: BAAI/bge-reranker-base
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
INDEX_NAME: rag-redis
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
EMBEDDING_SERVICE_HOST_IP: embedding-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
RERANK_SERVICE_HOST_IP: reranking-svc
NODE_SELECTOR: chatqna-opea
LLM_SERVICE_HOST_IP: llm-svc
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna-without-rerank:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
args: null
ports:
- containerPort: 8888
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
type: NodePort
selector:
app: chatqna-backend-server-deploy
ports:
- name: service
port: 8888
targetPort: 8888
nodePort: 30888
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: dataprep-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
args: null
ports:
- containerPort: 6007
- containerPort: 6008
- containerPort: 6009
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: dataprep-svc
namespace: default
spec:
type: ClusterIP
selector:
app: dataprep-deploy
ports:
- name: port1
port: 6007
targetPort: 6007
- name: port2
port: 6008
targetPort: 6008
- name: port3
port: 6009
targetPort: 6009
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
name: embedding-dependency-deploy
args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: embedding-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-dependency-deploy
ports:
- name: service
port: 6006
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: embedding-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
args: null
ports:
- containerPort: 6000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: embedding-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-deploy
ports:
- name: service
port: 6000
targetPort: 6000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 16
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
name: llm-dependency-deploy-demo
securityContext:
capabilities:
add:
- SYS_NICE
args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '2048'
- --max-total-tokens
- '4096'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: llm-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-dependency-deploy
ports:
- name: service
port: 9009
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: llm-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
args: null
ports:
- containerPort: 9000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: llm-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-deploy
ports:
- name: service
port: 9000
targetPort: 9000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: retriever-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_EMBEDDING_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: HUGGINGFACEHUB_API_TOKEN
valueFrom:
configMapKeyRef:
name: qna-config
key: HUGGINGFACEHUB_API_TOKEN
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
args: null
ports:
- containerPort: 7000
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: retriever-svc
namespace: default
spec:
type: ClusterIP
selector:
app: retriever-deploy
ports:
- name: service
port: 7000
targetPort: 7000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
labels:
app: vector-db
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: vector-db
containers:
- name: vector-db
image: redis/redis-stack:7.2.0-v9
ports:
- containerPort: 6379
- containerPort: 8001
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
type: ClusterIP
selector:
app: vector-db
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
---

View File

@@ -29,8 +29,6 @@ Results will be displayed in the terminal and saved as CSV file named `1_stats.c
## Getting Started
We recommend using Kubernetes to deploy the ChatQnA service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs. Below is a description of Kubernetes deployment and benchmarking. For instructions on deploying and benchmarking with Docker, please refer to [this section](#benchmark-with-docker).
### Prerequisites
- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md).
@@ -189,13 +187,10 @@ curl -X POST "http://${cluster_ip}:6007/v1/dataprep" \
###### 3.2 Run Benchmark Test
We copy the configuration file [benchmark.yaml](./benchmark.yaml) to `GenAIEval/evals/benchmark/benchmark.yaml` and configure `test_suite_config.deployment_type`, `test_suite_config.service_ip`, `test_suite_config.service_port`, `test_suite_config.user_queries` and `test_suite_config.test_output_dir`.
We copy the configuration file [benchmark.yaml](./benchmark.yaml) to `GenAIEval/evals/benchmark/benchmark.yaml` and configure `test_suite_config.user_queries` and `test_suite_config.test_output_dir`.
```bash
export DEPLOYMENT_TYPE="k8s"
export SERVICE_IP=None
export SERVICE_PORT=None
export USER_QUERIES="[640, 640, 640, 640]"
export USER_QUERIES="[4, 8, 16, 640]"
export TEST_OUTPUT_DIR="/home/sdp/benchmark_output/node_1"
envsubst < ./benchmark.yaml > GenAIEval/evals/benchmark/benchmark.yaml
```
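As an optional sanity check (a minimal sketch, assuming the paths and values above), you can confirm that `envsubst` actually replaced the placeholders before launching the benchmark:
```bash
# Expect the values exported above, e.g. user_queries: [4, 8, 16, 640]
# and test_output_dir: /home/sdp/benchmark_output/node_1.
grep -E "user_queries|test_output_dir" GenAIEval/evals/benchmark/benchmark.yaml
```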
@@ -242,22 +237,20 @@ kubectl apply -f .
##### 3. Run tests
We copy the configuration file [benchmark.yaml](./benchmark.yaml) to `GenAIEval/evals/benchmark/benchmark.yaml` and configure `test_suite_config.deployment_type`, `test_suite_config.service_ip`, `test_suite_config.service_port`, `test_suite_config.user_queries` and `test_suite_config.test_output_dir`.
We copy the configuration file [benchmark.yaml](./benchmark.yaml) to `GenAIEval/evals/benchmark/benchmark.yaml` and configure `test_suite_config.user_queries` and `test_suite_config.test_output_dir`.
````bash
export DEPLOYMENT_TYPE="k8s"
export SERVICE_IP=None
export SERVICE_PORT=None
export USER_QUERIES="[1280, 1280, 1280, 1280]"
```bash
export USER_QUERIES="[4, 8, 16, 1280]"
export TEST_OUTPUT_DIR="/home/sdp/benchmark_output/node_2"
envsubst < ./benchmark.yaml > GenAIEval/evals/benchmark/benchmark.yaml
```
Then run the benchmark tool:
```bash
cd GenAIEval/evals/benchmark
python benchmark.py
````
```
##### 4. Data collection
@@ -293,13 +286,10 @@ kubectl apply -f .
##### 3. Run tests
We copy the configuration file [benchmark.yaml](./benchmark.yaml) to `GenAIEval/evals/benchmark/benchmark.yaml` and configure `test_suite_config.deployment_type`, `test_suite_config.service_ip`, `test_suite_config.service_port`, `test_suite_config.user_queries` and `test_suite_config.test_output_dir`.
We copy the configuration file [benchmark.yaml](./benchmark.yaml) to `GenAIEval/evals/benchmark/benchmark.yaml` and configure `test_suite_config.user_queries` and `test_suite_config.test_output_dir`.
```bash
export DEPLOYMENT_TYPE="k8s"
export SERVICE_IP=None
export SERVICE_PORT=None
export USER_QUERIES="[2560, 2560, 2560, 2560]"
export USER_QUERIES="[4, 8, 16, 2560]"
export TEST_OUTPUT_DIR="/home/sdp/benchmark_output/node_4"
envsubst < ./benchmark.yaml > GenAIEval/evals/benchmark/benchmark.yaml
```
@@ -323,80 +313,3 @@ cd GenAIExamples/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi
kubectl delete -f .
kubectl label nodes k8s-master k8s-worker1 k8s-worker2 k8s-worker3 node-type-
```
## Benchmark with Docker
### Deploy ChatQnA service with Docker
In order to set up the environment correctly, you'll need to configure essential environment variables and, if applicable, proxy-related variables.
```bash
# Example: host_ip="192.168.1.1"
export host_ip="External_Public_IP"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy"
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```
#### Deploy ChatQnA on Gaudi
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
docker compose up -d
```
Refer to the [Gaudi Guide](../../docker_compose/intel/hpu/gaudi/README.md) to build docker images from source.
#### Deploy ChatQnA on Xeon
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose up -d
```
Refer to the [Xeon Guide](../../docker_compose/intel/cpu/xeon/README.md) for more instructions on building docker images from source.
#### Deploy ChatQnA on NVIDIA GPU
```bash
cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu/
docker compose up -d
```
Refer to the [NVIDIA GPU Guide](../../docker_compose/nvidia/gpu/README.md) for more instructions on building docker images from source.
### Run tests
We copy the configuration file [benchmark.yaml](./benchmark.yaml) to `GenAIEval/evals/benchmark/benchmark.yaml` and configure `test_suite_config.deployment_type`, `test_suite_config.service_ip`, `test_suite_config.service_port`, `test_suite_config.user_queries` and `test_suite_config.test_output_dir`.
```bash
export DEPLOYMENT_TYPE="docker"
export SERVICE_IP = "ChatQnA Service IP"
export SERVICE_PORT = "ChatQnA Service Port"
export USER_QUERIES="[640, 640, 640, 640]"
export TEST_OUTPUT_DIR="/home/sdp/benchmark_output/docker"
envsubst < ./benchmark.yaml > GenAIEval/evals/benchmark/benchmark.yaml
```
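Unlike the k8s runs, a Docker deployment needs concrete `service_ip` and `service_port` values rather than `None`, so it can help to verify the substitution first (a sketch under the same assumptions as above):
```bash
# deployment_type should read "docker" and both service fields
# should show real values, not None.
grep -E "deployment_type|service_ip|service_port" GenAIEval/evals/benchmark/benchmark.yaml
```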
Then run the benchmark tool:
```bash
cd GenAIEval/evals/benchmark
python benchmark.py
```
### Data collection
All test results will be written to the folder `/home/sdp/benchmark_output/docker`, configured by the `TEST_OUTPUT_DIR` environment variable in the previous steps.
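To inspect the collected results (a minimal sketch; the exact file names, e.g. a `1_stats.csv`-style summary, depend on the run):
```bash
ls -lh /home/sdp/benchmark_output/docker
```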
### Clean up
Take Gaudi as an example: use the commands below to clean up the system.
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi
docker compose stop && docker compose rm -f
echo y | docker system prune
```

View File

@@ -3,9 +3,6 @@
test_suite_config: # Overall configuration settings for the test suite
examples: ["chatqna"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna
deployment_type: ${DEPLOYMENT_TYPE} # Default is "k8s", can also be "docker"
service_ip: ${SERVICE_IP} # Leave as None for k8s, specify for Docker
service_port: ${SERVICE_PORT} # Leave as None for k8s, specify for Docker
concurrent_level: 5 # The concurrency level, adjustable based on requirements
user_queries: ${USER_QUERIES} # Number of test requests at each concurrency level
random_prompt: false # Use random prompts if true, fixed prompts if false

View File

@@ -1,683 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
EMBEDDING_SERVICE_HOST_IP: embedding-svc
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
INDEX_NAME: rag-redis
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_SERVICE_HOST_IP: llm-svc
NODE_SELECTOR: chatqna-opea
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
RERANK_MODEL_ID: BAAI/bge-reranker-base
RERANK_SERVICE_HOST_IP: reranking-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
kind: ConfigMap
metadata:
name: qna-config
namespace: default
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
ports:
- containerPort: 8888
resources:
limits:
cpu: 8
memory: 8000Mi
requests:
cpu: 8
memory: 8000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
ports:
- name: service
nodePort: 30888
port: 8888
targetPort: 8888
selector:
app: chatqna-backend-server-deploy
type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
ports:
- containerPort: 6007
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: dataprep-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: dataprep-svc
namespace: default
spec:
ports:
- name: port1
port: 6007
targetPort: 6007
selector:
app: dataprep-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
imagePullPolicy: IfNotPresent
name: embedding-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
cpu: 76
memory: 20000Mi
requests:
cpu: 76
memory: 20000Mi
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: embedding-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 6006
targetPort: 80
selector:
app: embedding-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
ports:
- containerPort: 6000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: embedding-svc
namespace: default
spec:
ports:
- name: service
port: 6000
targetPort: 6000
selector:
app: embedding-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 31
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '1024'
- --max-total-tokens
- '2048'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
securityContext:
capabilities:
add:
- SYS_NICE
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: llm-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 9009
targetPort: 80
selector:
app: llm-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
ports:
- containerPort: 9000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: llm-svc
namespace: default
spec:
ports:
- name: service
port: 9000
targetPort: 9000
selector:
app: llm-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(RERANK_MODEL_ID)
- --auto-truncate
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HF_TOKEN
value: ${HF_TOKEN}
- name: MAX_WARMUP_SEQUENCE_LENGTH
value: '512'
envFrom:
- configMapRef:
name: qna-config
image: opea/tei-gaudi:latest
imagePullPolicy: IfNotPresent
name: reranking-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: reranking-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 8808
targetPort: 80
selector:
app: reranking-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: reranking-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/reranking-tei:latest
imagePullPolicy: IfNotPresent
name: reranking-deploy
ports:
- containerPort: 8000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: reranking-svc
namespace: default
spec:
ports:
- name: service
port: 8000
targetPort: 8000
selector:
app: reranking-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
ports:
- containerPort: 7000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: retriever-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: retriever-svc
namespace: default
spec:
ports:
- name: service
port: 7000
targetPort: 7000
selector:
app: retriever-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: vector-db
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: redis/redis-stack:7.2.0-v9
imagePullPolicy: IfNotPresent
name: vector-db
ports:
- containerPort: 6379
- containerPort: 8001
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: vector-db
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
selector:
app: vector-db
type: ClusterIP
---

View File

@@ -1,683 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
EMBEDDING_SERVICE_HOST_IP: embedding-svc
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
INDEX_NAME: rag-redis
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_SERVICE_HOST_IP: llm-svc
NODE_SELECTOR: chatqna-opea
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
RERANK_MODEL_ID: BAAI/bge-reranker-base
RERANK_SERVICE_HOST_IP: reranking-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
kind: ConfigMap
metadata:
name: qna-config
namespace: default
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
ports:
- containerPort: 8888
resources:
limits:
cpu: 8
memory: 8000Mi
requests:
cpu: 8
memory: 8000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
ports:
- name: service
nodePort: 30888
port: 8888
targetPort: 8888
selector:
app: chatqna-backend-server-deploy
type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
ports:
- containerPort: 6007
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: dataprep-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: dataprep-svc
namespace: default
spec:
ports:
- name: port1
port: 6007
targetPort: 6007
selector:
app: dataprep-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
imagePullPolicy: IfNotPresent
name: embedding-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
cpu: 76
memory: 20000Mi
requests:
cpu: 76
memory: 20000Mi
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: embedding-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 6006
targetPort: 80
selector:
app: embedding-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
ports:
- containerPort: 6000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: embedding-svc
namespace: default
spec:
ports:
- name: service
port: 6000
targetPort: 6000
selector:
app: embedding-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 7
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '1024'
- --max-total-tokens
- '2048'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
securityContext:
capabilities:
add:
- SYS_NICE
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: llm-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 9009
targetPort: 80
selector:
app: llm-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
ports:
- containerPort: 9000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: llm-svc
namespace: default
spec:
ports:
- name: service
port: 9000
targetPort: 9000
selector:
app: llm-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(RERANK_MODEL_ID)
- --auto-truncate
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HF_TOKEN
value: ${HF_TOKEN}
- name: MAX_WARMUP_SEQUENCE_LENGTH
value: '512'
envFrom:
- configMapRef:
name: qna-config
image: opea/tei-gaudi:latest
imagePullPolicy: IfNotPresent
name: reranking-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: reranking-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 8808
targetPort: 80
selector:
app: reranking-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/reranking-tei:latest
imagePullPolicy: IfNotPresent
name: reranking-deploy
ports:
- containerPort: 8000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: reranking-svc
namespace: default
spec:
ports:
- name: service
port: 8000
targetPort: 8000
selector:
app: reranking-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
ports:
- containerPort: 7000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: retriever-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: retriever-svc
namespace: default
spec:
ports:
- name: service
port: 7000
targetPort: 7000
selector:
app: retriever-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: vector-db
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: redis/redis-stack:7.2.0-v9
imagePullPolicy: IfNotPresent
name: vector-db
ports:
- containerPort: 6379
- containerPort: 8001
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: vector-db
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
selector:
app: vector-db
type: ClusterIP
---

View File

@@ -1,683 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
EMBEDDING_SERVICE_HOST_IP: embedding-svc
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
INDEX_NAME: rag-redis
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_SERVICE_HOST_IP: llm-svc
NODE_SELECTOR: chatqna-opea
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
RERANK_MODEL_ID: BAAI/bge-reranker-base
RERANK_SERVICE_HOST_IP: reranking-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
kind: ConfigMap
metadata:
name: qna-config
namespace: default
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
ports:
- containerPort: 8888
resources:
limits:
cpu: 8
memory: 8000Mi
requests:
cpu: 8
memory: 8000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
ports:
- name: service
nodePort: 30888
port: 8888
targetPort: 8888
selector:
app: chatqna-backend-server-deploy
type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
ports:
- containerPort: 6007
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: dataprep-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: dataprep-svc
namespace: default
spec:
ports:
- name: port1
port: 6007
targetPort: 6007
selector:
app: dataprep-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
imagePullPolicy: IfNotPresent
name: embedding-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
cpu: 76
memory: 20000Mi
requests:
cpu: 76
memory: 20000Mi
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: embedding-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 6006
targetPort: 80
selector:
app: embedding-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
ports:
- containerPort: 6000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: embedding-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: embedding-svc
namespace: default
spec:
ports:
- name: service
port: 6000
targetPort: 6000
selector:
app: embedding-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 15
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '1024'
- --max-total-tokens
- '2048'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
securityContext:
capabilities:
add:
- SYS_NICE
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: llm-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 9009
targetPort: 80
selector:
app: llm-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
ports:
- containerPort: 9000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: llm-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: llm-svc
namespace: default
spec:
ports:
- name: service
port: 9000
targetPort: 9000
selector:
app: llm-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: reranking-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-dependency-deploy
spec:
containers:
- args:
- --model-id
- $(RERANK_MODEL_ID)
- --auto-truncate
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HF_TOKEN
value: ${HF_TOKEN}
- name: MAX_WARMUP_SEQUENCE_LENGTH
value: '512'
envFrom:
- configMapRef:
name: qna-config
image: opea/tei-gaudi:latest
imagePullPolicy: IfNotPresent
name: reranking-dependency-deploy
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-dependency-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
volumes:
- hostPath:
path: /mnt/models
type: Directory
name: model-volume
- emptyDir:
medium: Memory
sizeLimit: 1Gi
name: shm
---
apiVersion: v1
kind: Service
metadata:
name: reranking-dependency-svc
namespace: default
spec:
ports:
- name: service
port: 8808
targetPort: 80
selector:
app: reranking-dependency-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: reranking-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: reranking-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: reranking-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/reranking-tei:latest
imagePullPolicy: IfNotPresent
name: reranking-deploy
ports:
- containerPort: 8000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: reranking-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: reranking-svc
namespace: default
spec:
ports:
- name: service
port: 8000
targetPort: 8000
selector:
app: reranking-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
ports:
- containerPort: 7000
resources:
requests:
cpu: 4
memory: 4000Mi
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: retriever-deploy
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: retriever-svc
namespace: default
spec:
ports:
- name: service
port: 7000
targetPort: 7000
selector:
app: retriever-deploy
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: vector-db
spec:
containers:
- envFrom:
- configMapRef:
name: qna-config
image: redis/redis-stack:7.2.0-v9
imagePullPolicy: IfNotPresent
name: vector-db
ports:
- containerPort: 6379
- containerPort: 8001
hostIPC: true
nodeSelector:
node-type: chatqna-opea
serviceAccountName: default
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: vector-db
maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
selector:
app: vector-db
type: ClusterIP
---
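A minimal sketch for rolling out and sanity-checking the manifests above, assuming they are concatenated into one file (the file name is illustrative):

```bash
# Apply the full set of Deployments and Services, then wait for the LLM
# wrapper to become ready and list the ClusterIP endpoints defined above.
kubectl apply -f chatqna_manifest.yaml
kubectl -n default rollout status deploy/llm-deploy
kubectl -n default get svc   # llm-svc:9000, reranking-svc:8000, retriever-svc:7000, vector-db:6379
```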

View File

@@ -1,622 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: qna-config
namespace: default
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
RERANK_MODEL_ID: BAAI/bge-reranker-base
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
INDEX_NAME: rag-redis
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
EMBEDDING_SERVICE_HOST_IP: embedding-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
RERANK_SERVICE_HOST_IP: reranking-svc
NODE_SELECTOR: chatqna-opea
LLM_SERVICE_HOST_IP: llm-svc
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna-without-rerank:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
args: null
ports:
- containerPort: 8888
resources:
limits:
cpu: 8
memory: 4000Mi
requests:
cpu: 8
memory: 4000Mi
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
type: NodePort
selector:
app: chatqna-backend-server-deploy
ports:
- name: service
port: 8888
targetPort: 8888
nodePort: 30888
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: dataprep-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
args: null
ports:
- containerPort: 6007
- containerPort: 6008
- containerPort: 6009
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: dataprep-svc
namespace: default
spec:
type: ClusterIP
selector:
app: dataprep-deploy
ports:
- name: port1
port: 6007
targetPort: 6007
- name: port2
port: 6008
targetPort: 6008
- name: port3
port: 6009
targetPort: 6009
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
name: embedding-dependency-deploy
args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
cpu: 76
memory: 20000Mi
requests:
cpu: 76
memory: 20000Mi
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: embedding-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-dependency-deploy
ports:
- name: service
port: 6006
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: embedding-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
args: null
ports:
- containerPort: 6000
resources:
limits:
cpu: 4
requests:
cpu: 4
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: embedding-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-deploy
ports:
- name: service
port: 6000
targetPort: 6000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 32
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
name: llm-dependency-deploy-demo
securityContext:
capabilities:
add:
- SYS_NICE
args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '1024'
- --max-total-tokens
- '2048'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: llm-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-dependency-deploy
ports:
- name: service
port: 9009
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: llm-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
args: null
ports:
- containerPort: 9000
resources:
limits:
cpu: 4
requests:
cpu: 4
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: llm-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-deploy
ports:
- name: service
port: 9000
targetPort: 9000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 4
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: retriever-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_EMBEDDING_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: HUGGINGFACEHUB_API_TOKEN
valueFrom:
configMapKeyRef:
name: qna-config
key: HUGGINGFACEHUB_API_TOKEN
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
args: null
ports:
- containerPort: 7000
resources:
limits:
cpu: 8
memory: 2500Mi
requests:
cpu: 8
memory: 2500Mi
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: retriever-svc
namespace: default
spec:
type: ClusterIP
selector:
app: retriever-deploy
ports:
- name: service
port: 7000
targetPort: 7000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
labels:
app: vector-db
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: vector-db
containers:
- name: vector-db
image: redis/redis-stack:7.2.0-v9
ports:
- containerPort: 6379
- containerPort: 8001
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
type: ClusterIP
selector:
app: vector-db
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
---

View File

@@ -1,622 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: qna-config
namespace: default
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
RERANK_MODEL_ID: BAAI/bge-reranker-base
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
INDEX_NAME: rag-redis
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
EMBEDDING_SERVICE_HOST_IP: embedding-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
RERANK_SERVICE_HOST_IP: reranking-svc
NODE_SELECTOR: chatqna-opea
LLM_SERVICE_HOST_IP: llm-svc
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna-without-rerank:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
args: null
ports:
- containerPort: 8888
resources:
limits:
cpu: 8
memory: 4000Mi
requests:
cpu: 8
memory: 4000Mi
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
type: NodePort
selector:
app: chatqna-backend-server-deploy
ports:
- name: service
port: 8888
targetPort: 8888
nodePort: 30888
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: dataprep-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
args: null
ports:
- containerPort: 6007
- containerPort: 6008
- containerPort: 6009
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: dataprep-svc
namespace: default
spec:
type: ClusterIP
selector:
app: dataprep-deploy
ports:
- name: port1
port: 6007
targetPort: 6007
- name: port2
port: 6008
targetPort: 6008
- name: port3
port: 6009
targetPort: 6009
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
name: embedding-dependency-deploy
args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
cpu: 76
memory: 20000Mi
requests:
cpu: 76
memory: 20000Mi
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: embedding-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-dependency-deploy
ports:
- name: service
port: 6006
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: embedding-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
args: null
ports:
- containerPort: 6000
resources:
limits:
cpu: 4
requests:
cpu: 4
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: embedding-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-deploy
ports:
- name: service
port: 6000
targetPort: 6000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 8
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
name: llm-dependency-deploy-demo
securityContext:
capabilities:
add:
- SYS_NICE
args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '1024'
- --max-total-tokens
- '2048'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: llm-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-dependency-deploy
ports:
- name: service
port: 9009
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: llm-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
args: null
ports:
- containerPort: 9000
resources:
limits:
cpu: 4
requests:
cpu: 4
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: llm-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-deploy
ports:
- name: service
port: 9000
targetPort: 9000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: retriever-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_EMBEDDING_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: HUGGINGFACEHUB_API_TOKEN
valueFrom:
configMapKeyRef:
name: qna-config
key: HUGGINGFACEHUB_API_TOKEN
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
args: null
ports:
- containerPort: 7000
resources:
limits:
cpu: 8
memory: 2500Mi
requests:
cpu: 8
memory: 2500Mi
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: retriever-svc
namespace: default
spec:
type: ClusterIP
selector:
app: retriever-deploy
ports:
- name: service
port: 7000
targetPort: 7000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
labels:
app: vector-db
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: vector-db
containers:
- name: vector-db
image: redis/redis-stack:7.2.0-v9
ports:
- containerPort: 6379
- containerPort: 8001
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
type: ClusterIP
selector:
app: vector-db
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
---

View File

@@ -1,622 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: qna-config
namespace: default
data:
EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
RERANK_MODEL_ID: BAAI/bge-reranker-base
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
INDEX_NAME: rag-redis
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
EMBEDDING_SERVICE_HOST_IP: embedding-svc
RETRIEVER_SERVICE_HOST_IP: retriever-svc
RERANK_SERVICE_HOST_IP: reranking-svc
NODE_SELECTOR: chatqna-opea
LLM_SERVICE_HOST_IP: llm-svc
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: chatqna-backend-server-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: chatqna-backend-server-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: chatqna-backend-server-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: chatqna-backend-server-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/chatqna-without-rerank:latest
imagePullPolicy: IfNotPresent
name: chatqna-backend-server-deploy
args: null
ports:
- containerPort: 8888
resources:
limits:
cpu: 8
memory: 4000Mi
requests:
cpu: 8
memory: 4000Mi
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: chatqna-backend-server-svc
namespace: default
spec:
type: NodePort
selector:
app: chatqna-backend-server-deploy
ports:
- name: service
port: 8888
targetPort: 8888
nodePort: 30888
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: dataprep-deploy
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: dataprep-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: dataprep-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: dataprep-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/dataprep-redis:latest
imagePullPolicy: IfNotPresent
name: dataprep-deploy
args: null
ports:
- containerPort: 6007
- containerPort: 6008
- containerPort: 6009
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: dataprep-svc
namespace: default
spec:
type: ClusterIP
selector:
app: dataprep-deploy
ports:
- name: port1
port: 6007
targetPort: 6007
- name: port2
port: 6008
targetPort: 6008
- name: port3
port: 6009
targetPort: 6009
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-dependency-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: embedding-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
name: embedding-dependency-deploy
args:
- --model-id
- $(EMBEDDING_MODEL_ID)
- --auto-truncate
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
cpu: 76
memory: 20000Mi
requests:
cpu: 76
memory: 20000Mi
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: embedding-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-dependency-deploy
ports:
- name: service
port: 6006
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: embedding-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: embedding-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: embedding-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: embedding-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/embedding-tei:latest
imagePullPolicy: IfNotPresent
name: embedding-deploy
args: null
ports:
- containerPort: 6000
resources:
limits:
cpu: 4
requests:
cpu: 4
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: embedding-svc
namespace: default
spec:
type: ClusterIP
selector:
app: embedding-deploy
ports:
- name: service
port: 6000
targetPort: 6000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-dependency-deploy
namespace: default
spec:
replicas: 16
selector:
matchLabels:
app: llm-dependency-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-dependency-deploy
spec:
nodeSelector:
node-type: chatqna-opea
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
name: llm-dependency-deploy-demo
securityContext:
capabilities:
add:
- SYS_NICE
args:
- --model-id
- $(LLM_MODEL_ID)
- --max-input-length
- '1024'
- --max-total-tokens
- '2048'
- --max-batch-total-tokens
- '65536'
- --max-batch-prefill-tokens
- '4096'
volumeMounts:
- mountPath: /data
name: model-volume
- mountPath: /dev/shm
name: shm
ports:
- containerPort: 80
resources:
limits:
habana.ai/gaudi: 1
env:
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: 'true'
- name: runtime
value: habana
- name: HABANA_VISIBLE_DEVICES
value: all
- name: HUGGING_FACE_HUB_TOKEN
value: ${HF_TOKEN}
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
serviceAccountName: default
volumes:
- name: model-volume
hostPath:
path: /mnt/models
type: Directory
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
---
kind: Service
apiVersion: v1
metadata:
name: llm-dependency-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-dependency-deploy
ports:
- name: service
port: 9009
targetPort: 80
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: llm-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: llm-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: llm-deploy
hostIPC: true
containers:
- envFrom:
- configMapRef:
name: qna-config
image: opea/llm-tgi:latest
imagePullPolicy: IfNotPresent
name: llm-deploy
args: null
ports:
- containerPort: 9000
resources:
limits:
cpu: 4
requests:
cpu: 4
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: llm-svc
namespace: default
spec:
type: ClusterIP
selector:
app: llm-deploy
ports:
- name: service
port: 9000
targetPort: 9000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever-deploy
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: retriever-deploy
template:
metadata:
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: 'true'
labels:
app: retriever-deploy
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: retriever-deploy
hostIPC: true
containers:
- env:
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: qna-config
key: REDIS_URL
- name: TEI_EMBEDDING_ENDPOINT
valueFrom:
configMapKeyRef:
name: qna-config
key: TEI_EMBEDDING_ENDPOINT
- name: HUGGINGFACEHUB_API_TOKEN
valueFrom:
configMapKeyRef:
name: qna-config
key: HUGGINGFACEHUB_API_TOKEN
- name: INDEX_NAME
valueFrom:
configMapKeyRef:
name: qna-config
key: INDEX_NAME
image: opea/retriever-redis:latest
imagePullPolicy: IfNotPresent
name: retriever-deploy
args: null
ports:
- containerPort: 7000
resources:
limits:
cpu: 8
memory: 2500Mi
requests:
cpu: 8
memory: 2500Mi
serviceAccountName: default
---
kind: Service
apiVersion: v1
metadata:
name: retriever-svc
namespace: default
spec:
type: ClusterIP
selector:
app: retriever-deploy
ports:
- name: service
port: 7000
targetPort: 7000
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
name: vector-db
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: vector-db
template:
metadata:
labels:
app: vector-db
spec:
nodeSelector:
node-type: chatqna-opea
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: vector-db
containers:
- name: vector-db
image: redis/redis-stack:7.2.0-v9
ports:
- containerPort: 6379
- containerPort: 8001
---
apiVersion: v1
kind: Service
metadata:
name: vector-db
namespace: default
spec:
type: ClusterIP
selector:
app: vector-db
ports:
- name: vector-db-service
port: 6379
targetPort: 6379
- name: vector-db-insight
port: 8001
targetPort: 8001
---
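The three removed benchmark manifests above are identical apart from replica counts (for example, llm-dependency-deploy at 32, 8, and 16 replicas respectively), and each TGI replica requests one `habana.ai/gaudi` card. A quick way to check what a cluster can actually allocate before choosing a variant, offered as a sketch:

```bash
# List allocatable Gaudi cards per node; the dots in the resource name are
# escaped for kubectl's custom-columns syntax.
kubectl get nodes -o custom-columns='NODE:.metadata.name,GAUDI:.status.allocatable.habana\.ai/gaudi'
```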

View File

@@ -19,7 +19,7 @@ opea_micro_services:
tei-embedding-service:
host: ${TEI_EMBEDDING_SERVICE_IP}
ports: ${TEI_EMBEDDING_SERVICE_PORT}
image: ghcr.io/huggingface/tei-gaudi:latest
image: opea/tei-gaudi:latest
volumes:
- "./data:/data"
runtime: habana
@@ -48,7 +48,7 @@ opea_micro_services:
tgi-service:
host: ${TGI_SERVICE_IP}
ports: ${TGI_SERVICE_PORT}
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
volumes:
- "./data:/data"
runtime: habana
@@ -56,13 +56,10 @@ opea_micro_services:
- SYS_NICE
ipc: host
environment:
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
model-id: ${LLM_MODEL_ID}
llm:
host: ${LLM_SERVICE_HOST_IP}

View File

@@ -49,7 +49,7 @@ docker pull opea/chatqna-ui:latest
In the following cases, you can build the Docker image from source yourself.
- Failed to download the docker image.
- Failed to download the docker image. (The essential Docker image `opea/nginx` has not yet been released, users need to build this image first)
- If you want to use a specific version of Docker image.
@@ -233,7 +233,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="Intel/neural-chat-7b-v3-3"
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu --model-id $model_name
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.2.0 --model-id $model_name
```
2. Offline
@@ -247,7 +247,7 @@ For users in China who are unable to download models directly from Huggingface,
```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
docker run -p 8008:80 -v $model_path:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu --model-id /data
docker run -p 8008:80 -v $model_path:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.2.0 --model-id /data
```
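For the offline path above, the model directory must be populated in advance. A minimal sketch using `huggingface-cli` (assumes a recent `huggingface_hub`; the target path is illustrative):

```bash
export HF_TOKEN=${your_hf_token}
# Download the model snapshot into a plain directory that the container
# can later mount at /data.
huggingface-cli download Intel/neural-chat-7b-v3-3 --local-dir /path/to/model
```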
### Setup Environment Variables

View File

@@ -69,7 +69,7 @@ services:
INDEX_NAME: ${INDEX_NAME}
restart: unless-stopped
tei-reranking-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
container_name: tei-reranking-server
ports:
- "6041:80"

View File

@@ -25,7 +25,7 @@ services:
TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tei-embedding-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
container_name: tei-embedding-server
ports:
- "6006:80"
@@ -75,7 +75,7 @@ services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
tei-reranking-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
container_name: tei-reranking-server
ports:
- "8808:80"

View File

@@ -50,7 +50,7 @@ docker pull opea/chatqna-ui:latest
In the following cases, you can build the Docker image from source yourself.
- Failed to download the docker image.
- Failed to download the docker image. (The essential Docker image `opea/nginx` has not yet been released, users need to build this image first)
- If you want to use a specific version of Docker image.

View File

@@ -25,7 +25,7 @@ services:
TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tei-embedding-service:
image: ghcr.io/huggingface/tei-gaudi:latest
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
container_name: tei-embedding-gaudi-server
ports:
- "8090:80"
@@ -108,7 +108,7 @@ services:
HF_HUB_ENABLE_HF_TRANSFER: 0
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8005:80"
@@ -118,15 +118,11 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: ${llm_service_devices}
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -25,7 +25,7 @@ services:
TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tgi-guardrails-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-guardrails-server
ports:
- "8088:80"
@@ -35,15 +35,11 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE
@@ -64,7 +60,7 @@ services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
tei-embedding-service:
image: ghcr.io/huggingface/tei-gaudi:latest
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
container_name: tei-embedding-gaudi-server
ports:
- "8090:80"
@@ -145,7 +141,7 @@ services:
HF_HUB_ENABLE_HF_TRANSFER: 0
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8008:80"
@@ -155,15 +151,11 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -25,7 +25,7 @@ services:
TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tei-embedding-service:
image: ghcr.io/huggingface/tei-gaudi:latest
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
container_name: tei-embedding-gaudi-server
ports:
- "8090:80"
@@ -108,7 +108,7 @@ services:
# HF_HUB_ENABLE_HF_TRANSFER: 0
# restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8005:80"
@@ -118,15 +118,11 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -25,7 +25,7 @@ services:
TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tei-embedding-service:
image: ghcr.io/huggingface/tei-gaudi:latest
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
container_name: tei-embedding-gaudi-server
ports:
- "8090:80"
@@ -73,7 +73,7 @@ services:
INDEX_NAME: ${INDEX_NAME}
restart: unless-stopped
tei-reranking-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
container_name: tei-reranking-gaudi-server
ports:
- "8808:80"

View File

@@ -25,7 +25,7 @@ services:
TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tei-embedding-service:
image: ghcr.io/huggingface/tei-gaudi:latest
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
container_name: tei-embedding-gaudi-server
ports:
- "8090:80"
@@ -73,7 +73,7 @@ services:
INDEX_NAME: ${INDEX_NAME}
restart: unless-stopped
tei-reranking-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
container_name: tei-reranking-gaudi-server
ports:
- "8808:80"

View File

@@ -25,7 +25,7 @@ services:
TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tei-embedding-service:
image: ghcr.io/huggingface/tei-gaudi:latest
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
container_name: tei-embedding-gaudi-server
ports:
- "8090:80"
@@ -75,7 +75,7 @@ services:
INDEX_NAME: ${INDEX_NAME}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8005:80"
@@ -85,15 +85,11 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -56,16 +56,16 @@ f810f3b4d329 opea/embedding-tei:latest "python e
2fa17d84605f opea/dataprep-redis:latest "python prepare_doc_…" 2 minutes ago Up 2 minutes 0.0.0.0:6007->6007/tcp, :::6007->6007/tcp dataprep-redis-server
69e1fb59e92c opea/retriever-redis:latest "/home/user/comps/re…" 2 minutes ago Up 2 minutes 0.0.0.0:7000->7000/tcp, :::7000->7000/tcp retriever-redis-server
313b9d14928a opea/reranking-tei:latest "python reranking_te…" 2 minutes ago Up 2 minutes 0.0.0.0:8000->8000/tcp, :::8000->8000/tcp reranking-tei-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
174bd43fa6b5 ghcr.io/huggingface/tei-gaudi:latest "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8090->80/tcp, :::8090->80/tcp tei-embedding-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:1.2.1 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
174bd43fa6b5 opea/tei-gaudi:latest "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8090->80/tcp, :::8090->80/tcp tei-embedding-gaudi-server
74084469aa33 redis/redis-stack:7.2.0-v9 "/entrypoint.sh" 2 minutes ago Up 2 minutes 0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp redis-vector-db
88399dbc9e43 ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8808->80/tcp, :::8808->80/tcp tei-reranking-gaudi-server
```
In this case, `ghcr.io/huggingface/tgi-gaudi:2.0.5` exited.
In this case, `ghcr.io/huggingface/tgi-gaudi:1.2.1` exited.
```
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:1.2.1 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
```
Next, we can check the container logs to find out what happened during docker startup.
@@ -76,7 +76,7 @@ Check the log of container by:
`docker logs <CONTAINER ID> -t`
View the logs of `ghcr.io/huggingface/tgi-gaudi:2.0.5`
View the logs of `ghcr.io/huggingface/tgi-gaudi:1.2.1`
`docker logs 05c40b636239 -t`
@@ -105,7 +105,7 @@ So just make sure the devices are available.
Here is another failure example:
```
f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server
f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:1.2.1 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server
```
Check the log by `docker logs f7a08f9867f9 -t`.
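When a container exits like this, grepping its log for the first error lines usually narrows the cause down quickly; a sketch, reusing the container ID from the listing above:

```bash
docker logs f7a08f9867f9 2>&1 | grep -iE 'error|failed|not found' | head -n 20
```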
@@ -122,7 +122,7 @@ View the docker input parameters in `./ChatQnA/docker_compose/intel/hpu/gaudi/co
```
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:1.2.1
container_name: tgi-gaudi-server
ports:
- "8008:80"
@@ -131,13 +131,9 @@ View the docker input parameters in `./ChatQnA/docker_compose/intel/hpu/gaudi/co
environment:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -50,7 +50,7 @@ docker pull opea/chatqna-ui:latest
In the following cases, you can build the Docker image from source yourself.
- Failed to download the docker image.
- Failed to download the docker image. (The essential Docker image `opea/nginx` has not yet been released, users need to build this image first)
- If you want to use a specific version of Docker image.

View File

@@ -125,6 +125,12 @@ services:
dockerfile: comps/guardrails/llama_guard/langchain/Dockerfile
extends: chatqna
image: ${REGISTRY:-opea}/guardrails-tgi:${TAG:-latest}
tei-gaudi:
build:
context: tei-gaudi
dockerfile: Dockerfile-hpu
extends: chatqna
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
vllm:
build:
context: vllm

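This hunk adds a local build context for `tei-gaudi`. A minimal sketch of what the updated test scripts do with it, run from the `docker_image_build` directory:

```bash
# Clone the expected build context, then build only the tei-gaudi service.
git clone https://github.com/huggingface/tei-gaudi
docker compose -f build.yaml build tei-gaudi --no-cache
```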
View File

@@ -27,8 +27,8 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment
Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services.
For Gaudi:
- tei-embedding-service: ghcr.io/huggingface/tei-gaudi:latest
- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.5
- tei-embedding-service: opea/tei-gaudi:latest
- tgi-service: ghcr.io/huggingface/tgi-gaudi:1.2.1
> [NOTE]
> Please refer to [Xeon README](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/docker_compose/intel/cpu/xeon/README.md) or [Gaudi README](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/docker_compose/intel/hpu/gaudi/README.md) to build the OPEA images. These too will be available on Docker Hub soon to simplify use.
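To stage the Gaudi images named above before deployment, a pull sketch (assuming `opea/tei-gaudi` has been published or built locally under that tag):

```bash
docker pull opea/tei-gaudi:latest
docker pull ghcr.io/huggingface/tgi-gaudi:1.2.1
```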

View File

@@ -1474,7 +1474,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu"
image: "ghcr.io/huggingface/text-generation-inference:2.2.0"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
@@ -1554,7 +1554,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu"
image: "ghcr.io/huggingface/text-generation-inference:2.2.0"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -1477,7 +1477,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
image: "ghcr.io/huggingface/tgi-gaudi:2.0.1"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
@@ -1558,7 +1558,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
image: "ghcr.io/huggingface/tgi-gaudi:2.0.1"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -1298,7 +1298,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
image: "ghcr.io/huggingface/tgi-gaudi:2.0.1"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
git clone https://github.com/huggingface/tei-gaudi
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna-guardrails chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi guardrails-tgi"
service_list="chatqna-guardrails chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi tei-gaudi guardrails-tgi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/tei-gaudi:latest
docker images && sleep 1s
}

View File

@@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
git clone https://github.com/huggingface/tei-gaudi
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna-no-wrapper chatqna-ui dataprep-redis retriever-redis"
service_list="chatqna-no-wrapper chatqna-ui dataprep-redis retriever-redis tei-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/tei-gaudi:latest
docker images && sleep 1s
}

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="chatqna-no-wrapper chatqna-ui chatqna-conversation-ui dataprep-redis retriever-redis"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker images && sleep 1s

View File

@@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
git clone https://github.com/huggingface/tei-gaudi
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi nginx"
service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi tei-gaudi nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/tei-gaudi:latest
docker images && sleep 1s
}

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="chatqna chatqna-ui chatqna-conversation-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker images && sleep 1s

View File

@@ -17,13 +17,13 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
git clone https://github.com/huggingface/tei-gaudi
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm-hpu llm-vllm"
service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei tei-gaudi llm-vllm-hpu llm-vllm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/tei-gaudi:latest
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
docker images && sleep 1s
}

View File

@@ -23,7 +23,7 @@ function build_docker_images() {
service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm vllm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker images && sleep 1s

View File

@@ -17,13 +17,13 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
git clone https://github.com/huggingface/tei-gaudi
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm-ray-hpu llm-vllm-ray"
service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei tei-gaudi llm-vllm-ray-hpu llm-vllm-ray"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/tei-gaudi:latest
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
docker images && sleep 1s
}

View File

@@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
git clone https://github.com/huggingface/tei-gaudi
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna-without-rerank chatqna-ui dataprep-redis embedding-tei retriever-redis llm-tgi"
service_list="chatqna-without-rerank chatqna-ui dataprep-redis embedding-tei retriever-redis llm-tgi tei-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/tei-gaudi:latest
docker images && sleep 1s
}

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="chatqna-without-rerank chatqna-ui chatqna-conversation-ui dataprep-redis embedding-tei retriever-redis llm-tgi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker images && sleep 1s

View File

@@ -6,7 +6,7 @@ opea_micro_services:
tgi-service:
host: ${TGI_SERVICE_IP}
ports: ${TGI_SERVICE_PORT}
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
volumes:
- "./data:/data"
runtime: habana
@@ -17,11 +17,7 @@ opea_micro_services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
model-id: ${LLM_MODEL_ID}
llm:
host: ${LLM_SERVICE_HOST_IP}

View File

@@ -3,7 +3,7 @@
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8028:80"
@@ -15,11 +15,7 @@ services:
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -405,7 +405,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
image: "ghcr.io/huggingface/tgi-gaudi:2.0.1"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="codegen codegen-ui llm-tgi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker images && sleep 1s
}

View File

@@ -6,7 +6,7 @@ opea_micro_services:
tgi-service:
host: ${TGI_SERVICE_IP}
ports: ${TGI_SERVICE_PORT}
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
volumes:
- "./data:/data"
runtime: habana
@@ -17,11 +17,7 @@ opea_micro_services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
model-id: ${LLM_MODEL_ID}
llm:
host: ${LLM_SERVICE_HOST_IP}

View File

@@ -3,7 +3,7 @@
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: codetrans-tgi-service
ports:
- "8008:80"
@@ -15,11 +15,7 @@ services:
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -405,7 +405,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
image: "ghcr.io/huggingface/tgi-gaudi:2.0.1"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="codetrans codetrans-ui llm-tgi nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker images && sleep 1s
}

View File

@@ -28,7 +28,7 @@ services:
TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tei-embedding-service:
image: ghcr.io/huggingface/tei-gaudi:latest
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
container_name: tei-embedding-gaudi-server
ports:
- "8090:80"

View File

@@ -35,3 +35,9 @@ services:
dockerfile: comps/dataprep/redis/langchain/Dockerfile
extends: doc-index-retriever
image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest}
tei-gaudi:
build:
context: tei-gaudi
dockerfile: Dockerfile-hpu
extends: doc-index-retriever
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
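The tei-gaudi service above builds from a sibling checkout; the equivalent manual build (the same commands the SearchQnA README later in this compare uses) is:

```bash
git clone https://github.com/huggingface/tei-gaudi
cd tei-gaudi/
docker build --no-cache -f Dockerfile-hpu -t opea/tei-gaudi:latest .
```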

View File

@@ -19,12 +19,14 @@ function build_docker_images() {
if [ ! -d "GenAIComps" ] ; then
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
fi
if [ ! -d "tei-gaudi" ] ; then
git clone https://github.com/huggingface/tei-gaudi
fi
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull redis/redis-stack:7.2.0-v9
docker pull ghcr.io/huggingface/tei-gaudi:latest
docker images && sleep 1s
}

View File

@@ -3,7 +3,7 @@
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8008:80"
@@ -11,11 +11,8 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
volumes:
- "./data:/data"
runtime: habana

View File

@@ -6,7 +6,7 @@ opea_micro_services:
tgi-service:
host: ${TGI_SERVICE_IP}
ports: ${TGI_SERVICE_PORT}
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
volumes:
- "./data:/data"
runtime: habana
@@ -17,11 +17,7 @@ opea_micro_services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
model-id: ${LLM_MODEL_ID}
llm:
host: ${LLM_SERVICE_HOST_IP}

View File

@@ -7,9 +7,9 @@ Install GMC in your Kubernetes cluster, if you have not already done so, by foll
The DocSum application is defined as a Custom Resource (CR) file that the above GMC operator acts upon. It first checks whether the microservices listed in the CR yaml file are running; if not, it starts them and then connects them. When the DocSum RAG pipeline is ready, the service endpoint details are returned, letting you use the application. If you run the "kubectl get pods" command, you will see all the component microservices, in particular embedding, retriever, rerank, and llm.
The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image `llm-docsum-tgi:latest`, which internally leverages
the image `ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu`. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.0.5`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use `Intel/neural-chat-7b-v3-3`.
The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image llm-docsum-tgi:latest, which internally leverages
the image ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
service tgi-gaudi-svc, which uses the image ghcr.io/huggingface/tgi-gaudi:1.2.1. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use Intel/neural-chat-7b-v3-3.
[NOTE]
Refer to [Docker Xeon README](https://github.com/opea-project/GenAIExamples/blob/main/DocSum/docker_compose/intel/cpu/xeon/README.md) or
@@ -17,7 +17,7 @@ Refer to [Docker Xeon README](https://github.com/opea-project/GenAIExamples/blob
These will be available on Docker Hub soon, simplifying installation.
## Deploy the RAG pipeline
This involves deploying the application pipeline custom resource. You can use `docsum_xeon.yaml` if you have just a Xeon cluster or `docsum_gaudi.yaml` if you have a Gaudi cluster.
This involves deploying the application pipeline custom resource. You can use docsum_xeon.yaml if you have just a Xeon cluster or docsum_gaudi.yaml if you have a Gaudi cluster.
1. Setup Environment variables. These are specific to the user. Skip the proxy settings if you are not operating behind one.
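A minimal sketch of the deploy-and-verify flow described above, assuming a Xeon cluster, the example model, and the current kubectl context (the commands are illustrative, not quoted from this diff):

```bash
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
kubectl apply -f docsum_xeon.yaml   # use docsum_gaudi.yaml on a Gaudi cluster
kubectl get pods                    # the component microservices should appear here
```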

View File

@@ -405,7 +405,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
image: "ghcr.io/huggingface/tgi-gaudi:2.0.1"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="docsum docsum-ui llm-docsum-tgi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker images && sleep 1s
}

View File

@@ -3,7 +3,7 @@
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8008:80"
@@ -18,10 +18,6 @@ services:
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
PREFILL_BATCH_BUCKET_SIZE: 1
BATCH_BUCKET_SIZE: 8
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -6,7 +6,7 @@ opea_micro_services:
tgi-service:
host: ${TGI_SERVICE_IP}
ports: ${TGI_SERVICE_PORT}
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:1.2.1
volumes:
- "./data:/data"
runtime: habana
@@ -14,13 +14,10 @@ opea_micro_services:
- SYS_NICE
ipc: host
environment:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
model-id: ${LLM_MODEL_ID}
llm:
host: ${LLM_SERVICE_HOST_IP}

View File

@@ -39,15 +39,7 @@ spec:
value: "8"
- name: PORT
value: "80"
- name: ENABLE_HPU_GRAPH
value: 'true'
- name: LIMIT_HPU_GRAPH
value: 'true'
- name: USE_FLASH_ATTENTION
value: 'true'
- name: FLASH_ATTENTION_RECOMPUTE
value: 'true'
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
imagePullPolicy: IfNotPresent
securityContext:
capabilities:
@@ -80,7 +72,7 @@ spec:
volumes:
- name: model-volume
hostPath:
path: /mnt/models
path: /home/sdp/cesg
type: Directory
- name: shm
emptyDir:
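One caveat about the hostPath change above: a hostPath volume of `type: Directory` requires the path to already exist on the node, otherwise the pod stays stuck in ContainerCreating. A hedged example of preparing it:

```bash
# Run on each node that may schedule this pod (path taken from the manifest above).
sudo mkdir -p /home/sdp/cesg
```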

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="faqgen faqgen-ui llm-faqgen-tgi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:1.2.1
docker images && sleep 1s
}

View File

@@ -1,39 +1,23 @@
# Productivity Suite Application
# OPEA Productivity Suite Application
Productivity Suite, a tool designed to streamline your workflow and boost productivity! Our application leverages the power of OPEA microservices to deliver a comprehensive suite of features tailored to meet the diverse needs of modern enterprises.
OPEA Productivity Suite streamlines your workflow to boost productivity. It leverages the OPEA microservices to provide a comprehensive suite of features to cater to the diverse needs of modern enterprises.
---
## Key Features
## 🛠️ Key Features
- Chat with Documents: Engage in intelligent conversations with your documents using our advanced RAG Capabilities. Our Retrieval-Augmented Generation (RAG) model allows you to ask questions, receive relevant information, and gain insights from your documents in real-time.
### 💬 Chat with Documents
- Content Summarization: Save time and effort by automatically summarizing lengthy documents or articles, enabling you to quickly grasp the key takeaways.
Engage in intelligent conversations with your documents using our advanced **Retrieval-Augmented Generation (RAG)** capabilities. Ask questions, receive relevant information, and gain insights from your documents in real-time!
- FAQ Generation: Effortlessly create comprehensive FAQs based on your documents, ensuring that your users have access to the information they need.
### 📄 Content Summarization
- Code Generation: Boost your coding productivity with our code generation feature. Simply provide a description of the functionality you require, and the application will generate the corresponding code snippets, saving you valuable time and effort.
Summarize lengthy documents or articles, enabling you to grasp key takeaways quickly. Save time and effort with our intelligent summarization feature!
- User Context Management: Maintain a seamless workflow by managing your user's context within the application. Our context management system keeps track of your documents and chat history, allowing for personalized experiences.
### ❓ FAQ Generation
- Identity and access management: uses the open source platform Keycloak for single sign-on identity and access management.
Effortlessly create comprehensive FAQs based on your documents. Ensure your users have access to the information they need with minimal effort!
Refer to the [Keycloak Configuration Guide](./docker_compose/intel/cpu/xeon/keycloak_setup_guide.md) for instructions to set up Keycloak.
### 💻 Code Generation
Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for instructions to build Docker images from source and run the application via Docker Compose.
Boost your coding productivity by providing a description of the functionality you require. Our application generates corresponding code snippets, saving you valuable time and effort!
### 🎛️ User Context Management
Maintain a seamless workflow by managing your user's context within the application. Our context management system keeps track of documents and chat history for a personalized experience.
### 🔐 Identity and Access Management
Utilizes the open-source platform **Keycloak** for single sign-on identity and access management. This ensures secure and convenient access to your productivity tools.
---
## 📚 Setup Guide
- **[Keycloak Configuration Guide](./docker_compose/intel/cpu/xeon/keycloak_setup_guide.md)**: Instructions to set up Keycloak for identity and access management.
- **[Xeon Guide](./docker_compose/intel/cpu/xeon/README.md)**: Instructions to build Docker images from source and run the application via Docker Compose.
- **[Xeon Kubernetes Guide](./kubernetes/intel/README.md)**: Instructions to deploy the application via Kubernetes.
Refer to the [Xeon Kubernetes Guide](./kubernetes/intel/README.md) for instructions to deploy the application via Kubernetes.

View File

@@ -2,9 +2,7 @@
This document outlines the deployment process for OPEA Productivity Suite utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server and [GenAIExamples](https://github.com/opea-project/GenAIExamples.git) solutions. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
---
## 🐳 Build Docker Images
## 🚀 Build Docker Images
First of all, you need to build the Docker images locally and install the required Python package.
@@ -40,12 +38,15 @@ docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_pr
```bash
docker build --no-cache -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain/Dockerfile .
```
### 6. Build Prompt Registry Image
```bash
docker build -t opea/promptregistry-mongo-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/mongo/Dockerfile .
```
### 7. Build Chat History Image
@@ -99,8 +100,6 @@ cd GenAIExamples/ProductivitySuite/ui
docker build --no-cache -t opea/productivity-suite-react-ui-server:latest -f docker/Dockerfile.react .
```
---
## 🚀 Start Microservices
### Setup Environment Variables
@@ -185,19 +184,17 @@ Note: Please replace with `host_ip` with you external IP address, do not use loc
```bash
cd GenAIExamples/ProductivitySuite/docker_compose/intel/cpu/xeon
```
```bash
docker compose -f compose.yaml up -d
```
---
### Setup Keycloak
### 🔐 Setup Keycloak
Please refer to [keycloak_setup_guide](keycloak_setup_guide.md) for more detail related to Keycloak configuration setup.
Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more detail related to Keycloak configuration setup.
---
### ✅ Validate Microservices
### Validate Microservices
1. TEI Embedding Service
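The validation command itself is cut off in this view; as a hedged example, the TEI embedding check in comparable OPEA Xeon guides looks like the following, where the 6006 host port is an assumption:

```bash
curl http://${host_ip}:6006/embed \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs":"What is Deep Learning?"}'
```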
@@ -477,8 +474,6 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
"user": "test", "id":"{Conversation id to Delete}"}'
```
---
## 🚀 Launch the UI
To access the frontend, open the following URL in your browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:
@@ -495,60 +490,57 @@ Here is an example of running Productivity Suite
![project-screenshot](../../../../assets/img/chat_qna_init.png)
![project-screenshot](../../../../assets/img/Login_page.png)
---
## 🛠️ Key Features
## 🧐 Features
Here are some of the project's features:
### 💬ChatQnA
### CHAT QNA
- **Start a Text Chat**: Initiate a text chat with the ability to input written conversations, where the dialogue content can also be customized based on uploaded files.
- **Context Awareness**: The AI assistant maintains the context of the conversation, understanding references to previous statements or questions. This allows for more natural and coherent exchanges.
- Start a Text Chat: Initiate a text chat with the ability to input written conversations, where the dialogue content can also be customized based on uploaded files.
- Context Awareness: The AI assistant maintains the context of the conversation, understanding references to previous statements or questions. This allows for more natural and coherent exchanges.
### 🎛️ Data Source
### DATA SOURCE
- **File Upload or Remote Link**: The choice between uploading locally or copying a remote link. Chat according to uploaded knowledge base.
- **File Management**: Uploaded files would get listed and users would be able to add or remove files/links.
- The choice between uploading locally or copying a remote link. Chat according to uploaded knowledge base.
- Uploaded files would get listed and users would be able to add or remove files/links.
#### Screenshots
#### Screen Shot
![project-screenshot](../../../../assets/img/data_source.png)
- **Clear Chat**: Clear the record of the current dialog box without retaining the contents of the dialog box.
- **Chat history**: Historical chat records can still be retained after refreshing, making it easier for users to view the context.
- **Conversational Chat**: The application maintains a history of the conversation, allowing users to review previous messages and the AI to refer back to earlier points in the dialogue when necessary.
- Clear: Clear the record of the current dialog box without retaining the contents of the dialog box.
- Chat history: Historical chat records can still be retained after refreshing, making it easier for users to view the context.
- Conversational Chat : The application maintains a history of the conversation, allowing users to review previous messages and the AI to refer back to earlier points in the dialogue when necessary.
#### Screenshots
#### Screen Shots
![project-screenshot](../../../../assets/img/chat_qna_init.png)
![project-screenshot](../../../../assets/img/chatqna_with_conversation.png)
### 💻 Codegen
### CODEGEN
- **Generate code**: generate the corresponding code based on the current user's input.
- Generate code: generate the corresponding code based on the current user's input.
#### Screenshots
Screen Shot
![project-screenshot](../../../../assets/img/codegen.png)
![project-screenshot](../../../../assets/img/codegen.png)
### DOC SUMMARY
### 📚 Document Summarization
- Summarizing Uploaded Files: Upload files from their local device, then click 'Generate Summary' to summarize the content of the uploaded file. The summary will be displayed on the 'Summary' box.
- Summarizing Text via Pasting: Paste the text to be summarized into the text box, then click 'Generate Summary' to produce a condensed summary of the content, which will be displayed in the 'Summary' box on the right.
- Scroll to Bottom: The summarized content will automatically scroll to the bottom.
- **Summarizing Uploaded Files**: Upload files from their local device, then click 'Generate Summary' to summarize the content of the uploaded file. The summary will be displayed on the 'Summary' box.
- **Summarizing Text via Pasting**: Paste the text to be summarized into the text box, then click 'Generate Summary' to produce a condensed summary of the content, which will be displayed in the 'Summary' box on the right.
- **Scroll to Bottom**: The summarized content will automatically scroll to the bottom.
#### Screenshots
#### Screen Shot
![project-screenshot](../../../../assets/img/doc_summary_paste.png)
![project-screenshot](../../../../assets/img/doc_summary_file.png)
### FAQ Generator
### FAQ Generator
- **Generate FAQs from Text via Pasting**: Paste the text into the text box, then click 'Generate FAQ' to produce a condensed FAQ of the content, which will be displayed in the 'FAQ' box below.
- Generate FAQs from Text via Pasting: Paste the text into the text box, then click 'Generate FAQ' to produce a condensed FAQ of the content, which will be displayed in the 'FAQ' box below.
- **Generate FAQs from Text via txt file Upload**: Upload the file in the Upload bar, then click 'Generate FAQ' to produce a condensed FAQ of the content, which will be displayed in the 'FAQ' box below.
- Generate FAQs from Text via txt file Upload: Upload the file in the Upload bar, then click 'Generate FAQ' to produce a condensed FAQ of the content, which will be displayed in the 'FAQ' box below.
#### Screenshots
#### Screen Shot
![project-screenshot](../../../../assets/img/faq_generator.png)

View File

@@ -1,27 +1,21 @@
# 🔐 Keycloak Configuration Setup
# Keycloak Configuration Setup
This README document provides a comprehensive, step-by-step guide on how to configure **Keycloak** settings. The user management is facilitated via Keycloak, and the configuration is outlined below:
This document shows you step-by-step how to configure Keycloak settings.
1. Access the Keycloak admin console via the url http://${host_ip}:8080 or the endpoint that is exposed from your Kubernetes cluster to configure users. Use the default username (**admin**) and password (**admin**) to log in.
The user management is done via Keycloak and the configuration steps look like this:
1. Access the Keycloak admin console via the url http://${host_ip}:8080 or the endpoint exposed from your Kubernetes cluster to configure users. Use the default username (admin) and password (admin) to log in.
![project-screenshot](../../../../assets/img/keycloak_login.png)
2. Create a new realm named **productivitysuite** within Keycloak.
![project-screenshot](../../../../assets/img/create_realm.png)
![project-screenshot](../../../../assets/img/create_productivitysuite_realm.png)
3. Create a new client called **productivitysuite** with default configurations.
![project-screenshot](../../../../assets/img/create_client.png)
4. Select the **productivitysuite** client that you just created. Insert your ProductivitySuite UI url endpoint into **"Valid redirect URIs"** and **"Web origins"** field. Refer to screenshot below as an example:
4. Select the **productivitysuite** client that was created just now. Insert your ProductivitySuite UI url endpoint into the "Valid redirect URIs" and "Web origins" fields, as in the example screenshot below:
![project-screenshot](../../../../assets/img/productivitysuite_client_settings.png)
5. From the left pane, select the Realm roles and create a new role named **user** and another new role named **viewer**.
5. From the left pane, select the Realm roles and create a new role named user and another new role named viewer.
![project-screenshot](../../../../assets/img/create_roles.png)
6. Create a new user named, for example, **mary** and another user named **bob**. Set passwords for both users (set **'Temporary'** to **'Off'**). Select **Role mapping** on the top, assign the user role to mary and assign the viewer role to bob.
6. Create a new user named, for example, mary and another user named bob. Set passwords for both users (set 'Temporary' to 'Off'). Select Role mapping on the top, assign the user role to mary and assign the viewer role to bob.
![project-screenshot](../../../../assets/img/create_users.png)
![project-screenshot](../../../../assets/img/set_user_password.png)
![project-screenshot](../../../../assets/img/user_role_mapping.png)
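As an optional smoke test after the steps above, you can request a token for one of the new users. This sketch assumes a modern Keycloak path layout (no /auth prefix) and that the productivitysuite client allows direct access grants:

```bash
curl -X POST "http://${host_ip}:8080/realms/productivitysuite/protocol/openid-connect/token" \
  -H "Content-Type: application/x-www-form-urlencoded" \
  -d "client_id=productivitysuite" \
  -d "grant_type=password" \
  -d "username=mary" \
  -d "password=<mary-password>"   # placeholder for the password set in step 6
```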

View File

@@ -1,4 +1,4 @@
# 🚀 Deploy ProductivitySuite with ReactUI
# Deploy ProductivitySuite with ReactUI
The document outlines the deployment steps for ProductivitySuite via a Kubernetes cluster while utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline components and ReactUI, a popular React-based user interface library.
@@ -16,14 +16,12 @@ In ProductivitySuite, it consists of following pipelines/examples and components
- keycloak
```
---
## ⚠️ Prerequisites for Deploying ProductivitySuite with ReactUI
## Prerequisites for Deploying ProductivitySuite with ReactUI
To begin with, ensure that you have the following prerequisites in place:
1. Kubernetes installation: Make sure that you have Kubernetes installed.
2. 🐳 Images: Make sure you have all the images ready for the examples and components stated above. You may refer to [README](../../docker_compose/intel/cpu/xeon/README.md) for steps to build the images.
3. 🔧 Configuration Values: Set the following values in all the yaml files before proceeding with the deployment:
1. Kubernetes installation: Make sure that you have Kubernetes installed.
2. Images: Make sure you have all the images ready for the examples and components stated above. You may refer to [README](../../docker_compose/intel/cpu/xeon/README.md) for steps to build the images.
3. Configuration Values: Set the following values in all the yaml files before proceeding with the deployment:
a. HUGGINGFACEHUB_API_TOKEN (Your HuggingFace token to download your desired model from HuggingFace):
```
@@ -44,26 +42,20 @@ To begin with, ensure that you have following prerequisites in place:
# Look for ENDPOINT in the yaml and insert all the url endpoint for all the required backend service.
```
4. MODEL_ID and model-volume **(OPTIONAL)**: You may also customize the "MODEL_ID" to use a different model, and model-volume for the volume to be mounted.
4. MODEL_ID and model-volume (OPTIONAL): You may also customize the "MODEL_ID" to use a different model, and model-volume for the volume to be mounted.
5. After finishing the steps above, you can proceed with the deployment of the yaml file.
---
## 🌐 Deploying ProductivitySuite
## Deploying ProductivitySuite
You can use the yaml files in the xeon folder to deploy ProductivitySuite with ReactUI.
```
cd GenAIExamples/ProductivitySuite/kubernetes/intel/cpu/xeon/manifests/
kubectl apply -f *.yaml
```
---
## User Management via Keycloak Configuration
Please refer to [keycloak_setup_guide](../../docker_compose/intel/cpu/xeon/keycloak_setup_guide.md) for more detail related to Keycloak configuration setup.
## 🔐 User Management via Keycloak Configuration
Please refer to **[keycloak_setup_guide](../../docker_compose/intel/cpu/xeon/keycloak_setup_guide.md)** for more detail related to Keycloak configuration setup.
---
## ✅ Verify Services
## Verify Services
To verify the installation, run the command 'kubectl get pod' to make sure all pods are running.
To view all the available services, run the command 'kubectl get svc' to obtain the ports that need to be used as backend service endpoints in productivity_suite_reactui.yaml.
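For example (the last command shows how to pull one service's port programmatically; the service name is illustrative, not taken from this manifest set):

```bash
kubectl get pod                       # all pods should be Running
kubectl get svc                       # ports to plug into productivity_suite_reactui.yaml
kubectl get svc chatqna -o jsonpath='{.spec.ports[0].port}'
```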

View File

@@ -993,7 +993,7 @@ spec:
name: chatqna-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu"
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -229,7 +229,7 @@ spec:
name: codegen-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu"
image: "ghcr.io/huggingface/text-generation-inference:1.4"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -229,7 +229,7 @@ spec:
name: docsum-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu"
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -138,7 +138,7 @@ spec:
- configMapRef:
name: faqgen-tgi-config
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu"
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -32,7 +32,18 @@ docker build --no-cache -t opea/reranking-tei:latest --build-arg https_proxy=$ht
docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile .
```
### 5. Build MegaService Docker Image
### 5. Build TEI Gaudi Image
Since a TEI Gaudi Docker image hasn't been published, we'll need to build it from the [tei-gaudi](https://github.com/huggingface/tei-gaudi) repository.
```bash
git clone https://github.com/huggingface/tei-gaudi
cd tei-gaudi/
docker build --no-cache -f Dockerfile-hpu -t opea/tei-gaudi:latest .
cd ../..
```
### 6. Build MegaService Docker Image
To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `searchqna.py` Python script. Build the MegaService Docker image using the command below:
@@ -51,11 +62,12 @@ docker build --no-cache -t opea/searchqna:latest --build-arg https_proxy=$https_
Then run the command `docker images`, and you will have
1. `opea/embedding-tei:latest`
2. `opea/web-retriever-chroma:latest`
3. `opea/reranking-tei:latest`
4. `opea/llm-tgi:latest`
5. `opea/searchqna:latest`
1. `opea/tei-gaudi:latest`
2. `opea/embedding-tei:latest`
3. `opea/web-retriever-chroma:latest`
4. `opea/reranking-tei:latest`
5. `opea/llm-tgi:latest`
6. `opea/searchqna:latest`
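A convenience sketch (not part of the README) to confirm all six images landed locally:

```bash
for img in opea/tei-gaudi opea/embedding-tei opea/web-retriever-chroma \
           opea/reranking-tei opea/llm-tgi opea/searchqna; do
  if [ -n "$(docker images -q ${img}:latest)" ]; then
    echo "ok       ${img}:latest"
  else
    echo "missing  ${img}:latest"
  fi
done
```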
## 🚀 Set the environment variables

View File

@@ -3,7 +3,7 @@
services:
tei-embedding-service:
image: ghcr.io/huggingface/tei-gaudi:latest
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}
container_name: tei-embedding-gaudi-server
ports:
- "3001:80"
@@ -80,7 +80,7 @@ services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "3006:80"
@@ -90,15 +90,11 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -41,3 +41,9 @@ services:
dockerfile: comps/llms/text-generation/tgi/Dockerfile
extends: searchqna
image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
tei-gaudi:
build:
context: tei-gaudi
dockerfile: Dockerfile-hpu
extends: searchqna
image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest}

View File

@@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
git clone https://github.com/huggingface/tei-gaudi
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="searchqna searchqna-ui embedding-tei web-retriever-chroma reranking-tei llm-tgi"
service_list="searchqna searchqna-ui embedding-tei web-retriever-chroma reranking-tei llm-tgi tei-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tei-gaudi:latest
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker images && sleep 1s
}

View File

@@ -3,23 +3,18 @@
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8008:80"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE
@@ -36,7 +31,6 @@ services:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
@@ -53,7 +47,6 @@ services:
ports:
- "8888:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
@@ -68,7 +61,6 @@ services:
ports:
- "5173:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- BASE_URL=${BACKEND_SERVICE_ENDPOINT}

View File

@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="translation translation-ui llm-tgi nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker images && sleep 1s
}
@@ -166,7 +166,7 @@ function main() {
validate_microservices
validate_megaservice
#validate_frontend
validate_frontend
stop_docker
echo y | docker system prune

View File

@@ -6,7 +6,7 @@ opea_micro_services:
tgi-service:
host: ${TGI_SERVICE_IP}
ports: ${TGI_SERVICE_PORT}
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
volumes:
- "./data:/data"
runtime: habana
@@ -14,17 +14,10 @@ opea_micro_services:
- SYS_NICE
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
model-id: ${LLM_MODEL_ID}
llm:
host: ${LLM_SERVICE_HOST_IP}
@@ -32,9 +25,6 @@ opea_micro_services:
image: opea/llm-tgi:latest
endpoint: /v1/chat/completions
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ui:

View File

@@ -3,7 +3,7 @@
services:
llava-tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.4
container_name: tgi-llava-gaudi-server
ports:
- "8399:80"
@@ -17,11 +17,6 @@ services:
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE

View File

@@ -216,7 +216,7 @@ spec:
name: visualqna-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu"
image: "ghcr.io/huggingface/text-generation-inference:2.2.0"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data

View File

@@ -21,7 +21,7 @@ function build_docker_images() {
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.4
docker images && sleep 1s
}

View File

@@ -21,7 +21,7 @@ function build_docker_images() {
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu
docker pull ghcr.io/huggingface/text-generation-inference:2.2.0
docker images && sleep 1s
}

View File

@@ -1,2 +1 @@
GUARDRAIL_BASE_URL = 'http://backend_address:9399/v1/lvm'
BACKEND_BASE_URL = 'http://backend_address:9499/v1/lvm'
BACKEND_BASE_URL = '/v1/visualqna'

View File

@@ -17,7 +17,6 @@
<script lang="ts">
import MessageAvatar from "$lib/modules/chat/MessageAvatar.svelte";
import type { Message } from "$lib/shared/constant/Interface";
import { Alert } from "flowbite-svelte";
import MessageTimer from "./MessageTimer.svelte";
import { createEventDispatcher } from "svelte";
@@ -32,42 +31,26 @@
class={msg.role === 0
? "flex w-full gap-3"
: "flex w-full items-center gap-3"}
data-testid={msg.role === 0 ? "display-answer" : "display-question"}
data-testid={msg.role === 0
? "display-answer"
: "display-question"}
>
<div
class={msg.role === 0
? "flex aspect-square w-[3px] items-center justify-center rounded bg-[#0597ff] max-sm:hidden"
: "flex aspect-square h-10 w-[3px] items-center justify-center rounded bg-[#acacac] max-sm:hidden mb-4"}
: "flex aspect-square h-10 w-[3px] items-center justify-center rounded bg-[#000] max-sm:hidden"}
>
<MessageAvatar role={msg.role} />
</div>
<div
class={msg.role === 0
? "group relative flex items-start border-b-4 border-gray-200 pb-2"
: "group relative flex items-start"}
>
<div class="group relative flex items-start">
<div class="flex flex-col items-start">
{#if msg.imgSrc}
<img
src={msg.imgSrc}
alt="Uploaded Image"
class="max-w-28 m-2 max-h-28"
/>
{/if}
<img src={msg.imgSrc} alt="Uploaded Image" class="m-2 max-w-28 max-h-28" />
{#if msg.content === "unsafe"}
<Alert color="red">
<span class="font-medium">Danger alert! </span>
<span>The uploaded image/question contains potential security risks.</span>
</Alert>
{:else}
<p
class="max-w-[60vw] items-start whitespace-pre-line break-keep leading-6 sm:max-w-[50rem] xl:max-w-[65vw]"
class="xl:max-w-[65vw] max-w-[60vw] items-start whitespace-pre-line break-keep text-[0.8rem] leading-5 sm:max-w-[50rem]"
>
{@html msg.content}
</p>
{/if}
</div>
</div>
</div>

View File

@@ -13,17 +13,17 @@
let images = [
{
id: 0,
id: 1,
alt: 'Waterview',
imgurl: waterview,
prompt: 'What are the things I should be cautious about when I visit here?'
},
{
id: 1,
id: 0,
alt: 'Extreme Ironing',
imgurl: extreme_ironing,
prompt: 'What is unusual about this image?'
},
}
];
let currentIndex = 0;
@@ -37,14 +37,12 @@
}
async function handleImageClick() {
async function handleImageClick() {
const imgUrl = images[currentIndex].imgurl;
const base64Data = await convertImageToBase64(imgUrl);
const currentPrompt = images[currentIndex].prompt;
base64ImageStore.set(base64Data);
dispatch("imagePrompt", { content: currentPrompt });
base64ImageStore.set(base64Data);
}
async function convertImageToBase64(url) {

View File

@@ -10,29 +10,23 @@
import { Range } from "flowbite-svelte";
import { FilePasteSolid } from "flowbite-svelte-icons";
import { stepValueStore } from "$lib/shared/stores/common/Store";
let stepValue = 128;
let stepValue = 512;
let imageUrl = '';
$: stepValueStore.set(stepValue);
</script>
<div class="flex w-full flex-col gap-3 rounded-xl bg-white p-5">
<p>Upload Images</p>
<UploadImg imageUrl={imageUrl}/>
<Hr classHr="my-8 w-64">or</Hr>
<div class="gap-1">
<div class="">
<Label for="input-group-1" class="block mb-2">Import from URL</Label>
<Input type="text" placeholder="" bind:value={imageUrl}>
<FilePasteSolid slot="left" class="w-5 h-5 text-gray-500 dark:text-gray-400" />
</Input>
</div>
<p class="text-[0.9rem]">Parameters</p>
<Range id="range-steps" min="0" max="1024" bind:value={stepValue} step="1" />
<p class="text-xs">Max output tokens: {stepValue}</p>
</div>
</div>
<div class="mb-6">
<Label for="input-group-1" class="block mb-2">Import from URL</Label>
<Input type="text" placeholder="" bind:value={imageUrl}>
<FilePasteSolid slot="left" class="w-5 h-5 text-gray-500 dark:text-gray-400" />
</Input>
</div>
<p>Parameters</p>
<Range id="range-steps" min="0" max="1024" bind:value={stepValue} step="1" />
<p>Max output tokens: {stepValue}</p>
</div>

View File

@@ -103,6 +103,6 @@
SVG, PNG, JPG
</p>
{:else if imageUrl}
<img src={imageUrl} alt="Uploaded Image" class="m-2 mx-auto block max-h-[15.5rem]" />
<img src={imageUrl} alt="Uploaded Image" class="m-2 mx-auto block" />
{/if}
</Dropzone>

View File

@@ -13,47 +13,9 @@
// limitations under the License.
import { env } from "$env/dynamic/public";
import { SSE } from "sse.js";
const BACKEND_BASE_URL = env.BACKEND_BASE_URL;
const guardrail_BASE_URL = env.GUARDRAIL_BASE_URL;
async function fetchFunc(url, init) {
try {
const response = await fetch(url, init);
if (!response.ok) throw response.status;
return await response.json();
} catch (error) {
console.error("network error: ", error);
return undefined;
}
}
export async function fetchGuardRail(query: string, stepValueStore: number, base64ImageStore: string) {
let payload = {};
let url = "";
base64ImageStore = base64ImageStore.replace(/^data:[a-zA-Z]+\/[a-zA-Z]+;base64,/, "");
payload = {
image: base64ImageStore,
prompt: query,
max_new_tokens: 1,
stream: false,
};
url = `${guardrail_BASE_URL}`;
const init: RequestInit = {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(payload),
};
return fetchFunc(url, init);
}
export async function fetchTextStream(query: string, stepValueStore: number, base64ImageStore: string) {
let payload = {};
@@ -61,19 +23,30 @@ export async function fetchTextStream(query: string, stepValueStore: number, bas
base64ImageStore = base64ImageStore.replace(/^data:[a-zA-Z]+\/[a-zA-Z]+;base64,/, "");
payload = {
image: base64ImageStore,
prompt: query,
max_new_tokens: stepValueStore,
messages: [
{
role: "user",
content: [
{
type: "text",
text: query,
},
{
type: "image_url",
image_url: { url: base64ImageStore },
},
],
},
],
max_tokens: stepValueStore,
stream: true,
};
console.log("payload", payload);
url = `${BACKEND_BASE_URL}`;
const init: RequestInit = {
method: "POST",
return new SSE(url, {
headers: { "Content-Type": "application/json" },
body: JSON.stringify(payload),
};
return fetchFunc(url, init);
payload: JSON.stringify(payload),
});
}
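The rewritten fetchTextStream above streams an OpenAI-style chat payload over SSE, so the same request can be exercised from the shell. This curl is a sketch: the 8888 port and the megaservice route are assumptions inferred from the `/v1/visualqna` BACKEND_BASE_URL change earlier in this diff:

```bash
curl -N http://${host_ip}:8888/v1/visualqna \
  -H 'Content-Type: application/json' \
  -d '{
        "messages": [{
          "role": "user",
          "content": [
            {"type": "text", "text": "What is unusual about this image?"},
            {"type": "image_url", "image_url": {"url": "<base64-image>"}}
          ]
        }],
        "max_tokens": 512,
        "stream": true
      }'
```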

View File

@@ -25,7 +25,7 @@
>
<div class="mx-auto flex flex-wrap justify-end items-center w-full">
<span
class="self-center py-2 whitespace-nowrap text-[2rem] font-semibold text-white ml-4"
class="self-center py-2 whitespace-nowrap text-2xl font-semibold text-white ml-4"
data-svelte-h="svelte-1hbktnk">VisualQnA</span
>
</div>

View File

@@ -42,4 +42,4 @@ export const knowledgeName = writable("");
export const base64ImageStore = writable("");
export const stepValueStore = writable(256);
export const stepValueStore = writable(512);

View File

@@ -33,7 +33,7 @@
scrollToBottom,
scrollToTop,
} from "$lib/shared/Utils";
import { fetchGuardRail, fetchTextStream } from "$lib/network/chat/Network";
import { fetchTextStream } from "$lib/network/chat/Network";
import LoadingAnimation from "$lib/shared/components/loading/Loading.svelte";
import "driver.js/dist/driver.css";
import "$lib/assets/layout/css/driver.css";
@@ -42,14 +42,12 @@
import ChatMessage from "$lib/modules/chat/ChatMessage.svelte";
import Upload from "$lib/modules/upload/upload.svelte";
import ImagePrompt from "$lib/modules/upload/imagePrompt.svelte";
import { Toast } from "flowbite-svelte";
import { ExclamationCircleSolid, FireOutline } from "flowbite-svelte-icons";
let query: string = "";
let loading: boolean = false;
let scrollToDiv: HTMLDivElement;
let chatMessages: Message[] = data.chatMsg ? data.chatMsg : [];
let showToast = false;
console.log("chatMessages", chatMessages);
onMount(async () => {
scrollToDiv = document
@@ -83,54 +81,62 @@
}
const callTextStream = async (query: string) => {
const res = await fetchGuardRail(query, $stepValueStore, $base64ImageStore);
const lastSegment = res.text.split("[/INST]").pop().trim();
const eventSource = await fetchTextStream(
query,
$stepValueStore,
$base64ImageStore
);
if (lastSegment === "unsafe") {
loading = false;
eventSource.addEventListener("message", (e: any) => {
let Msg = e.data;
if (Msg.startsWith("b")) {
let trimmedData = Msg.slice(2, -1);
showToast = true;
setTimeout(() => {
showToast = false;
}, 3000);
if (/\\x[\dA-Fa-f]{2}/.test(trimmedData)) {
trimmedData = decodeEscapedBytes(trimmedData);
} else if (/\\u[\dA-Fa-f]{4}/.test(trimmedData)) {
trimmedData = decodeUnicode(trimmedData);
}
chatMessages = [
...chatMessages,
{
role: MessageRole.Assistant,
type: MessageType.Text,
content: "unsafe",
time: getCurrentTimeStamp(),
imgSrc: null, // Add the imgSrc property here
},
];
if (trimmedData !== "</s>") {
trimmedData = trimmedData.replace(/\\n/g, "\n");
}
if (chatMessages[chatMessages.length - 1].role == MessageRole.User) {
chatMessages = [
...chatMessages,
{
role: MessageRole.Assistant,
type: MessageType.Text,
content: trimmedData,
time: getCurrentTimeStamp(),
imgSrc: null, // Add the imgSrc property here
},
];
console.log("? chatMessages", chatMessages);
} else {
let content = chatMessages[chatMessages.length - 1].content as string;
chatMessages[chatMessages.length - 1].content = content + trimmedData;
}
scrollToBottom(scrollToDiv);
} else if (Msg === "[DONE]") {
let startTime = chatMessages[chatMessages.length - 1].time;
return;
} else {
const chatRes = await fetchTextStream(
query,
$stepValueStore,
$base64ImageStore
);
if (chatRes.text) {
loading = false;
chatMessages = [
...chatMessages,
{
role: MessageRole.Assistant,
type: MessageType.Text,
content: chatRes.text,
time: getCurrentTimeStamp(),
imgSrc: null, // Add the imgSrc property here
},
];
let totalTime = parseFloat(
((getCurrentTimeStamp() - startTime) / 1000).toFixed(2)
);
if (chatMessages.length - 1 !== -1) {
chatMessages[chatMessages.length - 1].time = totalTime;
}
storeMessages();
}
}
});
eventSource.stream();
};
const handleTextSubmit = async () => {
loading = true;
console.log("handleTextSubmit", $base64ImageStore);
const newMessage = {
role: MessageRole.User,
@@ -165,26 +171,13 @@
<Header />
<div class="h-full gap-5 bg-white sm:flex sm:pb-2 lg:rounded-tl-3xl">
<div class="w-1/4 bg-gray-200 p-4 pt-[0.08rem]">
<ImagePrompt on:imagePrompt={handleUpdateQuery} />
<div class="w-1/5 bg-gray-200 p-4">
<Upload />
<ImagePrompt on:imagePrompt={handleUpdateQuery} />
</div>
<div class="flex-1 p-4">
{#if showToast}
<div class="fixed right-0 top-0 z-50 mr-4 mt-4">
<Toast color="red" class="w-[50rem]">
<svelte:fragment slot="icon">
<ExclamationCircleSolid class="h-5 w-5" />
<span class="sr-only">Warning icon</span>
</svelte:fragment>
The uploaded image/question contains potential security risks.
</Toast>
</div>
{/if}
<div class="flex-1 bg-gray-100 p-4">
<div
class="mx-auto flex h-[93%] w-full flex-col bg-white pb-4 sm:mt-0 sm:w-[95%]"
class="mx-auto flex h-full w-full flex-col bg-white px-10 sm:mt-0 sm:w-[80%]"
>
<div
class="fixed relative flex w-full flex-col items-center justify-between bg-white p-2 pb-0"
@@ -192,7 +185,7 @@
<div class="relative my-4 flex w-full flex-row justify-center">
<div class="relative w-full focus:border-none">
<input
class="block w-full border-0 border-b-2 border-gray-300 px-1 py-4
class="text-md block w-full border-0 border-b-2 border-gray-300 px-1 py-4
text-gray-900 focus:border-gray-300 focus:ring-0 dark:border-gray-600 dark:bg-gray-700 dark:text-white dark:placeholder-gray-400 dark:focus:border-blue-500 dark:focus:ring-blue-500"
type="text"
data-testid="chat-input"
@@ -232,8 +225,8 @@
><svg
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 20 20"
width="20"
height="20"
width="24"
height="24"
class="fill-[#0597ff] group-hover:fill-[#0597ff]"
><path
d="M12.6 12 10 9.4 7.4 12 6 10.6 8.6 8 6 5.4 7.4 4 10 6.6 12.6 4 14 5.4 11.4 8l2.6 2.6zm7.4 8V2q0-.824-.587-1.412A1.93 1.93 0 0 0 18 0H2Q1.176 0 .588.588A1.93 1.93 0 0 0 0 2v12q0 .825.588 1.412Q1.175 16 2 16h14zm-3.15-6H2V2h16v13.125z"
@@ -247,9 +240,11 @@
<div class="mx-auto flex h-full w-full flex-col">
<!-- Loading text -->
{#if loading}
<LoadingAnimation />
{/if}
<Scrollbar
classLayout="flex flex-col gap-2 mr-4"
classLayout="flex flex-col gap-1 mr-4"
className="chat-scrollbar h-0 w-full grow px-2 pt-2 mt-3 mr-5"
>
{#each chatMessages as message, i}
@@ -262,9 +257,7 @@
/>
{/each}
</Scrollbar>
{#if loading}
<LoadingAnimation />
{/if}
</div>
<!-- gallery -->
</div>

View File

@@ -17,10 +17,7 @@ import type { UserConfig } from "vite";
const config: UserConfig = {
plugins: [sveltekit()],
server: {
host: '0.0.0.0',
port: 5173,
},
server: {},
};
export default config;
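With the explicit host/port block removed above, the same binding can still be supplied at run time through Vite's CLI flags, e.g.:

```bash
npm run dev -- --host 0.0.0.0 --port 5173
```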

Some files were not shown because too many files have changed in this diff.