From 3fb60608b30a71e6d08dae499f38a7b6a468bc3f Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Mon, 23 Sep 2024 17:52:56 +0800 Subject: [PATCH] Use official tei gaudi image and update tgi gaudi version (#810) Signed-off-by: lvliang-intel Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/_example-workflow.yml | 4 - .../intel/hpu/gaudi/compose.yaml | 8 +- .../intel/hpu/gaudi/compose.yaml | 8 +- AudioQnA/kubernetes/intel/README_gmc.md | 2 +- .../intel/hpu/gaudi/manifest/audioqna.yaml | 12 +- AudioQnA/tests/test_compose_on_gaudi.sh | 2 +- AudioQnA/tests/test_compose_on_xeon.sh | 2 +- .../oob_four_gaudi_with_rerank.yaml | 653 +++++++++++++++ .../oob_single_gaudi_with_rerank.yaml | 653 +++++++++++++++ .../two_gaudi/oob_two_gaudi_with_rerank.yaml | 653 +++++++++++++++ .../oob_four_gaudi_without_rerank.yaml | 742 ++++++++++++++++++ .../oob_single_gaudi_without_rerank.yaml | 591 ++++++++++++++ .../oob_two_gaudi_without_rerank.yaml | 591 ++++++++++++++ .../tuned_four_gaudi_with_rerank.yaml | 683 ++++++++++++++++ .../tuned_single_gaudi_with_rerank.yaml | 683 ++++++++++++++++ .../tuned_two_gaudi_with_rerank.yaml | 683 ++++++++++++++++ .../tuned_four_gaudi_without_rerank.yaml | 622 +++++++++++++++ .../tuned_single_gaudi_without_rerank.yaml | 622 +++++++++++++++ .../tuned_two_gaudi_without_rerank.yaml | 622 +++++++++++++++ ChatQnA/chatqna.yaml | 11 +- .../intel/cpu/xeon/compose_qdrant.yaml | 2 +- .../intel/cpu/xeon/compose_vllm.yaml | 4 +- .../intel/hpu/gaudi/compose.yaml | 10 +- .../intel/hpu/gaudi/compose_guardrails.yaml | 18 +- .../intel/hpu/gaudi/compose_no_wrapper.yaml | 10 +- .../intel/hpu/gaudi/compose_vllm.yaml | 4 +- .../intel/hpu/gaudi/compose_vllm_ray.yaml | 4 +- .../hpu/gaudi/compose_without_rerank.yaml | 10 +- .../hpu/gaudi/how_to_validate_service.md | 20 +- ChatQnA/docker_image_build/build.yaml | 6 - ChatQnA/kubernetes/intel/README_gmc.md | 4 +- .../gaudi/manifest/chatqna-guardrails.yaml | 4 +- 
.../intel/hpu/gaudi/manifest/chatqna.yaml | 2 +- .../tests/test_compose_guardrails_on_gaudi.sh | 6 +- .../tests/test_compose_no_wrapper_on_gaudi.sh | 6 +- .../tests/test_compose_no_wrapper_on_xeon.sh | 2 +- ChatQnA/tests/test_compose_on_gaudi.sh | 6 +- ChatQnA/tests/test_compose_on_xeon.sh | 2 +- ChatQnA/tests/test_compose_vllm_on_gaudi.sh | 6 +- ChatQnA/tests/test_compose_vllm_on_xeon.sh | 2 +- .../tests/test_compose_vllm_ray_on_gaudi.sh | 6 +- .../test_compose_without_rerank_on_gaudi.sh | 6 +- .../test_compose_without_rerank_on_xeon.sh | 2 +- CodeGen/codegen.yaml | 8 +- .../intel/hpu/gaudi/compose.yaml | 8 +- .../intel/hpu/gaudi/manifest/codegen.yaml | 2 +- CodeGen/tests/test_compose_on_gaudi.sh | 2 +- CodeTrans/codetrans.yaml | 8 +- .../intel/hpu/gaudi/compose.yaml | 8 +- .../intel/hpu/gaudi/manifest/codetrans.yaml | 2 +- CodeTrans/tests/test_compose_on_gaudi.sh | 2 +- .../intel/hpu/gaudi/compose.yaml | 2 +- .../docker_image_build/build.yaml | 6 - .../tests/test_compose_on_gaudi.sh | 4 +- .../intel/hpu/gaudi/compose.yaml | 9 +- DocSum/docsum.yaml | 8 +- DocSum/kubernetes/intel/README_gmc.md | 2 +- .../intel/hpu/gaudi/manifest/docsum.yaml | 2 +- DocSum/tests/test_compose_on_gaudi.sh | 2 +- .../intel/hpu/gaudi/compose.yaml | 6 +- FaqGen/faqgen.yaml | 9 +- .../intel/hpu/gaudi/manifest/faqgen.yaml | 12 +- FaqGen/tests/test_compose_on_gaudi.sh | 2 +- .../docker_compose/intel/hpu/gaudi/README.md | 24 +- .../intel/hpu/gaudi/compose.yaml | 10 +- SearchQnA/docker_image_build/build.yaml | 6 - SearchQnA/tests/test_compose_on_gaudi.sh | 6 +- .../intel/hpu/gaudi/compose.yaml | 12 +- Translation/tests/test_compose_on_gaudi.sh | 4 +- Translation/translation.yaml | 14 +- .../intel/hpu/gaudi/compose.yaml | 7 +- VisualQnA/tests/test_compose_on_gaudi.sh | 2 +- 72 files changed, 8024 insertions(+), 154 deletions(-) create mode 100644 ChatQnA/benchmark/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml create mode 100644 
ChatQnA/benchmark/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml create mode 100644 ChatQnA/benchmark/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml create mode 100644 ChatQnA/benchmark/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml create mode 100644 ChatQnA/benchmark/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml create mode 100644 ChatQnA/benchmark/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml create mode 100644 ChatQnA/benchmark/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml create mode 100644 ChatQnA/benchmark/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml create mode 100644 ChatQnA/benchmark/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml create mode 100644 ChatQnA/benchmark/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml create mode 100644 ChatQnA/benchmark/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml create mode 100644 ChatQnA/benchmark/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml index e2b946d75..adf17fa53 100644 --- a/.github/workflows/_example-workflow.yml +++ b/.github/workflows/_example-workflow.yml @@ -64,10 +64,6 @@ jobs: run: | cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml - if [[ $(grep -c "tei-gaudi:" ${docker_compose_path}) != 0 ]]; then - git clone https://github.com/huggingface/tei-gaudi.git - cd tei-gaudi && git rev-parse HEAD && cd ../ - fi if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then git clone https://github.com/vllm-project/vllm.git cd vllm && git rev-parse HEAD && cd ../ diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml index 
a95a9e58c..5200f757e 100644 --- a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: tgi-server: - image: ghcr.io/huggingface/tgi-gaudi:2.0.4 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-server ports: - "8085:80" @@ -13,12 +13,16 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none PT_HPU_ENABLE_LAZY_COLLECTIVES: true + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml index c781bcb14..c3f885fce 100644 --- a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -51,7 +51,7 @@ services: environment: TTS_ENDPOINT: ${TTS_ENDPOINT} tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "3006:80" @@ -61,11 +61,15 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/AudioQnA/kubernetes/intel/README_gmc.md b/AudioQnA/kubernetes/intel/README_gmc.md index 615d9d2e6..432282259 100644 --- a/AudioQnA/kubernetes/intel/README_gmc.md 
+++ b/AudioQnA/kubernetes/intel/README_gmc.md @@ -25,7 +25,7 @@ The AudioQnA uses the below prebuilt images if you choose a Xeon deployment Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services. For Gaudi: -- tgi-service: ghcr.io/huggingface/tgi-gaudi:1.2.1 +- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.5 - whisper-gaudi: opea/whisper-gaudi:latest - speecht5-gaudi: opea/speecht5-gaudi:latest diff --git a/AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml b/AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml index db5dd21b1..2d0c567e3 100644 --- a/AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml +++ b/AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml @@ -271,7 +271,7 @@ spec: - envFrom: - configMapRef: name: audio-qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 name: llm-dependency-deploy-demo securityContext: capabilities: @@ -303,6 +303,14 @@ spec: value: none - name: PT_HPU_ENABLE_LAZY_COLLECTIVES value: 'true' + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' - name: runtime value: habana - name: HABANA_VISIBLE_DEVICES @@ -315,7 +323,7 @@ spec: volumes: - name: model-volume hostPath: - path: /home/sdp/cesg + path: /mnt/models type: Directory - name: shm emptyDir: diff --git a/AudioQnA/tests/test_compose_on_gaudi.sh b/AudioQnA/tests/test_compose_on_gaudi.sh index 221888f91..69270736d 100644 --- a/AudioQnA/tests/test_compose_on_gaudi.sh +++ b/AudioQnA/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="audioqna whisper-gaudi asr llm-tgi speecht5-gaudi tts" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull 
ghcr.io/huggingface/tgi-gaudi:2.0.5 docker images && sleep 1s } diff --git a/AudioQnA/tests/test_compose_on_xeon.sh b/AudioQnA/tests/test_compose_on_xeon.sh index 998d5f362..b36b5c7de 100644 --- a/AudioQnA/tests/test_compose_on_xeon.sh +++ b/AudioQnA/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="audioqna whisper asr llm-tgi speecht5 tts" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker images && sleep 1s } diff --git a/ChatQnA/benchmark/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml new file mode 100644 index 000000000..d2199faa9 --- /dev/null +++ b/ChatQnA/benchmark/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml @@ -0,0 +1,653 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVICE_HOST_IP: embedding-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVICE_HOST_IP: llm-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVICE_HOST_IP: reranking-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + 
namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: 
embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + ports: + - containerPort: 6000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-deploy + maxSkew: 1 + 
topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-svc + namespace: default +spec: + ports: + - name: service + port: 6000 + targetPort: 6000 + selector: + app: embedding-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 31 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '2048' + - --max-total-tokens + - '4096' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - 
emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + ports: + - containerPort: 9000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-svc + namespace: default +spec: + ports: + - name: service + port: 9000 + targetPort: 9000 + selector: + app: llm-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: opea/tei-gaudi:latest + 
imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/reranking-tei:latest + imagePullPolicy: IfNotPresent + name: reranking-deploy + ports: + - containerPort: 8000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-svc + namespace: default +spec: + ports: + - name: service + port: 8000 + targetPort: 8000 + selector: + app: reranking-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + 
app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml 
b/ChatQnA/benchmark/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml new file mode 100644 index 000000000..185236bc2 --- /dev/null +++ b/ChatQnA/benchmark/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml @@ -0,0 +1,653 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVICE_HOST_IP: embedding-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVICE_HOST_IP: llm-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVICE_HOST_IP: reranking-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: 
ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: 
chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + ports: + - containerPort: 6000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-svc + namespace: default +spec: + ports: + - name: service + port: 6000 + targetPort: 6000 + selector: + app: embedding-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 7 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '2048' + - --max-total-tokens 
+ - '4096' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + ports: + - 
containerPort: 9000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-svc + namespace: default +spec: + ports: + - name: service + port: 9000 + targetPort: 9000 + selector: + app: llm-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: opea/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: 
reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/reranking-tei:latest + imagePullPolicy: IfNotPresent + name: reranking-deploy + ports: + - containerPort: 8000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-svc + namespace: default +spec: + ports: + - name: service + port: 8000 + targetPort: 8000 + selector: + app: reranking-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + 
ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml new file mode 100644 index 000000000..7889acc9a --- /dev/null +++ b/ChatQnA/benchmark/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml @@ -0,0 +1,653 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVICE_HOST_IP: embedding-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVICE_HOST_IP: llm-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVICE_HOST_IP: reranking-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc 
+ TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + 
node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + 
metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + ports: + - containerPort: 6000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-svc + namespace: default +spec: + ports: + - name: service + port: 6000 + targetPort: 6000 + selector: + app: embedding-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 15 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '2048' + - --max-total-tokens + - '4096' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + 
securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + ports: + - containerPort: 9000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-svc + namespace: default +spec: + ports: + - name: service + port: 9000 + targetPort: 9000 + selector: + app: llm-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: 
reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/reranking-tei:latest + imagePullPolicy: IfNotPresent + name: reranking-deploy + ports: + - containerPort: 8000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - 
labelSelector: + matchLabels: + app: reranking-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-svc + namespace: default +spec: + ports: + - name: service + port: 8000 + targetPort: 8000 + selector: + app: reranking-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + 
topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml new file mode 100644 index 000000000..cc89feca2 --- /dev/null +++ b/ChatQnA/benchmark/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml @@ -0,0 +1,742 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + RERANK_MODEL_ID: BAAI/bge-reranker-base + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + INDEX_NAME: rag-redis + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + EMBEDDING_SERVICE_HOST_IP: embedding-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea + LLM_SERVICE_HOST_IP: llm-svc + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + 
nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + args: null + ports: + - containerPort: 8888 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + type: NodePort + selector: + app: chatqna-backend-server-deploy + ports: + - name: service + port: 8888 + targetPort: 8888 + nodePort: 30888 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: dataprep-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + args: null + ports: + - containerPort: 6007 + - containerPort: 6008 + - containerPort: 6009 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: dataprep-svc + namespace: default +spec: + type: 
ClusterIP + selector: + app: dataprep-deploy + ports: + - name: port1 + port: 6007 + targetPort: 6007 + - name: port2 + port: 6008 + targetPort: 6008 + - name: port3 + port: 6009 + targetPort: 6009 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + name: embedding-dependency-deploy + args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-dependency-deploy + ports: + - name: service + port: 6006 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + 
matchLabels: + app: embedding-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + args: null + ports: + - containerPort: 6000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-deploy + ports: + - name: service + port: 6000 + targetPort: 6000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 32 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + name: llm-dependency-deploy-demo + securityContext: + capabilities: + add: + - SYS_NICE + args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '2048' + - --max-total-tokens + - '4096' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + serviceAccountName: default 
+ volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-dependency-deploy + ports: + - name: service + port: 9009 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: llm-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + args: null + ports: + - containerPort: 9000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-deploy + ports: + - name: service + port: 9000 + targetPort: 9000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + 
app: reranking-dependency-deploy + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:latest + name: reranking-dependency-deploy + args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: reranking-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: reranking-dependency-deploy + ports: + - name: service + port: 8808 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: reranking-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/reranking-tei:latest + imagePullPolicy: IfNotPresent + name: reranking-deploy + args: null + ports: + - containerPort: 8000 + serviceAccountName: default +--- +kind: Service 
+apiVersion: v1 +metadata: + name: reranking-svc + namespace: default +spec: + type: ClusterIP + selector: + app: reranking-deploy + ports: + - name: service + port: 8000 + targetPort: 8000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: retriever-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_EMBEDDING_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: HUGGINGFACEHUB_API_TOKEN + valueFrom: + configMapKeyRef: + name: qna-config + key: HUGGINGFACEHUB_API_TOKEN + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + args: null + ports: + - containerPort: 7000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: retriever-svc + namespace: default +spec: + type: ClusterIP + selector: + app: retriever-deploy + ports: + - name: service + port: 7000 + targetPort: 7000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + labels: + app: vector-db + spec: + nodeSelector: + node-type: chatqna-opea + 
topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: vector-db + containers: + - name: vector-db + image: redis/redis-stack:7.2.0-v9 + ports: + - containerPort: 6379 + - containerPort: 8001 +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + type: ClusterIP + selector: + app: vector-db + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + + +--- diff --git a/ChatQnA/benchmark/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml new file mode 100644 index 000000000..bc95406f4 --- /dev/null +++ b/ChatQnA/benchmark/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml @@ -0,0 +1,591 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + RERANK_MODEL_ID: BAAI/bge-reranker-base + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + INDEX_NAME: rag-redis + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + EMBEDDING_SERVICE_HOST_IP: embedding-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea + LLM_SERVICE_HOST_IP: llm-svc + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: 
default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + args: null + ports: + - containerPort: 8888 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + type: NodePort + selector: + app: chatqna-backend-server-deploy + ports: + - name: service + port: 8888 + targetPort: 8888 + nodePort: 30888 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: dataprep-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: 
dataprep-deploy + args: null + ports: + - containerPort: 6007 + - containerPort: 6008 + - containerPort: 6009 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: dataprep-svc + namespace: default +spec: + type: ClusterIP + selector: + app: dataprep-deploy + ports: + - name: port1 + port: 6007 + targetPort: 6007 + - name: port2 + port: 6008 + targetPort: 6008 + - name: port3 + port: 6009 + targetPort: 6009 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + name: embedding-dependency-deploy + args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-dependency-deploy + ports: + - name: service + port: 6006 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + 
sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: embedding-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + args: null + ports: + - containerPort: 6000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-deploy + ports: + - name: service + port: 6000 + targetPort: 6000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + name: llm-dependency-deploy-demo + securityContext: + capabilities: + add: + - SYS_NICE + args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '2048' + - --max-total-tokens + - '4096' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all 
+ - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-dependency-deploy + ports: + - name: service + port: 9009 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: llm-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + args: null + ports: + - containerPort: 9000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-deploy + ports: + - name: service + port: 9000 + targetPort: 9000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + 
labels: + app: retriever-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: retriever-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_EMBEDDING_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: HUGGINGFACEHUB_API_TOKEN + valueFrom: + configMapKeyRef: + name: qna-config + key: HUGGINGFACEHUB_API_TOKEN + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + args: null + ports: + - containerPort: 7000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: retriever-svc + namespace: default +spec: + type: ClusterIP + selector: + app: retriever-deploy + ports: + - name: service + port: 7000 + targetPort: 7000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + labels: + app: vector-db + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: vector-db + containers: + - name: vector-db + image: redis/redis-stack:7.2.0-v9 + ports: + - containerPort: 6379 + - containerPort: 8001 +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + type: ClusterIP + selector: + app: vector-db + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + 
+ +--- diff --git a/ChatQnA/benchmark/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml new file mode 100644 index 000000000..53392de65 --- /dev/null +++ b/ChatQnA/benchmark/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml @@ -0,0 +1,591 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + RERANK_MODEL_ID: BAAI/bge-reranker-base + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + INDEX_NAME: rag-redis + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + EMBEDDING_SERVICE_HOST_IP: embedding-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea + LLM_SERVICE_HOST_IP: llm-svc + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: 
opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + args: null + ports: + - containerPort: 8888 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + type: NodePort + selector: + app: chatqna-backend-server-deploy + ports: + - name: service + port: 8888 + targetPort: 8888 + nodePort: 30888 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: dataprep-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + args: null + ports: + - containerPort: 6007 + - containerPort: 6008 + - containerPort: 6009 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: dataprep-svc + namespace: default +spec: + type: ClusterIP + selector: + app: dataprep-deploy + ports: + - name: port1 + port: 6007 + targetPort: 6007 + - name: port2 + port: 6008 + targetPort: 6008 + - name: port3 + port: 6009 + targetPort: 6009 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: 
Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + name: embedding-dependency-deploy + args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-dependency-deploy + ports: + - name: service + port: 6006 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: embedding-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + args: null + ports: + - containerPort: 6000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 
+metadata: + name: embedding-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-deploy + ports: + - name: service + port: 6000 + targetPort: 6000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 16 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + name: llm-dependency-deploy-demo + securityContext: + capabilities: + add: + - SYS_NICE + args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '2048' + - --max-total-tokens + - '4096' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-dependency-deploy + ports: + - 
name: service + port: 9009 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: llm-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + args: null + ports: + - containerPort: 9000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-deploy + ports: + - name: service + port: 9000 + targetPort: 9000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: retriever-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_EMBEDDING_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: HUGGINGFACEHUB_API_TOKEN + valueFrom: + configMapKeyRef: + name: qna-config + 
key: HUGGINGFACEHUB_API_TOKEN + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + args: null + ports: + - containerPort: 7000 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: retriever-svc + namespace: default +spec: + type: ClusterIP + selector: + app: retriever-deploy + ports: + - name: service + port: 7000 + targetPort: 7000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + labels: + app: vector-db + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: vector-db + containers: + - name: vector-db + image: redis/redis-stack:7.2.0-v9 + ports: + - containerPort: 6379 + - containerPort: 8001 +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + type: ClusterIP + selector: + app: vector-db + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + + +--- diff --git a/ChatQnA/benchmark/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml new file mode 100644 index 000000000..cc8b66391 --- /dev/null +++ b/ChatQnA/benchmark/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml @@ -0,0 +1,683 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVICE_HOST_IP: embedding-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + 
INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVICE_HOST_IP: llm-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVICE_HOST_IP: reranking-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: 
+ matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 76 + memory: 20000Mi + requests: + cpu: 76 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - 
emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + ports: + - containerPort: 6000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-svc + namespace: default +spec: + ports: + - name: service + port: 6000 + targetPort: 6000 + selector: + app: embedding-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 31 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - 
name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + ports: + - containerPort: 9000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-deploy + maxSkew: 1 + topologyKey: 
kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-svc + namespace: default +spec: + ports: + - name: service + port: 9000 + targetPort: 9000 + selector: + app: llm-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: opea/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + 
name: reranking-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: reranking-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/reranking-tei:latest + imagePullPolicy: IfNotPresent + name: reranking-deploy + ports: + - containerPort: 8000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-svc + namespace: default +spec: + ports: + - name: service + port: 8000 + targetPort: 8000 + selector: + app: reranking-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: 
apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml new file mode 100644 index 000000000..1efd9b188 --- /dev/null +++ b/ChatQnA/benchmark/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml @@ -0,0 +1,683 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVICE_HOST_IP: embedding-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVICE_HOST_IP: llm-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVICE_HOST_IP: reranking-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + 
TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: 
chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 76 + memory: 20000Mi + requests: + cpu: 76 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + 
replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + ports: + - containerPort: 6000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-svc + namespace: default +spec: + ports: + - name: service + port: 6000 + targetPort: 6000 + selector: + app: embedding-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 7 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: 
IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + ports: + - containerPort: 9000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-svc + namespace: default +spec: + ports: + - name: service + port: 9000 + targetPort: 9000 + selector: + app: llm-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + 
selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: opea/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/reranking-tei:latest + imagePullPolicy: IfNotPresent + name: 
reranking-deploy + ports: + - containerPort: 8000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-svc + namespace: default +spec: + ports: + - name: service + port: 8000 + targetPort: 8000 + selector: + app: reranking-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: 
IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml new file mode 100644 index 000000000..2beff60bc --- /dev/null +++ b/ChatQnA/benchmark/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml @@ -0,0 +1,683 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVICE_HOST_IP: embedding-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVICE_HOST_IP: llm-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVICE_HOST_IP: reranking-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + 
namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + 
type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 76 + memory: 20000Mi + requests: + cpu: 76 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + ports: + - containerPort: 6000 + resources: + requests: + 
cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-svc + namespace: default +spec: + ports: + - name: service + port: 6000 + targetPort: 6000 + selector: + app: embedding-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 15 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + 
matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + ports: + - containerPort: 9000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-svc + namespace: default +spec: + ports: + - name: service + port: 9000 + targetPort: 9000 + selector: + app: llm-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 
'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: opea/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: reranking-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/reranking-tei:latest + imagePullPolicy: IfNotPresent + name: reranking-deploy + ports: + - containerPort: 8000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + 
name: reranking-svc + namespace: default +spec: + ports: + - name: service + port: 8000 + targetPort: 8000 + selector: + app: reranking-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: 
default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml new file mode 100644 index 000000000..94ce45e05 --- /dev/null +++ b/ChatQnA/benchmark/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml @@ -0,0 +1,622 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + RERANK_MODEL_ID: BAAI/bge-reranker-base + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + INDEX_NAME: rag-redis + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + EMBEDDING_SERVICE_HOST_IP: embedding-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea + LLM_SERVICE_HOST_IP: llm-svc + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + 
whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + args: null + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 4000Mi + requests: + cpu: 8 + memory: 4000Mi + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + type: NodePort + selector: + app: chatqna-backend-server-deploy + ports: + - name: service + port: 8888 + targetPort: 8888 + nodePort: 30888 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: dataprep-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + args: null + ports: + - containerPort: 6007 + - containerPort: 6008 + - containerPort: 6009 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: dataprep-svc + namespace: default +spec: + type: ClusterIP + selector: + app: 
dataprep-deploy + ports: + - name: port1 + port: 6007 + targetPort: 6007 + - name: port2 + port: 6008 + targetPort: 6008 + - name: port3 + port: 6009 + targetPort: 6009 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + name: embedding-dependency-deploy + args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + cpu: 76 + memory: 20000Mi + requests: + cpu: 76 + memory: 20000Mi + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-dependency-deploy + ports: + - name: service + port: 6006 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: 
kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: embedding-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + args: null + ports: + - containerPort: 6000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-deploy + ports: + - name: service + port: 6000 + targetPort: 6000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 32 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + name: llm-dependency-deploy-demo + securityContext: + capabilities: + add: + - SYS_NICE + args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 
'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-dependency-deploy + ports: + - name: service + port: 9009 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: llm-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + args: null + ports: + - containerPort: 9000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-deploy + ports: + - name: service + port: 9000 + targetPort: 9000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + nodeSelector: + node-type: 
chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: retriever-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_EMBEDDING_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: HUGGINGFACEHUB_API_TOKEN + valueFrom: + configMapKeyRef: + name: qna-config + key: HUGGINGFACEHUB_API_TOKEN + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + args: null + ports: + - containerPort: 7000 + resources: + limits: + cpu: 8 + memory: 2500Mi + requests: + cpu: 8 + memory: 2500Mi + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: retriever-svc + namespace: default +spec: + type: ClusterIP + selector: + app: retriever-deploy + ports: + - name: service + port: 7000 + targetPort: 7000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + labels: + app: vector-db + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: vector-db + containers: + - name: vector-db + image: redis/redis-stack:7.2.0-v9 + ports: + - containerPort: 6379 + - containerPort: 8001 +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + type: ClusterIP + selector: + app: vector-db + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + 
targetPort: 8001 + + +--- diff --git a/ChatQnA/benchmark/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml new file mode 100644 index 000000000..364001308 --- /dev/null +++ b/ChatQnA/benchmark/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml @@ -0,0 +1,622 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + RERANK_MODEL_ID: BAAI/bge-reranker-base + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + INDEX_NAME: rag-redis + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + EMBEDDING_SERVICE_HOST_IP: embedding-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea + LLM_SERVICE_HOST_IP: llm-svc + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: 
qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + args: null + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 4000Mi + requests: + cpu: 8 + memory: 4000Mi + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + type: NodePort + selector: + app: chatqna-backend-server-deploy + ports: + - name: service + port: 8888 + targetPort: 8888 + nodePort: 30888 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: dataprep-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + args: null + ports: + - containerPort: 6007 + - containerPort: 6008 + - containerPort: 6009 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: dataprep-svc + namespace: default +spec: + type: ClusterIP + selector: + app: dataprep-deploy + ports: + - name: port1 + port: 6007 + targetPort: 6007 + - name: port2 + port: 6008 + targetPort: 6008 + - name: port3 + port: 6009 + targetPort: 6009 + + +--- + + +# 
Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + name: embedding-dependency-deploy + args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + cpu: 76 + memory: 20000Mi + requests: + cpu: 76 + memory: 20000Mi + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-dependency-deploy + ports: + - name: service + port: 6006 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: embedding-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: 
opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + args: null + ports: + - containerPort: 6000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-deploy + ports: + - name: service + port: 6000 + targetPort: 6000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + name: llm-dependency-deploy-demo + securityContext: + capabilities: + add: + - SYS_NICE + args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: 
Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-dependency-deploy + ports: + - name: service + port: 9009 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: llm-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + args: null + ports: + - containerPort: 9000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-deploy + ports: + - name: service + port: 9000 + targetPort: 9000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: retriever-deploy + hostIPC: true + containers: + 
- env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_EMBEDDING_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: HUGGINGFACEHUB_API_TOKEN + valueFrom: + configMapKeyRef: + name: qna-config + key: HUGGINGFACEHUB_API_TOKEN + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + args: null + ports: + - containerPort: 7000 + resources: + limits: + cpu: 8 + memory: 2500Mi + requests: + cpu: 8 + memory: 2500Mi + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: retriever-svc + namespace: default +spec: + type: ClusterIP + selector: + app: retriever-deploy + ports: + - name: service + port: 7000 + targetPort: 7000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + labels: + app: vector-db + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: vector-db + containers: + - name: vector-db + image: redis/redis-stack:7.2.0-v9 + ports: + - containerPort: 6379 + - containerPort: 8001 +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + type: ClusterIP + selector: + app: vector-db + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + + +--- diff --git a/ChatQnA/benchmark/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml new file 
mode 100644 index 000000000..2faff0783 --- /dev/null +++ b/ChatQnA/benchmark/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml @@ -0,0 +1,622 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + RERANK_MODEL_ID: BAAI/bge-reranker-base + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + INDEX_NAME: rag-redis + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + EMBEDDING_SERVICE_HOST_IP: embedding-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea + LLM_SERVICE_HOST_IP: llm-svc + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + args: null + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 4000Mi + requests: + cpu: 8 
+ memory: 4000Mi + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + type: NodePort + selector: + app: chatqna-backend-server-deploy + ports: + - name: service + port: 8888 + targetPort: 8888 + nodePort: 30888 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: dataprep-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + args: null + ports: + - containerPort: 6007 + - containerPort: 6008 + - containerPort: 6009 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: dataprep-svc + namespace: default +spec: + type: ClusterIP + selector: + app: dataprep-deploy + ports: + - name: port1 + port: 6007 + targetPort: 6007 + - name: port2 + port: 6008 + targetPort: 6008 + - name: port3 + port: 6009 + targetPort: 6009 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: 
embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + name: embedding-dependency-deploy + args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + cpu: 76 + memory: 20000Mi + requests: + cpu: 76 + memory: 20000Mi + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-dependency-deploy + ports: + - name: service + port: 6006 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: embedding-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + args: null + ports: + - containerPort: 6000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service 
+apiVersion: v1 +metadata: + name: embedding-svc + namespace: default +spec: + type: ClusterIP + selector: + app: embedding-deploy + ports: + - name: service + port: 6000 + targetPort: 6000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 16 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + name: llm-dependency-deploy-demo + securityContext: + capabilities: + add: + - SYS_NICE + args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HUGGING_FACE_HUB_TOKEN + value: ${HF_TOKEN} + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /mnt/models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-dependency-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-dependency-deploy 
+ ports: + - name: service + port: 9009 + targetPort: 80 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: llm-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + args: null + ports: + - containerPort: 9000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-svc + namespace: default +spec: + type: ClusterIP + selector: + app: llm-deploy + ports: + - name: service + port: 9000 + targetPort: 9000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: retriever-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: TEI_EMBEDDING_ENDPOINT + valueFrom: + configMapKeyRef: + name: qna-config + key: TEI_EMBEDDING_ENDPOINT + - name: 
HUGGINGFACEHUB_API_TOKEN + valueFrom: + configMapKeyRef: + name: qna-config + key: HUGGINGFACEHUB_API_TOKEN + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + args: null + ports: + - containerPort: 7000 + resources: + limits: + cpu: 8 + memory: 2500Mi + requests: + cpu: 8 + memory: 2500Mi + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: retriever-svc + namespace: default +spec: + type: ClusterIP + selector: + app: retriever-deploy + ports: + - name: service + port: 7000 + targetPort: 7000 + + +--- + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + labels: + app: vector-db + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: vector-db + containers: + - name: vector-db + image: redis/redis-stack:7.2.0-v9 + ports: + - containerPort: 6379 + - containerPort: 8001 +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + type: ClusterIP + selector: + app: vector-db + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + + +--- diff --git a/ChatQnA/chatqna.yaml b/ChatQnA/chatqna.yaml index 32cc433a7..78a996a7c 100644 --- a/ChatQnA/chatqna.yaml +++ b/ChatQnA/chatqna.yaml @@ -19,7 +19,7 @@ opea_micro_services: tei-embedding-service: host: ${TEI_EMBEDDING_SERVICE_IP} ports: ${TEI_EMBEDDING_SERVICE_PORT} - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest volumes: - "./data:/data" runtime: habana @@ -48,7 +48,7 
@@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 volumes: - "./data:/data" runtime: habana @@ -56,10 +56,13 @@ opea_micro_services: - SYS_NICE ipc: host environment: - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true model-id: ${LLM_MODEL_ID} llm: host: ${LLM_SERVICE_HOST_IP} diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml index 922f74dcf..6d30a6c23 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml @@ -69,7 +69,7 @@ services: INDEX_NAME: ${INDEX_NAME} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-reranking-server ports: - "6041:80" diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml index f169f88da..9852ca77e 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml @@ -25,7 +25,7 @@ services: TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-embedding-server ports: - "6006:80" @@ -75,7 +75,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped 
tei-reranking-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-reranking-server ports: - "8808:80" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml index e5aa98713..6c8c0191f 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -25,7 +25,7 @@ services: TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + image: ghcr.io/huggingface/tei-gaudi:latest container_name: tei-embedding-gaudi-server ports: - "8090:80" @@ -108,7 +108,7 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8005:80" @@ -118,11 +118,15 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: ${llm_service_devices} OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml index 5680155bd..1132cf44e 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml @@ -25,7 +25,7 @@ services: TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tgi-guardrails-service: - image: 
ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-guardrails-server ports: - "8088:80" @@ -35,11 +35,15 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE @@ -60,7 +64,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tei-embedding-service: - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + image: ghcr.io/huggingface/tei-gaudi:latest container_name: tei-embedding-gaudi-server ports: - "8090:80" @@ -141,7 +145,7 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8008:80" @@ -151,11 +155,15 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper.yaml index cad1adfdd..157c6e43e 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper.yaml @@ -25,7 +25,7 @@ services: TEI_ENDPOINT: 
${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + image: ghcr.io/huggingface/tei-gaudi:latest container_name: tei-embedding-gaudi-server ports: - "8090:80" @@ -108,7 +108,7 @@ services: # HF_HUB_ENABLE_HF_TRANSFER: 0 # restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8005:80" @@ -118,11 +118,15 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml index 5548b3355..b221ebe06 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml @@ -25,7 +25,7 @@ services: TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + image: ghcr.io/huggingface/tei-gaudi:latest container_name: tei-embedding-gaudi-server ports: - "8090:80" @@ -73,7 +73,7 @@ services: INDEX_NAME: ${INDEX_NAME} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-reranking-gaudi-server ports: - "8808:80" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml 
b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml index a0a00ea15..2552d7a41 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml @@ -25,7 +25,7 @@ services: TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + image: ghcr.io/huggingface/tei-gaudi:latest container_name: tei-embedding-gaudi-server ports: - "8090:80" @@ -73,7 +73,7 @@ services: INDEX_NAME: ${INDEX_NAME} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-reranking-gaudi-server ports: - "8808:80" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml index 2b96d29bc..dfa0ad46a 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml @@ -25,7 +25,7 @@ services: TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + image: ghcr.io/huggingface/tei-gaudi:latest container_name: tei-embedding-gaudi-server ports: - "8090:80" @@ -75,7 +75,7 @@ services: INDEX_NAME: ${INDEX_NAME} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8005:80" @@ -85,11 +85,15 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all 
OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md index 8ada1e525..6ffda97fa 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md @@ -56,16 +56,16 @@ f810f3b4d329 opea/embedding-tei:latest "python e 2fa17d84605f opea/dataprep-redis:latest "python prepare_doc_…" 2 minutes ago Up 2 minutes 0.0.0.0:6007->6007/tcp, :::6007->6007/tcp dataprep-redis-server 69e1fb59e92c opea/retriever-redis:latest "/home/user/comps/re…" 2 minutes ago Up 2 minutes 0.0.0.0:7000->7000/tcp, :::7000->7000/tcp retriever-redis-server 313b9d14928a opea/reranking-tei:latest "python reranking_te…" 2 minutes ago Up 2 minutes 0.0.0.0:8000->8000/tcp, :::8000->8000/tcp reranking-tei-gaudi-server -05c40b636239 ghcr.io/huggingface/tgi-gaudi:1.2.1 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server -174bd43fa6b5 opea/tei-gaudi:latest "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8090->80/tcp, :::8090->80/tcp tei-embedding-gaudi-server +05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server +174bd43fa6b5 ghcr.io/huggingface/tei-gaudi:latest "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8090->80/tcp, :::8090->80/tcp tei-embedding-gaudi-server 74084469aa33 redis/redis-stack:7.2.0-v9 "/entrypoint.sh" 2 minutes ago Up 2 minutes 0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp redis-vector-db 88399dbc9e43 ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8808->80/tcp, :::8808->80/tcp 
tei-reranking-gaudi-server ``` -In this case, `ghcr.io/huggingface/tgi-gaudi:1.2.1` Existed. +In this case, `ghcr.io/huggingface/tgi-gaudi:2.0.5` Existed. ``` -05c40b636239 ghcr.io/huggingface/tgi-gaudi:1.2.1 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server +05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server ``` Next we can check the container logs to get to know what happened during the docker start. @@ -76,7 +76,7 @@ Check the log of container by: `docker logs -t` -View the logs of `ghcr.io/huggingface/tgi-gaudi:1.2.1` +View the logs of `ghcr.io/huggingface/tgi-gaudi:2.0.5` `docker logs 05c40b636239 -t` @@ -105,7 +105,7 @@ So just make sure the devices are available. Here is another failure example: ``` -f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:1.2.1 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server +f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server ``` Check the log by `docker logs f7a08f9867f9 -t`. 
@@ -122,7 +122,7 @@ View the docker input parameters in `./ChatQnA/docker_compose/intel/hpu/gaudi/co ``` tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:1.2.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8008:80" @@ -131,9 +131,13 @@ View the docker input parameters in `./ChatQnA/docker_compose/intel/hpu/gaudi/co environment: http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/ChatQnA/docker_image_build/build.yaml b/ChatQnA/docker_image_build/build.yaml index 906f6fcf7..573a2de3a 100644 --- a/ChatQnA/docker_image_build/build.yaml +++ b/ChatQnA/docker_image_build/build.yaml @@ -125,12 +125,6 @@ services: dockerfile: comps/guardrails/llama_guard/langchain/Dockerfile extends: chatqna image: ${REGISTRY:-opea}/guardrails-tgi:${TAG:-latest} - tei-gaudi: - build: - context: tei-gaudi - dockerfile: Dockerfile-hpu - extends: chatqna - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} vllm: build: context: vllm diff --git a/ChatQnA/kubernetes/intel/README_gmc.md b/ChatQnA/kubernetes/intel/README_gmc.md index c56354cfc..99799391b 100644 --- a/ChatQnA/kubernetes/intel/README_gmc.md +++ b/ChatQnA/kubernetes/intel/README_gmc.md @@ -27,8 +27,8 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services. 
For Gaudi: -- tei-embedding-service: opea/tei-gaudi:latest -- tgi-service: ghcr.io/huggingface/tgi-gaudi:1.2.1 +- tei-embedding-service: ghcr.io/huggingface/tei-gaudi:latest +- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.5 > [NOTE] > Please refer to [Xeon README](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/docker_compose/intel/cpu/xeon/README.md) or [Gaudi README](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/docker_compose/intel/hpu/gaudi/README.md) to build the OPEA images. These too will be available on Docker Hub soon to simplify use. diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml index 075225b26..7d7f3615f 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml @@ -1477,7 +1477,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.1" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data @@ -1558,7 +1558,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.1" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml index cfccad27a..e92252581 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml @@ -1298,7 +1298,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.1" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh 
b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh index bcbea8c0d..8f6dd963c 100644 --- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh @@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/huggingface/tei-gaudi echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna-guardrails chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi tei-gaudi guardrails-tgi" + service_list="chatqna-guardrails chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi guardrails-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + docker pull ghcr.io/huggingface/tei-gaudi:latest docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_no_wrapper_on_gaudi.sh b/ChatQnA/tests/test_compose_no_wrapper_on_gaudi.sh index f4250d435..c760d9eb7 100644 --- a/ChatQnA/tests/test_compose_no_wrapper_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_no_wrapper_on_gaudi.sh @@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/huggingface/tei-gaudi echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna-no-wrapper chatqna-ui dataprep-redis retriever-redis tei-gaudi" + service_list="chatqna-no-wrapper chatqna-ui dataprep-redis retriever-redis" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + docker pull ghcr.io/huggingface/tei-gaudi:latest docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_no_wrapper_on_xeon.sh b/ChatQnA/tests/test_compose_no_wrapper_on_xeon.sh index dc2ac315a..70ad7dc08 100644 --- a/ChatQnA/tests/test_compose_no_wrapper_on_xeon.sh +++ b/ChatQnA/tests/test_compose_no_wrapper_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="chatqna-no-wrapper chatqna-ui chatqna-conversation-ui dataprep-redis retriever-redis" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh index e98a76311..37b822285 100644 --- a/ChatQnA/tests/test_compose_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_on_gaudi.sh @@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/huggingface/tei-gaudi echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi tei-gaudi nginx" + service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + docker pull ghcr.io/huggingface/tei-gaudi:latest docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh index b7275cf8e..e110984d6 100644 --- a/ChatQnA/tests/test_compose_on_xeon.sh +++ b/ChatQnA/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui chatqna-conversation-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh index 30e803e7e..ab45e195c 100644 --- a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh @@ -17,13 +17,13 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/huggingface/tei-gaudi echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei tei-gaudi llm-vllm-hpu llm-vllm" + service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm-hpu llm-vllm" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + docker pull ghcr.io/huggingface/tei-gaudi:latest docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_vllm_on_xeon.sh b/ChatQnA/tests/test_compose_vllm_on_xeon.sh index a069df297..e5274a199 100644 --- a/ChatQnA/tests/test_compose_vllm_on_xeon.sh +++ b/ChatQnA/tests/test_compose_vllm_on_xeon.sh @@ -23,7 +23,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm vllm" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh index 607ed93f6..4e2b12e61 100644 --- a/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh @@ -17,13 +17,13 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/huggingface/tei-gaudi echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei tei-gaudi llm-vllm-ray-hpu llm-vllm-ray" + service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm-ray-hpu llm-vllm-ray" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + docker pull ghcr.io/huggingface/tei-gaudi:latest docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh index d8b0bf681..3de5c435d 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh @@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/huggingface/tei-gaudi echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna-without-rerank chatqna-ui dataprep-redis embedding-tei retriever-redis llm-tgi tei-gaudi" + service_list="chatqna-without-rerank chatqna-ui dataprep-redis embedding-tei retriever-redis llm-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + docker pull ghcr.io/huggingface/tei-gaudi:latest docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh index 9176a8339..3ab079765 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="chatqna-without-rerank chatqna-ui chatqna-conversation-ui dataprep-redis embedding-tei retriever-redis llm-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/CodeGen/codegen.yaml b/CodeGen/codegen.yaml index 7e4f423c3..95f2d78e6 100644 --- a/CodeGen/codegen.yaml +++ b/CodeGen/codegen.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 volumes: - "./data:/data" runtime: habana @@ -17,7 +17,11 @@ opea_micro_services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + 
USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true model-id: ${LLM_MODEL_ID} llm: host: ${LLM_SERVICE_HOST_IP} diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml index c1bae7b90..153b9f59a 100644 --- a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8028:80" @@ -15,7 +15,11 @@ services: https_proxy: ${https_proxy} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/CodeGen/kubernetes/intel/hpu/gaudi/manifest/codegen.yaml b/CodeGen/kubernetes/intel/hpu/gaudi/manifest/codegen.yaml index 359ce24e6..c4a43a7c3 100644 --- a/CodeGen/kubernetes/intel/hpu/gaudi/manifest/codegen.yaml +++ b/CodeGen/kubernetes/intel/hpu/gaudi/manifest/codegen.yaml @@ -405,7 +405,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.1" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeGen/tests/test_compose_on_gaudi.sh b/CodeGen/tests/test_compose_on_gaudi.sh index 46c937d0b..ec1658314 100644 --- a/CodeGen/tests/test_compose_on_gaudi.sh +++ b/CodeGen/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="codegen codegen-ui llm-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker images && sleep 
1s } diff --git a/CodeTrans/codetrans.yaml b/CodeTrans/codetrans.yaml index a16c5a484..9d7f70b4e 100644 --- a/CodeTrans/codetrans.yaml +++ b/CodeTrans/codetrans.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 volumes: - "./data:/data" runtime: habana @@ -17,7 +17,11 @@ opea_micro_services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true model-id: ${LLM_MODEL_ID} llm: host: ${LLM_SERVICE_HOST_IP} diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml index f6285d2ef..09b82ed3f 100644 --- a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: codetrans-tgi-service ports: - "8008:80" @@ -15,7 +15,11 @@ services: https_proxy: ${https_proxy} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/CodeTrans/kubernetes/intel/hpu/gaudi/manifest/codetrans.yaml b/CodeTrans/kubernetes/intel/hpu/gaudi/manifest/codetrans.yaml index 9f07b22a7..541f31179 100644 --- a/CodeTrans/kubernetes/intel/hpu/gaudi/manifest/codetrans.yaml +++ b/CodeTrans/kubernetes/intel/hpu/gaudi/manifest/codetrans.yaml @@ -405,7 +405,7 @@ spec: 
runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.1" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeTrans/tests/test_compose_on_gaudi.sh b/CodeTrans/tests/test_compose_on_gaudi.sh index 9416f09c7..b246f4dc9 100644 --- a/CodeTrans/tests/test_compose_on_gaudi.sh +++ b/CodeTrans/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="codetrans codetrans-ui llm-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker images && sleep 1s } diff --git a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml index 76823db7b..831659dca 100644 --- a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml @@ -28,7 +28,7 @@ services: TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + image: ghcr.io/huggingface/tei-gaudi:latest container_name: tei-embedding-gaudi-server ports: - "8090:80" diff --git a/DocIndexRetriever/docker_image_build/build.yaml b/DocIndexRetriever/docker_image_build/build.yaml index b0775069b..3ed44fa24 100644 --- a/DocIndexRetriever/docker_image_build/build.yaml +++ b/DocIndexRetriever/docker_image_build/build.yaml @@ -35,9 +35,3 @@ services: dockerfile: comps/dataprep/redis/langchain/Dockerfile extends: doc-index-retriever image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} - tei-gaudi: - build: - context: tei-gaudi - dockerfile: Dockerfile-hpu - extends: doc-index-retriever - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} diff --git 
a/DocIndexRetriever/tests/test_compose_on_gaudi.sh b/DocIndexRetriever/tests/test_compose_on_gaudi.sh index 01cca6a42..04f32a7b5 100644 --- a/DocIndexRetriever/tests/test_compose_on_gaudi.sh +++ b/DocIndexRetriever/tests/test_compose_on_gaudi.sh @@ -19,14 +19,12 @@ function build_docker_images() { if [ ! -d "GenAIComps" ] ; then git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ fi - if [ ! -d "tei-gaudi" ] ; then - git clone https://github.com/huggingface/tei-gaudi - fi echo "Build all the images with --no-cache, check docker_image_build.log for details..." docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log docker pull redis/redis-stack:7.2.0-v9 + docker pull ghcr.io/huggingface/tei-gaudi:latest docker images && sleep 1s } diff --git a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml index 78cb90ae1..e9f3a96f8 100644 --- a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8008:80" @@ -11,8 +11,11 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true volumes: - "./data:/data" runtime: habana diff --git a/DocSum/docsum.yaml b/DocSum/docsum.yaml index d4c8cb1c4..bc87bc5b4 100644 --- a/DocSum/docsum.yaml +++ b/DocSum/docsum.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: 
ghcr.io/huggingface/tgi-gaudi:2.0.5 volumes: - "./data:/data" runtime: habana @@ -17,7 +17,11 @@ opea_micro_services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true model-id: ${LLM_MODEL_ID} llm: host: ${LLM_SERVICE_HOST_IP} diff --git a/DocSum/kubernetes/intel/README_gmc.md b/DocSum/kubernetes/intel/README_gmc.md index 99d096279..b050d7249 100644 --- a/DocSum/kubernetes/intel/README_gmc.md +++ b/DocSum/kubernetes/intel/README_gmc.md @@ -9,7 +9,7 @@ The DocSum application is defined as a Custom Resource (CR) file that the above The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image llm-docsum-tgi:latest which internally leverages the the image ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu. The service is called tgi-svc. Meanwhile, the Gaudi version launches the -service tgi-gaudi-svc, which uses the image ghcr.io/huggingface/tgi-gaudi:1.2.1. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use Intel/neural-chat-7b-v3-3. +service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.0.5`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use Intel/neural-chat-7b-v3-3. 
[NOTE] Refer to [Docker Xeon README](https://github.com/opea-project/GenAIExamples/blob/main/DocSum/docker_compose/intel/cpu/xeon/README.md) or diff --git a/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml b/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml index 15bf181e1..5c10f3c76 100644 --- a/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml +++ b/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml @@ -405,7 +405,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.1" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/DocSum/tests/test_compose_on_gaudi.sh b/DocSum/tests/test_compose_on_gaudi.sh index abfd158f2..f9cbd7a82 100644 --- a/DocSum/tests/test_compose_on_gaudi.sh +++ b/DocSum/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="docsum docsum-ui llm-docsum-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker images && sleep 1s } diff --git a/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml b/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml index 238e271f6..1ee36bd30 100644 --- a/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8008:80" @@ -18,6 +18,10 @@ services: HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} PREFILL_BATCH_BUCKET_SIZE: 1 BATCH_BUCKET_SIZE: 8 + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/FaqGen/faqgen.yaml b/FaqGen/faqgen.yaml index 
8832c0c93..8d354871e 100644 --- a/FaqGen/faqgen.yaml +++ b/FaqGen/faqgen.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:1.2.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 volumes: - "./data:/data" runtime: habana @@ -14,10 +14,13 @@ opea_micro_services: - SYS_NICE ipc: host environment: - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true model-id: ${LLM_MODEL_ID} llm: host: ${LLM_SERVICE_HOST_IP} diff --git a/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml b/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml index 76a68080c..2703cbc4e 100644 --- a/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml +++ b/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml @@ -39,7 +39,15 @@ spec: value: "8" - name: PORT value: "80" - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 imagePullPolicy: IfNotPresent securityContext: capabilities: @@ -72,7 +80,7 @@ spec: volumes: - name: model-volume hostPath: - path: /home/sdp/cesg + path: /mnt/models type: Directory - name: shm emptyDir: diff --git a/FaqGen/tests/test_compose_on_gaudi.sh b/FaqGen/tests/test_compose_on_gaudi.sh index 5fd9ae34a..a58339780 100644 --- a/FaqGen/tests/test_compose_on_gaudi.sh +++ b/FaqGen/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="faqgen faqgen-ui llm-faqgen-tgi" docker compose -f build.yaml build ${service_list} --no-cache > 
${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:1.2.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker images && sleep 1s } diff --git a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md index 6021c7938..7870aa629 100644 --- a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md @@ -32,18 +32,7 @@ docker build --no-cache -t opea/reranking-tei:latest --build-arg https_proxy=$ht docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile . ``` -### 5. Build TEI Gaudi Image - -Since a TEI Gaudi Docker image hasn't been published, we'll need to build it from the [tei-guadi](https://github.com/huggingface/tei-gaudi) repository. - -```bash -git clone https://github.com/huggingface/tei-gaudi -cd tei-gaudi/ -docker build --no-cache -f Dockerfile-hpu -t opea/tei-gaudi:latest . -cd ../.. -``` - -### 6. Build MegaService Docker Image +### 5. Build MegaService Docker Image To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `searchqna.py` Python script. Build the MegaService Docker image using the command below: @@ -62,12 +51,11 @@ docker build --no-cache -t opea/searchqna:latest --build-arg https_proxy=$https_ Then run the command `docker images`, you will have -1. `opea/tei-gaudi:latest` -2. `opea/embedding-tei:latest` -3. `opea/web-retriever-chroma:latest` -4. `opea/reranking-tei:latest` -5. `opea/llm-tgi:latest` -6. `opea/searchqna:latest` +1. `opea/embedding-tei:latest` +2. `opea/web-retriever-chroma:latest` +3. `opea/reranking-tei:latest` +4. `opea/llm-tgi:latest` +5. 
`opea/searchqna:latest` ## 🚀 Set the environment variables diff --git a/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml index ee3b32b6f..5ade94cc1 100644 --- a/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: tei-embedding-service: - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + image: ghcr.io/huggingface/tei-gaudi:latest container_name: tei-embedding-gaudi-server ports: - "3001:80" @@ -80,7 +80,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "3006:80" @@ -90,11 +90,15 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/SearchQnA/docker_image_build/build.yaml b/SearchQnA/docker_image_build/build.yaml index e94e9cf4b..7d8629e8f 100644 --- a/SearchQnA/docker_image_build/build.yaml +++ b/SearchQnA/docker_image_build/build.yaml @@ -41,9 +41,3 @@ services: dockerfile: comps/llms/text-generation/tgi/Dockerfile extends: searchqna image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - tei-gaudi: - build: - context: tei-gaudi - dockerfile: Dockerfile-hpu - extends: searchqna - image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} diff --git a/SearchQnA/tests/test_compose_on_gaudi.sh b/SearchQnA/tests/test_compose_on_gaudi.sh index 03091fae1..cefadaa88 100644 --- a/SearchQnA/tests/test_compose_on_gaudi.sh +++ 
b/SearchQnA/tests/test_compose_on_gaudi.sh @@ -17,14 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/huggingface/tei-gaudi echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="searchqna searchqna-ui embedding-tei web-retriever-chroma reranking-tei llm-tgi tei-gaudi" + service_list="searchqna searchqna-ui embedding-tei web-retriever-chroma reranking-tei llm-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tei-gaudi:latest docker images && sleep 1s } diff --git a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml index 3d8b0ab47..c470c441a 100644 --- a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,18 +3,23 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server ports: - "8008:80" environment: + no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE @@ -31,6 +36,7 @@ services: - "9000:9000" ipc: host environment: + 
no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} @@ -47,6 +53,7 @@ services: ports: - "8888:8888" environment: + - no_proxy=${no_proxy} - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} @@ -61,6 +68,7 @@ services: ports: - "5173:5173" environment: + - no_proxy=${no_proxy} - https_proxy=${https_proxy} - http_proxy=${http_proxy} - BASE_URL=${BACKEND_SERVICE_ENDPOINT} diff --git a/Translation/tests/test_compose_on_gaudi.sh b/Translation/tests/test_compose_on_gaudi.sh index 558ec9e28..9515c95af 100644 --- a/Translation/tests/test_compose_on_gaudi.sh +++ b/Translation/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="translation translation-ui llm-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker images && sleep 1s } @@ -166,7 +166,7 @@ function main() { validate_microservices validate_megaservice - validate_frontend + #validate_frontend stop_docker echo y | docker system prune diff --git a/Translation/translation.yaml b/Translation/translation.yaml index 23aac4675..882eca8e2 100644 --- a/Translation/translation.yaml +++ b/Translation/translation.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 volumes: - "./data:/data" runtime: habana @@ -14,10 +14,17 @@ opea_micro_services: - SYS_NICE ipc: host environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: 
${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true model-id: ${LLM_MODEL_ID} llm: host: ${LLM_SERVICE_HOST_IP} @@ -25,6 +32,9 @@ opea_micro_services: image: opea/llm-tgi:latest endpoint: /v1/chat/completions environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} ui: diff --git a/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml index 687e2ab03..45732e832 100644 --- a/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/VisualQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: llava-tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.4 + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-llava-gaudi-server ports: - "8399:80" @@ -17,6 +17,11 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE diff --git a/VisualQnA/tests/test_compose_on_gaudi.sh b/VisualQnA/tests/test_compose_on_gaudi.sh index f28f245ab..a489a2c7a 100644 --- a/VisualQnA/tests/test_compose_on_gaudi.sh +++ b/VisualQnA/tests/test_compose_on_gaudi.sh @@ -21,7 +21,7 @@ function build_docker_images() { echo "Build all the images with --no-cache, check docker_image_build.log for details..." docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.4 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 docker images && sleep 1s }