From 46ebb78aa3c76d10b9461f011f69fd80cd99833f Mon Sep 17 00:00:00 2001 From: dolpher Date: Tue, 8 Apr 2025 22:39:40 +0800 Subject: [PATCH] Sync values yaml file for 1.3 release (#1748) Signed-off-by: Dolpher Du --- AgentQnA/kubernetes/helm/cpu-values.yaml | 22 +++++++ AgentQnA/kubernetes/helm/gaudi-values.yaml | 19 ++++++ .../kubernetes/helm/cpu-multilang-values.yaml | 15 +++++ AudioQnA/kubernetes/helm/cpu-tgi-values.yaml | 12 ++++ AudioQnA/kubernetes/helm/cpu-values.yaml | 9 ++- .../kubernetes/helm/gaudi-tgi-values.yaml | 49 +++++++++++++++ AudioQnA/kubernetes/helm/gaudi-values.yaml | 43 ++++++------- .../kubernetes/helm/cpu-qdrant-values.yaml | 12 ++++ ChatQnA/kubernetes/helm/cpu-tgi-values.yaml | 19 +----- .../helm/faqgen-cpu-tgi-values.yaml | 16 +++++ .../kubernetes/helm/faqgen-cpu-values.yaml | 12 ++++ .../helm/faqgen-gaudi-tgi-values.yaml | 60 +++++++++++++++++++ .../kubernetes/helm/faqgen-gaudi-values.yaml | 53 ++++++++++++++++ ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml | 10 ---- ...udi-vllm-values.yaml => gaudi-values.yaml} | 15 +---- .../helm/guardrails-gaudi-values.yaml | 32 +++------- ChatQnA/kubernetes/helm/norerank-values.yaml | 8 +-- CodeGen/kubernetes/helm/cpu-tgi-values.yaml | 9 +++ CodeGen/kubernetes/helm/cpu-values.yaml | 6 +- CodeGen/kubernetes/helm/gaudi-tgi-values.yaml | 33 ++++++++++ CodeGen/kubernetes/helm/gaudi-values.yaml | 42 ++++++------- CodeTrans/kubernetes/helm/cpu-tgi-values.yaml | 9 +++ CodeTrans/kubernetes/helm/cpu-values.yaml | 6 +- .../kubernetes/helm/gaudi-tgi-values.yaml | 33 ++++++++++ CodeTrans/kubernetes/helm/gaudi-values.yaml | 47 ++++++++------- DocSum/kubernetes/helm/cpu-tgi-values.yaml | 9 +++ DocSum/kubernetes/helm/cpu-values.yaml | 8 ++- DocSum/kubernetes/helm/gaudi-tgi-values.yaml | 32 ++++++++++ DocSum/kubernetes/helm/gaudi-values.yaml | 44 +++++++------- SearchQnA/kubernetes/helm/gaudi-values.yaml | 6 -- VisualQnA/kubernetes/helm/cpu-tgi-values.yaml | 9 +++ VisualQnA/kubernetes/helm/cpu-values.yaml | 10 ++-- .../kubernetes/helm/gaudi-tgi-values.yaml | 37 ++++++++++++ VisualQnA/kubernetes/helm/gaudi-values.yaml | 46 ++++++-------- 34 files changed, 580 insertions(+), 212 deletions(-) create mode 100644 AgentQnA/kubernetes/helm/cpu-values.yaml create mode 100644 AudioQnA/kubernetes/helm/cpu-multilang-values.yaml create mode 100644 AudioQnA/kubernetes/helm/cpu-tgi-values.yaml create mode 100644 AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml create mode 100644 ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml create mode 100644 ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml create mode 100644 ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml create mode 100644 ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml create mode 100644 ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml rename ChatQnA/kubernetes/helm/{gaudi-vllm-values.yaml => gaudi-values.yaml} (78%) create mode 100644 CodeGen/kubernetes/helm/cpu-tgi-values.yaml create mode 100644 CodeGen/kubernetes/helm/gaudi-tgi-values.yaml create mode 100644 CodeTrans/kubernetes/helm/cpu-tgi-values.yaml create mode 100644 CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml create mode 100644 DocSum/kubernetes/helm/cpu-tgi-values.yaml create mode 100644 DocSum/kubernetes/helm/gaudi-tgi-values.yaml create mode 100644 VisualQnA/kubernetes/helm/cpu-tgi-values.yaml create mode 100644 VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml diff --git a/AgentQnA/kubernetes/helm/cpu-values.yaml b/AgentQnA/kubernetes/helm/cpu-values.yaml new file mode 100644 index 000000000..1def3bca9 --- /dev/null +++ b/AgentQnA/kubernetes/helm/cpu-values.yaml @@ -0,0 +1,22 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: false +vllm: + enabled: true + LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" + extraCmdArgs: ["--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"] + +supervisor: + llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Meta-Llama-3-8B-Instruct" +ragagent: + llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Meta-Llama-3-8B-Instruct" +sqlagent: + llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/AgentQnA/kubernetes/helm/gaudi-values.yaml b/AgentQnA/kubernetes/helm/gaudi-values.yaml index 2d171ea22..fc040abc0 100644 --- a/AgentQnA/kubernetes/helm/gaudi-values.yaml +++ b/AgentQnA/kubernetes/helm/gaudi-values.yaml @@ -4,13 +4,32 @@ # Accelerate inferencing in heaviest components to improve performance # by overriding their subchart values +tgi: + enabled: false vllm: enabled: true + accelDevice: "gaudi" image: repository: opea/vllm-gaudi + resources: + limits: + habana.ai/gaudi: 4 + LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct" + OMPI_MCA_btl_vader_single_copy_mechanism: none + PT_HPU_ENABLE_LAZY_COLLECTIVES: true + VLLM_SKIP_WARMUP: true + shmSize: 16Gi + extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"] + supervisor: llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Llama-3.3-70B-Instruct" ragagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Llama-3.3-70B-Instruct" sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Llama-3.3-70B-Instruct" diff --git a/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml b/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml new file mode 100644 index 000000000..84ff9161b --- /dev/null +++ b/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml @@ -0,0 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: false +vllm: + enabled: true + +speecht5: + enabled: false +gpt-sovits: + enabled: true + +image: + repository: opea/audioqna-multilang diff --git a/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml b/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml new file mode 100644 index 000000000..ada4ee984 --- /dev/null +++ b/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml @@ -0,0 +1,12 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: true +vllm: + enabled: false + +speecht5: + enabled: true +gpt-sovits: + enabled: false diff --git a/AudioQnA/kubernetes/helm/cpu-values.yaml b/AudioQnA/kubernetes/helm/cpu-values.yaml index 97818ae44..5a3c42335 100644 --- a/AudioQnA/kubernetes/helm/cpu-values.yaml +++ b/AudioQnA/kubernetes/helm/cpu-values.yaml @@ -2,4 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 tgi: - LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + enabled: false +vllm: + enabled: true + +speecht5: + enabled: true +gpt-sovits: + enabled: false diff --git a/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml b/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml new file mode 100644 index 000000000..03e209306 --- /dev/null +++ b/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml @@ -0,0 +1,49 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: true + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.3.1" + resources: + limits: + habana.ai/gaudi: 1 + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" + CUDA_GRAPHS: "" + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 +vllm: + enabled: false + +whisper: + image: + repository: opea/whisper-gaudi + resources: + limits: + habana.ai/gaudi: 1 + +speecht5: + enabled: true + image: + repository: opea/speecht5-gaudi + resources: + limits: + habana.ai/gaudi: 1 +gpt-sovits: + enabled: false diff --git a/AudioQnA/kubernetes/helm/gaudi-values.yaml b/AudioQnA/kubernetes/helm/gaudi-values.yaml index 9b06ff429..90fb43293 100644 --- a/AudioQnA/kubernetes/helm/gaudi-values.yaml +++ b/AudioQnA/kubernetes/helm/gaudi-values.yaml @@ -2,35 +2,27 @@ # SPDX-License-Identifier: Apache-2.0 tgi: + enabled: false +vllm: + enabled: true accelDevice: "gaudi" image: - repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.3.1" + repository: opea/vllm-gaudi + startupProbe: + failureThreshold: 360 + + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "1024" - MAX_TOTAL_TOKENS: "2048" - CUDA_GRAPHS: "" - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 120 + extraCmdArgs: [ + "--tensor-parallel-size", "1", + "--block-size", "128", + "--max-num-seqs", "256", + "--max-seq_len-to-capture", "2048" + ] whisper: image: @@ -40,8 +32,11 @@ whisper: habana.ai/gaudi: 1 speecht5: + enabled: true image: repository: opea/speecht5-gaudi resources: limits: habana.ai/gaudi: 1 +gpt-sovits: + enabled: false diff --git a/ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml b/ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml new file mode 100644 index 000000000..ac2865b17 --- /dev/null +++ b/ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml @@ -0,0 +1,12 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +redis-vector-db: + enabled: false +qdrant: + enabled: true + +data-prep: + DATAPREP_BACKEND: "QDRANT" +retriever-usvc: + RETRIEVER_BACKEND: "QDRANT" diff --git a/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml index f552e1d5b..b80c44f04 100644 --- a/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml +++ b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml @@ -44,11 +44,6 @@ tgi: # cpu: 6 # memory: 65Gi - livenessProbe: - initialDelaySeconds: 8 - periodSeconds: 8 - failureThreshold: 24 - timeoutSeconds: 4 readinessProbe: initialDelaySeconds: 16 periodSeconds: 8 @@ -65,17 +60,12 @@ teirerank: # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model: resources: limits: - cpu: 4 + cpu: 24 memory: 30Gi requests: cpu: 2 memory: 25Gi - livenessProbe: - initialDelaySeconds: 8 - periodSeconds: 8 - failureThreshold: 24 - timeoutSeconds: 4 readinessProbe: initialDelaySeconds: 8 periodSeconds: 8 @@ -91,17 +81,12 @@ tei: # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model: resources: limits: - cpu: 4 + cpu: 24 memory: 4Gi requests: cpu: 2 memory: 3Gi - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 24 - timeoutSeconds: 2 readinessProbe: initialDelaySeconds: 5 periodSeconds: 5 diff --git a/ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml new file mode 100644 index 000000000..6ce475595 --- /dev/null +++ b/ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml @@ -0,0 +1,16 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CHATQNA_TYPE: "CHATQNA_FAQGEN" +llm-uservice: + enabled: true + image: + repository: opea/llm-faqgen + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + FAQGEN_BACKEND: "TGI" + service: + port: 80 +vllm: + enabled: false +tgi: + enabled: true diff --git a/ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml b/ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml new file mode 100644 index 000000000..2f05eeed3 --- /dev/null +++ b/ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml @@ -0,0 +1,12 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CHATQNA_TYPE: "CHATQNA_FAQGEN" +llm-uservice: + enabled: true + image: + repository: opea/llm-faqgen + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + FAQGEN_BACKEND: "vLLM" + service: + port: 80 diff --git a/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml new file mode 100644 index 000000000..88fca4ed5 --- /dev/null +++ b/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml @@ -0,0 +1,60 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CHATQNA_TYPE: "CHATQNA_FAQGEN" +llm-uservice: + enabled: true + image: + repository: opea/llm-faqgen + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + FAQGEN_BACKEND: "TGI" + service: + port: 80 +vllm: + enabled: false +# TGI: largest bottleneck for ChatQnA +tgi: + enabled: true + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.3.1" + resources: + limits: + habana.ai/gaudi: 1 + # higher limits are needed with extra input tokens added by rerank + MAX_INPUT_LENGTH: "2048" + MAX_TOTAL_TOKENS: "4096" + CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" + + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 + +# Reranking: second largest bottleneck when reranking is in use +# (i.e. query context docs have been uploaded with data-prep) +teirerank: + accelDevice: "gaudi" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + MAX_WARMUP_SEQUENCE_LENGTH: "512" + image: + repository: ghcr.io/huggingface/tei-gaudi + tag: 1.5.0 + resources: + limits: + habana.ai/gaudi: 1 +# securityContext: +# readOnlyRootFilesystem: false + readinessProbe: + timeoutSeconds: 1 diff --git a/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml new file mode 100644 index 000000000..de52226f4 --- /dev/null +++ b/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CHATQNA_TYPE: "CHATQNA_FAQGEN" +llm-uservice: + enabled: true + image: + repository: opea/llm-faqgen + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + FAQGEN_BACKEND: "vLLM" + service: + port: 80 +tgi: + enabled: false +vllm: + enabled: true + shmSize: 1Gi + accelDevice: "gaudi" + image: + repository: opea/vllm-gaudi + resources: + limits: + habana.ai/gaudi: 1 + + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + VLLM_SKIP_WARMUP: true + + extraCmdArgs: [ + "--tensor-parallel-size", "1", + "--block-size", "128", + "--max-num-seqs", "256", + "--max-seq_len-to-capture", "2048" + ] + +# Reranking: second largest bottleneck when reranking is in use +# (i.e. query context docs have been uploaded with data-prep) +# +# TODO: could vLLM be used also for reranking / embedding? +teirerank: + accelDevice: "gaudi" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + MAX_WARMUP_SEQUENCE_LENGTH: "512" + image: + repository: ghcr.io/huggingface/tei-gaudi + tag: 1.5.0 + resources: + limits: + habana.ai/gaudi: 1 + # securityContext: + # readOnlyRootFilesystem: false + readinessProbe: + timeoutSeconds: 1 diff --git a/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml index d4da00c97..027fecb3f 100644 --- a/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml +++ b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml @@ -26,10 +26,6 @@ tgi: USE_FLASH_ATTENTION: "true" FLASH_ATTENTION_RECOMPUTE: "true" - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 readinessProbe: initialDelaySeconds: 5 periodSeconds: 5 @@ -54,8 +50,6 @@ teirerank: habana.ai/gaudi: 1 securityContext: readOnlyRootFilesystem: false - livenessProbe: - timeoutSeconds: 1 readinessProbe: timeoutSeconds: 1 @@ -73,7 +67,3 @@ teirerank: # habana.ai/gaudi: 1 # securityContext: # readOnlyRootFilesystem: false -# livenessProbe: -# timeoutSeconds: 1 -# readinessProbe: -# timeoutSeconds: 1 diff --git a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml b/ChatQnA/kubernetes/helm/gaudi-values.yaml similarity index 78% rename from ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml rename to ChatQnA/kubernetes/helm/gaudi-values.yaml index 76eafae02..2bf9f3dab 100644 --- a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml +++ b/ChatQnA/kubernetes/helm/gaudi-values.yaml @@ -16,18 +16,7 @@ vllm: limits: habana.ai/gaudi: 1 startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 180 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 + failureThreshold: 360 PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" OMPI_MCA_btl_vader_single_copy_mechanism: "none" @@ -55,7 +44,5 @@ teirerank: habana.ai/gaudi: 1 securityContext: readOnlyRootFilesystem: false - livenessProbe: - timeoutSeconds: 1 readinessProbe: timeoutSeconds: 1 diff --git a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml index 8e8a491a0..dc025b386 100644 --- a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml +++ b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml @@ -1,9 +1,12 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -image: - repository: opea/chatqna-guardrails +# Accelerate inferencing in heaviest components to improve performance +# by overriding their subchart values +image: + repository: opea/chatqna +CHATQNA_TYPE: "CHATQNA_GUARDRAILS" # guardrails related config guardrails-usvc: enabled: true @@ -22,10 +25,6 @@ guardrails-usvc: # habana.ai/gaudi: 1 # securityContext: # readOnlyRootFilesystem: false -# livenessProbe: -# timeoutSeconds: 1 -# readinessProbe: -# timeoutSeconds: 1 teirerank: accelDevice: "gaudi" @@ -39,8 +38,6 @@ teirerank: habana.ai/gaudi: 1 securityContext: readOnlyRootFilesystem: false - livenessProbe: - timeoutSeconds: 1 readinessProbe: timeoutSeconds: 1 @@ -62,10 +59,6 @@ tgi-guardrails: LIMIT_HPU_GRAPH: "true" USE_FLASH_ATTENTION: "true" FLASH_ATTENTION_RECOMPUTE: "true" - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 readinessProbe: initialDelaySeconds: 5 periodSeconds: 5 @@ -88,18 +81,7 @@ vllm: limits: habana.ai/gaudi: 1 startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 180 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 + failureThreshold: 360 PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" OMPI_MCA_btl_vader_single_copy_mechanism: "none" diff --git a/ChatQnA/kubernetes/helm/norerank-values.yaml b/ChatQnA/kubernetes/helm/norerank-values.yaml index aa069c1c7..f3bcc8b47 100644 --- a/ChatQnA/kubernetes/helm/norerank-values.yaml +++ b/ChatQnA/kubernetes/helm/norerank-values.yaml @@ -1,11 +1,7 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Accelerate inferencing in heaviest components to improve performance -# by overriding their subchart values - -image: - repository: opea/chatqna-without-rerank +CHATQNA_TYPE: "CHATQNA_NO_RERANK" teirerank: enabled: false diff --git a/CodeGen/kubernetes/helm/cpu-tgi-values.yaml b/CodeGen/kubernetes/helm/cpu-tgi-values.yaml new file mode 100644 index 000000000..accd8674b --- /dev/null +++ b/CodeGen/kubernetes/helm/cpu-tgi-values.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: true +vllm: + enabled: false +llm-uservice: + TEXTGEN_BACKEND: TGI diff --git a/CodeGen/kubernetes/helm/cpu-values.yaml b/CodeGen/kubernetes/helm/cpu-values.yaml index b49541359..ccc843b1c 100644 --- a/CodeGen/kubernetes/helm/cpu-values.yaml +++ b/CodeGen/kubernetes/helm/cpu-values.yaml @@ -2,4 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 tgi: - LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct + enabled: false +vllm: + enabled: true +llm-uservice: + TEXTGEN_BACKEND: vLLM diff --git a/CodeGen/kubernetes/helm/gaudi-tgi-values.yaml b/CodeGen/kubernetes/helm/gaudi-tgi-values.yaml new file mode 100644 index 000000000..7aaae3f27 --- /dev/null +++ b/CodeGen/kubernetes/helm/gaudi-tgi-values.yaml @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: true + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.3.1" + resources: + limits: + habana.ai/gaudi: 1 + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" + CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 +vllm: + enabled: false +llm-uservice: + TEXTGEN_BACKEND: TGI diff --git a/CodeGen/kubernetes/helm/gaudi-values.yaml b/CodeGen/kubernetes/helm/gaudi-values.yaml index 25ac2c395..95fcce29c 100644 --- a/CodeGen/kubernetes/helm/gaudi-values.yaml +++ b/CodeGen/kubernetes/helm/gaudi-values.yaml @@ -2,32 +2,26 @@ # SPDX-License-Identifier: Apache-2.0 tgi: + enabled: false + +vllm: + enabled: true accelDevice: "gaudi" - LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct image: - repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.3.1" + repository: opea/vllm-gaudi + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + startupProbe: + failureThreshold: 360 resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "1024" - MAX_TOTAL_TOKENS: "2048" - CUDA_GRAPHS: "" - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - ENABLE_HPU_GRAPH: "true" - LIMIT_HPU_GRAPH: "true" - USE_FLASH_ATTENTION: "true" - FLASH_ATTENTION_RECOMPUTE: "true" - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 120 + extraCmdArgs: [ + "--tensor-parallel-size", "1", + "--block-size", "128", + "--max-num-seqs", "256", + ] + +llm-uservice: + TEXTGEN_BACKEND: vLLM + retryTimeoutSeconds: 720 diff --git a/CodeTrans/kubernetes/helm/cpu-tgi-values.yaml b/CodeTrans/kubernetes/helm/cpu-tgi-values.yaml new file mode 100644 index 000000000..accd8674b --- /dev/null +++ b/CodeTrans/kubernetes/helm/cpu-tgi-values.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: true +vllm: + enabled: false +llm-uservice: + TEXTGEN_BACKEND: TGI diff --git a/CodeTrans/kubernetes/helm/cpu-values.yaml b/CodeTrans/kubernetes/helm/cpu-values.yaml index 313f05075..ccc843b1c 100644 --- a/CodeTrans/kubernetes/helm/cpu-values.yaml +++ b/CodeTrans/kubernetes/helm/cpu-values.yaml @@ -2,4 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 tgi: - LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3 + enabled: false +vllm: + enabled: true +llm-uservice: + TEXTGEN_BACKEND: vLLM diff --git a/CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml b/CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml new file mode 100644 index 000000000..34f5eb4b7 --- /dev/null +++ b/CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: true + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.3.1" + resources: + limits: + habana.ai/gaudi: 1 + MAX_INPUT_LENGTH: "2048" + MAX_TOTAL_TOKENS: "4096" + CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 +vllm: + enabled: false +llm-uservice: + TEXTGEN_BACKEND: TGI diff --git a/CodeTrans/kubernetes/helm/gaudi-values.yaml b/CodeTrans/kubernetes/helm/gaudi-values.yaml index 89ed25928..cac19a63f 100644 --- a/CodeTrans/kubernetes/helm/gaudi-values.yaml +++ b/CodeTrans/kubernetes/helm/gaudi-values.yaml @@ -1,32 +1,33 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +# Accelerate inferencing in heaviest components to improve performance +# by overriding their subchart values + tgi: + enabled: false + +vllm: + enabled: true accelDevice: "gaudi" image: - repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.3.1" + repository: opea/vllm-gaudi + startupProbe: + failureThreshold: 360 + + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "1024" - MAX_TOTAL_TOKENS: "2048" - CUDA_GRAPHS: "" - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - ENABLE_HPU_GRAPH: "true" - LIMIT_HPU_GRAPH: "true" - USE_FLASH_ATTENTION: "true" - FLASH_ATTENTION_RECOMPUTE: "true" - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 120 + extraCmdArgs: [ + "--tensor-parallel-size", "1", + "--block-size", "128", + "--max-num-seqs", "256", + "--max-seq_len-to-capture", "2048" + ] + +llm-uservice: + TEXTGEN_BACKEND: vLLM + retryTimeoutSeconds: 720 diff --git a/DocSum/kubernetes/helm/cpu-tgi-values.yaml b/DocSum/kubernetes/helm/cpu-tgi-values.yaml new file mode 100644 index 000000000..9277da305 --- /dev/null +++ b/DocSum/kubernetes/helm/cpu-tgi-values.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +llm-uservice: + DOCSUM_BACKEND: "TGI" +tgi: + enabled: true +vllm: + enabled: false diff --git a/DocSum/kubernetes/helm/cpu-values.yaml b/DocSum/kubernetes/helm/cpu-values.yaml index 6f2ab7768..677e2a89b 100644 --- a/DocSum/kubernetes/helm/cpu-values.yaml +++ b/DocSum/kubernetes/helm/cpu-values.yaml @@ -1,7 +1,9 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +llm-uservice: + DOCSUM_BACKEND: "vLLM" tgi: - enabled: true -vllm: enabled: false +vllm: + enabled: true diff --git a/DocSum/kubernetes/helm/gaudi-tgi-values.yaml b/DocSum/kubernetes/helm/gaudi-tgi-values.yaml new file mode 100644 index 000000000..818da3cbd --- /dev/null +++ b/DocSum/kubernetes/helm/gaudi-tgi-values.yaml @@ -0,0 +1,32 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +vllm: + enabled: false + +llm-uservice: + DOCSUM_BACKEND: "TGI" + +tgi: + enabled: true + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.3.1" + resources: + limits: + habana.ai/gaudi: 1 + CUDA_GRAPHS: "" + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 diff --git a/DocSum/kubernetes/helm/gaudi-values.yaml b/DocSum/kubernetes/helm/gaudi-values.yaml index eda0abe8c..f0ccc0d21 100644 --- a/DocSum/kubernetes/helm/gaudi-values.yaml +++ b/DocSum/kubernetes/helm/gaudi-values.yaml @@ -1,36 +1,32 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -vllm: +# Accelerate inferencing in heaviest components to improve performance +# by overriding their subchart values + +tgi: enabled: false llm-uservice: - DOCSUM_BACKEND: "TGI" + DOCSUM_BACKEND: "vLLM" + retryTimeoutSeconds: 720 -tgi: +vllm: enabled: true - accelDevice: "gaudi" image: - repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.3.1" + repository: opea/vllm-gaudi resources: limits: habana.ai/gaudi: 1 - CUDA_GRAPHS: "" - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 120 + failureThreshold: 360 + + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + + extraCmdArgs: [ + "--tensor-parallel-size", "1", + "--block-size", "128", + "--max-num-seqs", "256", + "--max-seq_len-to-capture", "2048" + ] diff --git a/SearchQnA/kubernetes/helm/gaudi-values.yaml b/SearchQnA/kubernetes/helm/gaudi-values.yaml index ef327645d..a1abc1a44 100644 --- a/SearchQnA/kubernetes/helm/gaudi-values.yaml +++ b/SearchQnA/kubernetes/helm/gaudi-values.yaml @@ -18,10 +18,6 @@ tgi: LIMIT_HPU_GRAPH: true USE_FLASH_ATTENTION: true FLASH_ATTENTION_RECOMPUTE: true - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 readinessProbe: initialDelaySeconds: 5 periodSeconds: 5 @@ -44,7 +40,5 @@ tei: resources: limits: habana.ai/gaudi: 1 - livenessProbe: - timeoutSeconds: 1 readinessProbe: timeoutSeconds: 1 diff --git a/VisualQnA/kubernetes/helm/cpu-tgi-values.yaml b/VisualQnA/kubernetes/helm/cpu-tgi-values.yaml new file mode 100644 index 000000000..dac19e816 --- /dev/null +++ b/VisualQnA/kubernetes/helm/cpu-tgi-values.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: true +vllm: + enabled: false +lvm-uservice: + LVM_BACKEND: "TGI" diff --git a/VisualQnA/kubernetes/helm/cpu-values.yaml b/VisualQnA/kubernetes/helm/cpu-values.yaml index acc9d4e28..18a42ccd7 100644 --- a/VisualQnA/kubernetes/helm/cpu-values.yaml +++ b/VisualQnA/kubernetes/helm/cpu-values.yaml @@ -1,7 +1,9 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +vllm: + enabled: true tgi: - MAX_INPUT_LENGTH: "4096" - MAX_TOTAL_TOKENS: "8192" - LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf + enabled: false +lvm-uservice: + LVM_BACKEND: "vLLM" diff --git a/VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml b/VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml new file mode 100644 index 000000000..8868f8d93 --- /dev/null +++ b/VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml @@ -0,0 +1,37 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Accelerate inferencing in heaviest components to improve performance +# by overriding their subchart values + +# TGI: largest bottleneck for VisualQnA +tgi: + enabled: true + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.3.1" + resources: + limits: + habana.ai/gaudi: 1 + MAX_INPUT_LENGTH: "4096" + MAX_TOTAL_TOKENS: "8192" + CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 +vllm: + enabled: false +lvm-uservice: + LVM_BACKEND: "TGI" diff --git a/VisualQnA/kubernetes/helm/gaudi-values.yaml b/VisualQnA/kubernetes/helm/gaudi-values.yaml index eb6494a14..bc6d47e9d 100644 --- a/VisualQnA/kubernetes/helm/gaudi-values.yaml +++ b/VisualQnA/kubernetes/helm/gaudi-values.yaml @@ -1,36 +1,24 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Accelerate inferencing in heaviest components to improve performance -# by overriding their subchart values - -# TGI: largest bottleneck for VisualQnA -tgi: +vllm: + enabled: true accelDevice: "gaudi" image: - repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.3.1" + repository: opea/vllm-gaudi + tag: "latest" + LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf + VLLM_SKIP_WARMUP: true + OMPI_MCA_btl_vader_single_copy_mechanism: none + PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false" + extraCmdArgs: ["--tensor-parallel-size","1","--chat-template","examples/template_llava.jinja"] resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "4096" - MAX_TOTAL_TOKENS: "8192" - CUDA_GRAPHS: "" - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - ENABLE_HPU_GRAPH: "true" - LIMIT_HPU_GRAPH: "true" - USE_FLASH_ATTENTION: "true" - FLASH_ATTENTION_RECOMPUTE: "true" - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 120 +tgi: + enabled: false +lvm-uservice: + LVM_BACKEND: "vLLM" + # The default model is not stable on Gaudi, use the older model. + # https://github.com/HabanaAI/vllm-fork/issues/841 + LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf