Sync value files from GenAIInfra (#1428)

All Gaudi values files were updated with extra flags.
Added Helm support for two new examples, Text2Image and SearchQnA, and a minor fix for llm-uservice.

Signed-off-by: Dolpher Du <dolpher.du@intel.com>
Authored by dolpher on 2025-01-22 17:44:11 +08:00, committed by GitHub
parent 5c36443b11
commit ee0e5cc8d9
34 changed files with 343 additions and 1487 deletions

View File

@@ -4,35 +4,13 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
tgi:
vllm:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
resources:
limits:
habana.ai/gaudi: 4
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
extraCmdArgs: ["--sharded","true","--num-shard","4"]
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
repository: opea/vllm-gaudi
supervisor:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
ragagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
sqlagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm

View File

@@ -5,7 +5,7 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
failureThreshold: 120
whisper:
image:
repository: opea/whisper-gaudi
resources:
limits:
habana.ai/gaudi: 1
speecht5:
image:
repository: opea/speecht5-gaudi
resources:
limits:
habana.ai/gaudi: 1

View File

@@ -0,0 +1,112 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.
vllm:
enabled: false
tgi:
enabled: true
# TODO: add Helm value also for TGI data type option:
# https://github.com/opea-project/GenAIExamples/issues/330
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
# Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
#resources:
# limits:
# cpu: 8
# memory: 70Gi
# requests:
# cpu: 6
# memory: 65Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 16
periodSeconds: 8
timeoutSeconds: 4
startupProbe:
initialDelaySeconds: 10
periodSeconds: 5
failureThreshold: 180
timeoutSeconds: 2
teirerank:
RERANK_MODEL_ID: "BAAI/bge-reranker-base"
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
resources:
limits:
cpu: 4
memory: 30Gi
requests:
cpu: 2
memory: 25Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 8
periodSeconds: 8
timeoutSeconds: 4
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 120
tei:
EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
resources:
limits:
cpu: 4
memory: 4Gi
requests:
cpu: 2
memory: 3Gi
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 24
timeoutSeconds: 2
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 2
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 120
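
These CPU overrides are consumed like any other Helm values file. A minimal sketch of applying them, assuming this file belongs to the ChatQnA chart and is saved locally as `cpu-values.yaml` (the diff view omits filenames, so the chart reference, release name, and filename here are assumptions):

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  -f cpu-values.yaml
# To test a service without resource requests, set "resources: {}" for it
# in a local copy of this values file, as the comment block above suggests.
```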

View File

@@ -1,109 +1,5 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.
tgi:
# TODO: add Helm value also for TGI data type option:
# https://github.com/opea-project/GenAIExamples/issues/330
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
# Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
resources:
limits:
cpu: 8
memory: 70Gi
requests:
cpu: 6
memory: 65Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 16
periodSeconds: 8
timeoutSeconds: 4
startupProbe:
initialDelaySeconds: 10
periodSeconds: 5
failureThreshold: 180
timeoutSeconds: 2
teirerank:
RERANK_MODEL_ID: "BAAI/bge-reranker-base"
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
resources:
limits:
cpu: 4
memory: 30Gi
requests:
cpu: 2
memory: 25Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 8
periodSeconds: 8
timeoutSeconds: 4
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 120
tei:
EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
resources:
limits:
cpu: 4
memory: 4Gi
requests:
cpu: 2
memory: 3Gi
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 24
timeoutSeconds: 2
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 2
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 120
image:
repository: opea/chatqna

View File

@@ -4,12 +4,15 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
vllm:
enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1

View File

@@ -6,9 +6,9 @@
tgi:
enabled: false
vllm:
enabled: true
shmSize: 1Gi
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
@@ -39,7 +39,6 @@ vllm:
"--max-seq_len-to-capture", "2048"
]
# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#

View File

@@ -44,44 +44,13 @@ teirerank:
readinessProbe:
timeoutSeconds: 1
tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
resources:
limits:
habana.ai/gaudi: 1
# higher limits are needed with extra input tokens added by rerank
MAX_INPUT_LENGTH: "2048"
MAX_TOTAL_TOKENS: "4096"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
tgi-guardrails:
enabled: true
accelDevice: "gaudi"
LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
@@ -106,3 +75,38 @@ tgi-guardrails:
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
tgi:
enabled: false
vllm:
enabled: true
shmSize: 1Gi
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
resources:
limits:
habana.ai/gaudi: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
]

View File

@@ -1,14 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
image:
repository: opea/chatqna-guardrails
# guardrails related config
guardrails-usvc:
enabled: true
# SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails"
SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
tgi-guardrails:
enabled: true
LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"

View File

@@ -1,25 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# To override values in subchart tgi
tgi:
accelDevice: "nvidia"
image:
repository: ghcr.io/huggingface/text-generation-inference
tag: "2.2.0"
resources:
limits:
nvidia.com/gpu: 1
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120

View File

@@ -1,117 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=20
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice tgi-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
curl http://$endpoint_url/generate -X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice tgi failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [[ $status == false ]]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
function install_chatqna() {
echo "Testing manifests chatqna_guardrails"
local ns=$1
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
kubectl create namespace $ns
# install guardrails
kubectl apply -f chatqna-guardrails.yaml -n $ns
# Sleep enough time for chatqna_guardrails to be ready
sleep 60
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
NAMESPACE=$2
install_chatqna $NAMESPACE
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna-guardrails
validate_chatqna $NAMESPACE chatqna-guardrails
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,117 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=10
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice tgi-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
curl http://$endpoint_url/generate -X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice tgi failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
function install_chatqna() {
echo "Testing manifests chatqna_guardrails"
local ns=$1
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
kubectl create namespace $ns
# install guardrail
kubectl apply -f chatqna-guardrails.yaml -n $ns
# Sleep enough time for chatqna_guardrails to be ready
sleep 60
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
NAMESPACE=$2
install_chatqna $NAMESPACE
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna-guardrails
validate_chatqna $NAMESPACE chatqna-guardrails
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,113 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function install_chatqna {
echo "namespace is $NAMESPACE"
kubectl apply -f chatqna.yaml -n $NAMESPACE
# Sleep enough time for retreiver-usvc to be ready
sleep 60
}
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=20
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice tgi-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
curl http://$endpoint_url/generate -X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice tgi failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_chatqna
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna
validate_chatqna $NAMESPACE chatqna
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,112 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function install_chatqna {
echo "namespace is $NAMESPACE"
kubectl apply -f chatqna.yaml -n $NAMESPACE
# Sleep enough time for retreiver-usvc to be ready
sleep 60
}
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=10
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice tgi-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
curl http://$endpoint_url/generate -X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice tgi failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_chatqna
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna
validate_chatqna $NAMESPACE chatqna
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,118 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=20
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice vllm-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-vllm" $ns)
curl http://$endpoint_url/v1/chat/completions -X POST \
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice vllm failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
function install_chatqna() {
echo "Testing manifests chatqna_vllm"
local ns=$1
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
kubectl create namespace $ns
# install guardrail
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
kubectl apply -f chatqna-vllm.yaml -n $ns
# Sleep enough time for chatqna_vllm to be ready, vllm warmup takes about 5 minutes
sleep 280
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
NAMESPACE=$2
install_chatqna $NAMESPACE
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna-vllm
validate_chatqna $NAMESPACE chatqna-vllm
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -6,13 +6,18 @@ tgi:
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5

View File

@@ -1,85 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_codegen() {
# executed under path manifest/codegen/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_codegen {
echo "namespace is $NAMESPACE"
kubectl apply -f codegen.yaml -n $NAMESPACE
}
function validate_codegen() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/codegen..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
-d '{"messages": "def print_hello_world():"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_CodeGen)
pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
init_codegen
popd
;;
install_CodeGen)
pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_codegen
popd
;;
validate_CodeGen)
NAMESPACE=$2
SERVICE_NAME=codegen
validate_codegen
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,85 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_codegen() {
# executed under path manifest/codegen/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_codegen {
echo "namespace is $NAMESPACE"
kubectl apply -f codegen.yaml -n $NAMESPACE
}
function validate_codegen() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/codegen..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
-d '{"messages": "def print_hello_world():"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_CodeGen)
pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
init_codegen
popd
;;
install_CodeGen)
pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_codegen
popd
;;
validate_CodeGen)
NAMESPACE=$2
SERVICE_NAME=codegen
validate_codegen
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -5,13 +5,18 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5

View File

@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_codetrans() {
# executed under path manifest/codetrans/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_codetrans {
echo "namespace is $NAMESPACE"
kubectl apply -f codetrans.yaml -n $NAMESPACE
}
function validate_codetrans() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/codetrans \
-H 'Content-Type: application/json' \
-d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_CodeTrans)
pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
init_codetrans
popd
;;
install_CodeTrans)
pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_codetrans
popd
;;
validate_CodeTrans)
NAMESPACE=$2
SERVICE_NAME=codetrans
validate_codetrans
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_codetrans() {
# executed under path manifest/codetrans/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_codetrans {
echo "namespace is $NAMESPACE"
kubectl apply -f codetrans.yaml -n $NAMESPACE
}
function validate_codetrans() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/codetrans \
-H 'Content-Type: application/json' \
-d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_CodeTrans)
pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
init_codetrans
popd
;;
install_CodeTrans)
pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_codetrans
popd
;;
validate_CodeTrans)
NAMESPACE=$2
SERVICE_NAME=codetrans
validate_codetrans
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -2,4 +2,6 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
enabled: true
vllm:
enabled: false

View File

@@ -1,16 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
vllm:
enabled: false
llm-uservice:
DOCSUM_BACKEND: "TGI"
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true

View File

@@ -1,87 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_docsum() {
# executed under path manifest/docsum/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_docsum {
echo "namespace is $NAMESPACE"
kubectl apply -f docsum.yaml -n $NAMESPACE
}
function validate_docsum() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/docsum..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/docsum \
-H 'Content-Type: multipart/form-data' \
-F 'type=text' \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_DocSum)
pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
init_docsum
popd
;;
install_DocSum)
pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_docsum
popd
;;
validate_DocSum)
NAMESPACE=$2
SERVICE_NAME=docsum
validate_docsum
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,87 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_docsum() {
# executed under path manifest/docsum/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/opea/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_docsum {
echo "namespace is $NAMESPACE"
kubectl apply -f docsum.yaml -n $NAMESPACE
}
function validate_docsum() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/docsum..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/docsum \
-H 'Content-Type: multipart/form-data' \
-F 'type=text' \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_DocSum)
pushd DocSum/kubernetes/intel/cpu/xeon/manifest
init_docsum
popd
;;
install_DocSum)
pushd DocSum/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_docsum
popd
;;
validate_DocSum)
NAMESPACE=$2
SERVICE_NAME=docsum
validate_docsum
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -5,13 +5,25 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: "0"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
PREFILL_BATCH_BUCKET_SIZE: 1
BATCH_BUCKET_SIZE: 8
extraCmdArgs:
- "--max-batch-total-tokens"
- "65536"
- "--max-batch-prefill-tokens"
- "4096"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5

View File

@@ -0,0 +1,18 @@
# Deploy SearchQnA on Kubernetes cluster
- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).
## Deploy on Xeon
```
export HFTOKEN="insert-your-huggingface-token-here"
helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
```
## Deploy on Gaudi
```
export HFTOKEN="insert-your-huggingface-token-here"
helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
```
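After the install completes, a quick smoke test can be run against the gateway. This is a hedged sketch, assuming the release is named `searchqna` and the gateway service listens on port 3008 with the `/v1/searchqna` path; verify the actual service name and port with `kubectl get svc`.
```
# wait until all pods of the release are Running and Ready
kubectl get pods

# forward the gateway service to localhost (service name and port are assumptions)
kubectl port-forward svc/searchqna 8080:3008 &

# send a test query to the SearchQnA endpoint
curl http://localhost:8080/v1/searchqna \
  -H 'Content-Type: application/json' \
  -d '{"messages": "What is the latest news about open source AI?"}'
```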

View File

@@ -0,0 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
llm_uservice:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

View File

@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "2048"
MAX_TOTAL_TOKENS: "4096"
CUDA_GRAPHS: ""
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
tei:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tei-gaudi
tag: "1.5.0"
OMPI_MCA_btl_vader_single_copy_mechanism: none
MAX_WARMUP_SEQUENCE_LENGTH: 512
securityContext:
readOnlyRootFilesystem: false
resources:
limits:
habana.ai/gaudi: 1
livenessProbe:
timeoutSeconds: 1
readinessProbe:
timeoutSeconds: 1

View File

@@ -0,0 +1,18 @@
# Deploy txt2img on Kubernetes cluster
- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).
## Deploy on Xeon
```
export HFTOKEN="insert-your-huggingface-token-here"
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
```
## Deploy on Gaudi
```
export HFTOKEN="insert-your-huggingface-token-here"
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
```
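A similar smoke test can be run once the pods are ready. This is a sketch only; the service name `txt2img`, port 9379, and the `/v1/text2image` request format are assumptions taken from the OPEA text2image microservice defaults, so confirm them with `kubectl get svc` and the chart README.
```
# forward the text2image service to localhost (service name and port are assumptions)
kubectl port-forward svc/txt2img 8080:9379 &

# request a single image for a test prompt
curl http://localhost:8080/v1/text2image \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "An astronaut riding a green horse", "num_images_per_prompt": 1}'
```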

View File

@@ -0,0 +1,6 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
text2image:
image:
repository: opea/text2image

View File

@@ -0,0 +1,30 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
text2image:
accelDevice: "gaudi"
image:
repository: opea/text2image-gaudi
resources:
limits:
habana.ai/gaudi: 1
# The following hugepage-related settings are for the default MODEL stable-diffusion-v1-5/stable-diffusion-v1-5
# Users should change the resource limits for other models
hugepages-2Mi: 256Mi
volumes:
- name: hugepage-2mi
emptyDir:
medium: HugePages-2Mi
volumeMounts:
- name: hugepage-2mi
mountPath: /hugepages-2Mi
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
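
Per the comment above, the hugepage limit is sized for the default stable-diffusion-v1-5 model and may need adjusting for other models. A hedged sketch of overriding it at install time, assuming the chart exposes the model through a `MODEL` value and using 512Mi purely as an illustrative figure:

```
# HFTOKEN exported as in the deploy examples above
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  -f gaudi-values.yaml \
  --set text2image.MODEL="stabilityai/stable-diffusion-2-1" \
  --set text2image.resources.limits.hugepages-2Mi=512Mi
```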

View File

@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_translation() {
# executed under path manifest/translation/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_translation {
echo "namespace is $NAMESPACE"
kubectl apply -f translation.yaml -n $NAMESPACE
sleep 50s
}
function validate_translation() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/translation..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/translation \
-H 'Content-Type: application/json' \
-d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice translation failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_Translation)
pushd Translation/kubernetes/intel/hpu/gaudi/manifest
init_translation
popd
;;
install_Translation)
pushd Translation/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_translation
popd
;;
validate_Translation)
NAMESPACE=$2
SERVICE_NAME=translation
validate_translation
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,85 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_translation() {
# executed under path manifest/translation/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_translation {
echo "namespace is $NAMESPACE"
kubectl apply -f translation.yaml -n $NAMESPACE
}
function validate_translation() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/translation..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/translation \
-H 'Content-Type: application/json' \
-d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice translation failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_Translation)
pushd Translation/kubernetes/intel/cpu/xeon/manifest
init_translation
popd
;;
install_Translation)
pushd Translation/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_translation
popd
;;
validate_Translation)
NAMESPACE=$2
SERVICE_NAME=translation
validate_translation
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -9,13 +9,18 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5