Sync value files from GenAIInfra (#1428)
All Gaudi values files are updated with extra flags. Helm support is added for two new examples, Text2Image and SearchQnA, along with a minor fix for llm-uservice. Signed-off-by: Dolpher Du <dolpher.du@intel.com>
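The synced values files are meant to be passed to `helm install` / `helm upgrade`. A minimal sketch of applying one of the updated Gaudi values files (the ChatQnA chart URL and file name are assumptions patterned on the SearchQnA and Text2Image README examples added later in this commit):

```bash
# Sketch only: chart location and values file name are assumed, not stated in this commit.
export HFTOKEN="insert-your-huggingface-token-here"
helm install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  -f gaudi-values.yaml
```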
@@ -4,35 +4,13 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

tgi:
vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
  resources:
    limits:
      habana.ai/gaudi: 4
  MAX_INPUT_LENGTH: "4096"
  MAX_TOTAL_TOKENS: "8192"
  CUDA_GRAPHS: ""
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  extraCmdArgs: ["--sharded","true","--num-shard","4"]
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
    repository: opea/vllm-gaudi

supervisor:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
ragagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
sqlagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
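The hunk above points the agent subservices (supervisor, ragagent, sqlagent) at the in-cluster vLLM endpoint. A sketch of the equivalent ad-hoc override, assuming an `agentqna` release and chart name (neither appears in this diff):

```bash
# Hypothetical override; with release name "agentqna" the templated endpoint
# http://{{ .Release.Name }}-vllm resolves to http://agentqna-vllm.
helm upgrade --install agentqna oci://ghcr.io/opea-project/charts/agentqna \
  --set vllm.enabled=true \
  --set supervisor.llm_endpoint_url=http://agentqna-vllm \
  --set ragagent.llm_endpoint_url=http://agentqna-vllm \
  --set sqlagent.llm_endpoint_url=http://agentqna-vllm \
  -f gaudi-values.yaml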
@@ -5,7 +5,7 @@ tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
    failureThreshold: 120

whisper:
  image:
    repository: opea/whisper-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1

speecht5:
  image:
    repository: opea/speecht5-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
ChatQnA/kubernetes/helm/cpu-tgi-values.yaml (new file, 112 lines)
@@ -0,0 +1,112 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.

vllm:
  enabled: false
tgi:
  enabled: true
  # TODO: add Helm value also for TGI data type option:
  # https://github.com/opea-project/GenAIExamples/issues/330
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct

  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
  #resources:
  #  limits:
  #    cpu: 8
  #    memory: 70Gi
  #  requests:
  #    cpu: 6
  #    memory: 65Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 16
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 10
    periodSeconds: 5
    failureThreshold: 180
    timeoutSeconds: 2

teirerank:
  RERANK_MODEL_ID: "BAAI/bge-reranker-base"

  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
  resources:
    limits:
      cpu: 4
      memory: 30Gi
    requests:
      cpu: 2
      memory: 25Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120

tei:
  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"

  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 3Gi

  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 24
    timeoutSeconds: 2
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 2
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120
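As the comments in this file suggest, resource requests can be dropped for a quick functional test by overriding them with `resources: {}`. A minimal sketch, assuming the chart URL pattern from the READMEs added later in this commit:

```bash
# Assumptions: chart URL and release name; the override keys match the values file above.
cat > no-resources.yaml <<'EOF'
tgi:
  resources: {}
teirerank:
  resources: {}
tei:
  resources: {}
EOF
helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  -f cpu-tgi-values.yaml -f no-resources.yaml
```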
@@ -1,109 +1,5 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.

tgi:
  # TODO: add Helm value also for TGI data type option:
  # https://github.com/opea-project/GenAIExamples/issues/330
  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
  resources:
    limits:
      cpu: 8
      memory: 70Gi
    requests:
      cpu: 6
      memory: 65Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 16
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 10
    periodSeconds: 5
    failureThreshold: 180
    timeoutSeconds: 2

teirerank:
  RERANK_MODEL_ID: "BAAI/bge-reranker-base"

  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
  resources:
    limits:
      cpu: 4
      memory: 30Gi
    requests:
      cpu: 2
      memory: 25Gi

  livenessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    failureThreshold: 24
    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
    timeoutSeconds: 4
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120

tei:
  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"

  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 3Gi

  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 24
    timeoutSeconds: 2
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 2
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 120
image:
  repository: opea/chatqna
@@ -4,12 +4,15 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

vllm:
  enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
@@ -6,9 +6,9 @@

tgi:
  enabled: false

vllm:
  enabled: true
  shmSize: 1Gi
  accelDevice: "gaudi"
  image:
    repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
    failureThreshold: 180
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
@@ -39,7 +39,6 @@ vllm:
    "--max-seq_len-to-capture", "2048"
  ]

# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
@@ -44,44 +44,13 @@ teirerank:
  readinessProbe:
    timeoutSeconds: 1

tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
  resources:
    limits:
      habana.ai/gaudi: 1
  # higher limits are needed with extra input tokens added by rerank
  MAX_INPUT_LENGTH: "2048"
  MAX_TOTAL_TOKENS: "4096"
  CUDA_GRAPHS: ""
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120

tgi-guardrails:
  enabled: true
  accelDevice: "gaudi"
  LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
@@ -106,3 +75,38 @@ tgi-guardrails:
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120

tgi:
  enabled: false
vllm:
  enabled: true
  shmSize: 1Gi
  accelDevice: "gaudi"
  image:
    repository: opea/vllm-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 180
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1

  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"

  extraCmdArgs: [
    "--tensor-parallel-size", "1",
    "--block-size", "128",
    "--max-num-seqs", "256",
    "--max-seq_len-to-capture", "2048"
  ]
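The tail of this hunk replaces the TGI serving block with vLLM as the default Gaudi backend. The same switch can be expressed as command-line overrides; a sketch, with the chart reference and values file name assumed:

```bash
# The keys (tgi.enabled, vllm.enabled, vllm.shmSize) come from the hunk above;
# the chart URL is an assumption. Flip the booleans to fall back to TGI.
helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  -f guardrails-gaudi-values.yaml \
  --set tgi.enabled=false \
  --set vllm.enabled=true \
  --set vllm.shmSize=1Gi
```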
@@ -1,14 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

image:
  repository: opea/chatqna-guardrails

# guardrails related config
guardrails-usvc:
  enabled: true
  # SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails"
  SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
tgi-guardrails:
  enabled: true
  LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
@@ -1,25 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# To override values in subchart tgi
tgi:
  accelDevice: "nvidia"
  image:
    repository: ghcr.io/huggingface/text-generation-inference
    tag: "2.2.0"
  resources:
    limits:
      nvidia.com/gpu: 1
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
@@ -1,117 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
ROLLOUT_TIMEOUT_SECONDS="1800s"
|
||||
KUBECTL_TIMEOUT_SECONDS="60s"
|
||||
|
||||
function validate_chatqna() {
|
||||
local ns=$1
|
||||
local log=$2
|
||||
max_retry=20
|
||||
# make sure microservice retriever-usvc is ready
|
||||
# try to curl retriever-svc for max_retry times
|
||||
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
|
||||
curl http://$endpoint_url/v1/retrieval -X POST \
|
||||
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice retriever failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
# make sure microservice tgi-svc is ready
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
|
||||
curl http://$endpoint_url/generate -X POST \
|
||||
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice tgi failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
|
||||
# check megaservice works
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$log.log
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
|
||||
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice failed, please check the logs in $LOGFILE!"
|
||||
return ${exit_code}
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] &&
|
||||
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
if [[ $status == false ]]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
return 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
function install_chatqna() {
|
||||
echo "Testing manifests chatqna_guardrails"
|
||||
local ns=$1
|
||||
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
|
||||
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
|
||||
kubectl create namespace $ns
|
||||
# install guardrails
|
||||
kubectl apply -f chatqna-guardrails.yaml -n $ns
|
||||
# Sleep enough time for chatqna_guardrails to be ready
|
||||
sleep 60
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_ChatQnA)
|
||||
pushd ChatQnA/tests/common
|
||||
bash _test_manifest_utils.sh init_ChatQnA
|
||||
popd
|
||||
;;
|
||||
install_ChatQnA)
|
||||
NAMESPACE=$2
|
||||
install_chatqna $NAMESPACE
|
||||
popd
|
||||
;;
|
||||
validate_ChatQnA)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=chatqna-guardrails
|
||||
validate_chatqna $NAMESPACE chatqna-guardrails
|
||||
ret=$?
|
||||
if [ $ret -ne 0 ]; then
|
||||
exit $ret
|
||||
fi
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -1,117 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
ROLLOUT_TIMEOUT_SECONDS="1800s"
|
||||
KUBECTL_TIMEOUT_SECONDS="60s"
|
||||
|
||||
function validate_chatqna() {
|
||||
local ns=$1
|
||||
local log=$2
|
||||
max_retry=10
|
||||
# make sure microservice retriever-usvc is ready
|
||||
# try to curl retriever-svc for max_retry times
|
||||
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
|
||||
curl http://$endpoint_url/v1/retrieval -X POST \
|
||||
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice retriever failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
# make sure microservice tgi-svc is ready
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
|
||||
curl http://$endpoint_url/generate -X POST \
|
||||
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice tgi failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
|
||||
# check megaservice works
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$log.log
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
|
||||
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice failed, please check the logs in $LOGFILE!"
|
||||
return ${exit_code}
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] &&
|
||||
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
return 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
function install_chatqna() {
|
||||
echo "Testing manifests chatqna_guardrails"
|
||||
local ns=$1
|
||||
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
|
||||
pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
|
||||
kubectl create namespace $ns
|
||||
# install guardrail
|
||||
kubectl apply -f chatqna-guardrails.yaml -n $ns
|
||||
# Sleep enough time for chatqna_guardrails to be ready
|
||||
sleep 60
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_ChatQnA)
|
||||
pushd ChatQnA/tests/common
|
||||
bash _test_manifest_utils.sh init_ChatQnA
|
||||
popd
|
||||
;;
|
||||
install_ChatQnA)
|
||||
NAMESPACE=$2
|
||||
install_chatqna $NAMESPACE
|
||||
popd
|
||||
;;
|
||||
validate_ChatQnA)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=chatqna-guardrails
|
||||
validate_chatqna $NAMESPACE chatqna-guardrails
|
||||
ret=$?
|
||||
if [ $ret -ne 0 ]; then
|
||||
exit $ret
|
||||
fi
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -1,113 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
ROLLOUT_TIMEOUT_SECONDS="1800s"
|
||||
KUBECTL_TIMEOUT_SECONDS="60s"
|
||||
|
||||
function install_chatqna {
|
||||
echo "namespace is $NAMESPACE"
|
||||
kubectl apply -f chatqna.yaml -n $NAMESPACE
|
||||
# Sleep enough time for retreiver-usvc to be ready
|
||||
sleep 60
|
||||
}
|
||||
|
||||
function validate_chatqna() {
|
||||
local ns=$1
|
||||
local log=$2
|
||||
max_retry=20
|
||||
# make sure microservice retriever-usvc is ready
|
||||
# try to curl retriever-svc for max_retry times
|
||||
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
|
||||
curl http://$endpoint_url/v1/retrieval -X POST \
|
||||
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice retriever failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
# make sure microservice tgi-svc is ready
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
|
||||
curl http://$endpoint_url/generate -X POST \
|
||||
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice tgi failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
|
||||
# check megaservice works
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$log.log
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
|
||||
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice failed, please check the logs in $LOGFILE!"
|
||||
return ${exit_code}
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] &&
|
||||
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
return 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_ChatQnA)
|
||||
pushd ChatQnA/tests/common
|
||||
bash _test_manifest_utils.sh init_ChatQnA
|
||||
popd
|
||||
;;
|
||||
install_ChatQnA)
|
||||
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
|
||||
NAMESPACE=$2
|
||||
install_chatqna
|
||||
popd
|
||||
;;
|
||||
validate_ChatQnA)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=chatqna
|
||||
validate_chatqna $NAMESPACE chatqna
|
||||
ret=$?
|
||||
if [ $ret -ne 0 ]; then
|
||||
exit $ret
|
||||
fi
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -1,112 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
ROLLOUT_TIMEOUT_SECONDS="1800s"
|
||||
KUBECTL_TIMEOUT_SECONDS="60s"
|
||||
|
||||
function install_chatqna {
|
||||
echo "namespace is $NAMESPACE"
|
||||
kubectl apply -f chatqna.yaml -n $NAMESPACE
|
||||
# Sleep enough time for retreiver-usvc to be ready
|
||||
sleep 60
|
||||
}
|
||||
|
||||
function validate_chatqna() {
|
||||
local ns=$1
|
||||
local log=$2
|
||||
max_retry=10
|
||||
# make sure microservice retriever-usvc is ready
|
||||
# try to curl retriever-svc for max_retry times
|
||||
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
|
||||
curl http://$endpoint_url/v1/retrieval -X POST \
|
||||
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice retriever failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
# make sure microservice tgi-svc is ready
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
|
||||
curl http://$endpoint_url/generate -X POST \
|
||||
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice tgi failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
|
||||
# check megaservice works
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$log.log
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
|
||||
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice failed, please check the logs in $LOGFILE!"
|
||||
return ${exit_code}
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] &&
|
||||
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
return 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_ChatQnA)
|
||||
pushd ChatQnA/tests/common
|
||||
bash _test_manifest_utils.sh init_ChatQnA
|
||||
popd
|
||||
;;
|
||||
install_ChatQnA)
|
||||
pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
|
||||
NAMESPACE=$2
|
||||
install_chatqna
|
||||
popd
|
||||
;;
|
||||
validate_ChatQnA)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=chatqna
|
||||
validate_chatqna $NAMESPACE chatqna
|
||||
ret=$?
|
||||
if [ $ret -ne 0 ]; then
|
||||
exit $ret
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -1,118 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
ROLLOUT_TIMEOUT_SECONDS="1800s"
|
||||
KUBECTL_TIMEOUT_SECONDS="60s"
|
||||
|
||||
function validate_chatqna() {
|
||||
local ns=$1
|
||||
local log=$2
|
||||
max_retry=20
|
||||
# make sure microservice retriever-usvc is ready
|
||||
# try to curl retriever-svc for max_retry times
|
||||
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
|
||||
curl http://$endpoint_url/v1/retrieval -X POST \
|
||||
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice retriever failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
|
||||
# make sure microservice vllm-svc is ready
|
||||
for ((i=1; i<=max_retry; i++))
|
||||
do
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-vllm" $ns)
|
||||
curl http://$endpoint_url/v1/chat/completions -X POST \
|
||||
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
|
||||
-H 'Content-Type: application/json' && break
|
||||
sleep 30
|
||||
done
|
||||
# if i is bigger than max_retry, then exit with error
|
||||
if [ $i -gt $max_retry ]; then
|
||||
echo "Microservice vllm failed, exit with error."
|
||||
return 1
|
||||
fi
|
||||
|
||||
# check megaservice works
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$log.log
|
||||
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
|
||||
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice failed, please check the logs in $LOGFILE!"
|
||||
return ${exit_code}
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] &&
|
||||
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
return 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
function install_chatqna() {
|
||||
echo "Testing manifests chatqna_vllm"
|
||||
local ns=$1
|
||||
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
|
||||
kubectl create namespace $ns
|
||||
# install guardrail
|
||||
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
|
||||
kubectl apply -f chatqna-vllm.yaml -n $ns
|
||||
# Sleep enough time for chatqna_vllm to be ready, vllm warmup takes about 5 minutes
|
||||
sleep 280
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_ChatQnA)
|
||||
pushd ChatQnA/tests/common
|
||||
bash _test_manifest_utils.sh init_ChatQnA
|
||||
popd
|
||||
;;
|
||||
install_ChatQnA)
|
||||
NAMESPACE=$2
|
||||
install_chatqna $NAMESPACE
|
||||
popd
|
||||
;;
|
||||
validate_ChatQnA)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=chatqna-vllm
|
||||
validate_chatqna $NAMESPACE chatqna-vllm
|
||||
ret=$?
|
||||
if [ $ret -ne 0 ]; then
|
||||
exit $ret
|
||||
fi
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -6,13 +6,18 @@ tgi:
  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: "1024"
  MAX_TOTAL_TOKENS: "2048"
  CUDA_GRAPHS: ""
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
@@ -1,85 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
function init_codegen() {
|
||||
# executed under path manifest/codegen/xeon
|
||||
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
|
||||
# replace microservice image tag
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
|
||||
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
|
||||
# set huggingface token
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
|
||||
}
|
||||
|
||||
function install_codegen {
|
||||
echo "namespace is $NAMESPACE"
|
||||
kubectl apply -f codegen.yaml -n $NAMESPACE
|
||||
}
|
||||
|
||||
function validate_codegen() {
|
||||
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
|
||||
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
|
||||
echo "try to curl http://${ip_address}:${port}/v1/codegen..."
|
||||
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
|
||||
# Curl the Mega Service
|
||||
curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
|
||||
-d '{"messages": "def print_hello_world():"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] && \
|
||||
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
exit 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_CodeGen)
|
||||
pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
|
||||
init_codegen
|
||||
popd
|
||||
;;
|
||||
install_CodeGen)
|
||||
pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
|
||||
NAMESPACE=$2
|
||||
install_codegen
|
||||
popd
|
||||
;;
|
||||
validate_CodeGen)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=codegen
|
||||
validate_codegen
|
||||
;;
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -1,85 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
function init_codegen() {
|
||||
# executed under path manifest/codegen/xeon
|
||||
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
|
||||
# replace microservice image tag
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
|
||||
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
|
||||
# set huggingface token
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
|
||||
}
|
||||
|
||||
function install_codegen {
|
||||
echo "namespace is $NAMESPACE"
|
||||
kubectl apply -f codegen.yaml -n $NAMESPACE
|
||||
}
|
||||
|
||||
function validate_codegen() {
|
||||
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
|
||||
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
|
||||
echo "try to curl http://${ip_address}:${port}/v1/codegen..."
|
||||
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
|
||||
# Curl the Mega Service
|
||||
curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
|
||||
-d '{"messages": "def print_hello_world():"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] && \
|
||||
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
exit 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_CodeGen)
|
||||
pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
|
||||
init_codegen
|
||||
popd
|
||||
;;
|
||||
install_CodeGen)
|
||||
pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
|
||||
NAMESPACE=$2
|
||||
install_codegen
|
||||
popd
|
||||
;;
|
||||
validate_CodeGen)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=codegen
|
||||
validate_codegen
|
||||
;;
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -5,13 +5,18 @@ tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: "1024"
  MAX_TOTAL_TOKENS: "2048"
  CUDA_GRAPHS: ""
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
@@ -1,86 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
function init_codetrans() {
|
||||
# executed under path manifest/codetrans/xeon
|
||||
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
|
||||
# replace microservice image tag
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
|
||||
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
|
||||
# set huggingface token
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
|
||||
}
|
||||
|
||||
function install_codetrans {
|
||||
echo "namespace is $NAMESPACE"
|
||||
kubectl apply -f codetrans.yaml -n $NAMESPACE
|
||||
}
|
||||
|
||||
function validate_codetrans() {
|
||||
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
|
||||
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
|
||||
echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
|
||||
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
|
||||
# Curl the Mega Service
|
||||
curl http://${ip_address}:${port}/v1/codetrans \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] && \
|
||||
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
exit 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_CodeTrans)
|
||||
pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
|
||||
init_codetrans
|
||||
popd
|
||||
;;
|
||||
install_CodeTrans)
|
||||
pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
|
||||
NAMESPACE=$2
|
||||
install_codetrans
|
||||
popd
|
||||
;;
|
||||
validate_CodeTrans)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=codetrans
|
||||
validate_codetrans
|
||||
;;
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -1,86 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
function init_codetrans() {
|
||||
# executed under path manifest/codetrans/xeon
|
||||
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
|
||||
# replace microservice image tag
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
|
||||
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
|
||||
# set huggingface token
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
|
||||
}
|
||||
|
||||
function install_codetrans {
|
||||
echo "namespace is $NAMESPACE"
|
||||
kubectl apply -f codetrans.yaml -n $NAMESPACE
|
||||
}
|
||||
|
||||
function validate_codetrans() {
|
||||
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
|
||||
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
|
||||
echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
|
||||
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
|
||||
# Curl the Mega Service
|
||||
curl http://${ip_address}:${port}/v1/codetrans \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] && \
|
||||
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
exit 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_CodeTrans)
|
||||
pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
|
||||
init_codetrans
|
||||
popd
|
||||
;;
|
||||
install_CodeTrans)
|
||||
pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
|
||||
NAMESPACE=$2
|
||||
install_codetrans
|
||||
popd
|
||||
;;
|
||||
validate_CodeTrans)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=codetrans
|
||||
validate_codetrans
|
||||
;;
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -2,4 +2,6 @@
# SPDX-License-Identifier: Apache-2.0

tgi:
  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
  enabled: true
vllm:
  enabled: false
@@ -1,16 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

vllm:
  enabled: false

llm-uservice:
  DOCSUM_BACKEND: "TGI"

tgi:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: "1024"
  MAX_TOTAL_TOKENS: "2048"
  CUDA_GRAPHS: ""
  ENABLE_HPU_GRAPH: true
  LIMIT_HPU_GRAPH: true
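This DocSum hunk selects TGI as the summarization backend both for the serving subchart and for the llm-uservice wrapper (the "minor fix for llm-uservice" mentioned in the commit message). A sketch of the equivalent overrides, with the chart reference and release name assumed:

```bash
# DOCSUM_BACKEND, tgi.enabled and vllm.enabled are the keys shown above;
# the chart URL is an assumption.
helm upgrade --install docsum oci://ghcr.io/opea-project/charts/docsum \
  -f gaudi-values.yaml \
  --set llm-uservice.DOCSUM_BACKEND=TGI \
  --set tgi.enabled=true --set vllm.enabled=false
```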
@@ -1,87 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
function init_docsum() {
|
||||
# executed under path manifest/docsum/xeon
|
||||
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
|
||||
# replace microservice image tag
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
|
||||
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
|
||||
# set huggingface token
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
|
||||
}
|
||||
|
||||
function install_docsum {
|
||||
echo "namespace is $NAMESPACE"
|
||||
kubectl apply -f docsum.yaml -n $NAMESPACE
|
||||
}
|
||||
|
||||
function validate_docsum() {
|
||||
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
|
||||
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
|
||||
echo "try to curl http://${ip_address}:${port}/v1/docsum..."
|
||||
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
|
||||
# Curl the Mega Service
|
||||
curl http://${ip_address}:${port}/v1/docsum \
|
||||
-H 'Content-Type: multipart/form-data' \
|
||||
-F 'type=text' \
|
||||
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] && \
|
||||
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
exit 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_DocSum)
|
||||
pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
|
||||
init_docsum
|
||||
popd
|
||||
;;
|
||||
install_DocSum)
|
||||
pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
|
||||
NAMESPACE=$2
|
||||
install_docsum
|
||||
popd
|
||||
;;
|
||||
validate_DocSum)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=docsum
|
||||
validate_docsum
|
||||
;;
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -1,87 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
USER_ID=$(whoami)
|
||||
LOG_PATH=/home/$(whoami)/logs
|
||||
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
|
||||
IMAGE_REPO=${IMAGE_REPO:-opea}
|
||||
IMAGE_TAG=${IMAGE_TAG:-latest}
|
||||
|
||||
function init_docsum() {
|
||||
# executed under path manifest/docsum/xeon
|
||||
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
|
||||
# replace microservice image tag
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
|
||||
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/opea/"
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
|
||||
# set huggingface token
|
||||
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
|
||||
}
|
||||
|
||||
function install_docsum {
|
||||
echo "namespace is $NAMESPACE"
|
||||
kubectl apply -f docsum.yaml -n $NAMESPACE
|
||||
}
|
||||
|
||||
function validate_docsum() {
|
||||
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
|
||||
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
|
||||
echo "try to curl http://${ip_address}:${port}/v1/docsum..."
|
||||
|
||||
# generate a random logfile name to avoid conflict among multiple runners
|
||||
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
|
||||
# Curl the Mega Service
|
||||
curl http://${ip_address}:${port}/v1/docsum \
|
||||
-H 'Content-Type: multipart/form-data' \
|
||||
-F 'type=text' \
|
||||
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Checking response results, make sure the output is reasonable. "
|
||||
local status=false
|
||||
if [[ -f $LOGFILE ]] && \
|
||||
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
|
||||
status=true
|
||||
fi
|
||||
|
||||
if [ $status == false ]; then
|
||||
echo "Response check failed, please check the logs in artifacts!"
|
||||
exit 1
|
||||
else
|
||||
echo "Response check succeed!"
|
||||
fi
|
||||
}
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <function_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
init_DocSum)
|
||||
pushd DocSum/kubernetes/intel/cpu/xeon/manifest
|
||||
init_docsum
|
||||
popd
|
||||
;;
|
||||
install_DocSum)
|
||||
pushd DocSum/kubernetes/intel/cpu/xeon/manifest
|
||||
NAMESPACE=$2
|
||||
install_docsum
|
||||
popd
|
||||
;;
|
||||
validate_DocSum)
|
||||
NAMESPACE=$2
|
||||
SERVICE_NAME=docsum
|
||||
validate_docsum
|
||||
;;
|
||||
*)
|
||||
echo "Unknown function: $1"
|
||||
;;
|
||||
esac
|
||||
@@ -5,13 +5,25 @@ tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: "4096"
  MAX_TOTAL_TOKENS: "8192"
  MAX_INPUT_LENGTH: "1024"
  MAX_TOTAL_TOKENS: "2048"
  CUDA_GRAPHS: "0"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  PREFILL_BATCH_BUCKET_SIZE: 1
  BATCH_BUCKET_SIZE: 8
  extraCmdArgs:
    - "--max-batch-total-tokens"
    - "65536"
    - "--max-batch-prefill-tokens"
    - "4096"
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
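The hunk above tightens the TGI token limits and adds explicit batch-bucket and max-batch arguments. If those need per-deployment tuning, an extra override file can be layered on top; a sketch with illustrative numbers only (the chart/release name and the 32768/2048 values are placeholders, not part of this commit):

```bash
# Illustrative tuning override; key names match the values shown above.
cat > tgi-batch-tuning.yaml <<'EOF'
tgi:
  extraCmdArgs:
    - "--max-batch-total-tokens"
    - "32768"
    - "--max-batch-prefill-tokens"
    - "2048"
EOF
helm upgrade --install faqgen oci://ghcr.io/opea-project/charts/faqgen \
  -f gaudi-values.yaml -f tgi-batch-tuning.yaml
```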
SearchQnA/kubernetes/helm/README.md (new file, 18 lines)
@@ -0,0 +1,18 @@
# Deploy SearchQnA on Kubernetes cluster

- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).

## Deploy on Xeon

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
```

## Deploy on Gaudi

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
```
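After either install command, deployment health can be checked with standard Helm and kubectl tooling; a brief sketch (no SearchQnA-specific endpoints are assumed):

```bash
# Wait for the SearchQnA pods to become Ready, then inspect the release.
kubectl get pods --watch
helm status searchqna
```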
SearchQnA/kubernetes/helm/cpu-values.yaml (new file, 7 lines)
@@ -0,0 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
llm_uservice:
  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
50
SearchQnA/kubernetes/helm/gaudi-values.yaml
Normal file
@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: "2048"
  MAX_TOTAL_TOKENS: "4096"
  CUDA_GRAPHS: ""
  HF_HUB_DISABLE_PROGRESS_BARS: 1
  HF_HUB_ENABLE_HF_TRANSFER: 0
  ENABLE_HPU_GRAPH: true
  LIMIT_HPU_GRAPH: true
  USE_FLASH_ATTENTION: true
  FLASH_ATTENTION_RECOMPUTE: true
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120

tei:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tei-gaudi
    tag: "1.5.0"
  OMPI_MCA_btl_vader_single_copy_mechanism: none
  MAX_WARMUP_SEQUENCE_LENGTH: 512
  securityContext:
    readOnlyRootFilesystem: false
  resources:
    limits:
      habana.ai/gaudi: 1
  livenessProbe:
    timeoutSeconds: 1
  readinessProbe:
    timeoutSeconds: 1
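Both the tgi and tei sections above request habana.ai/gaudi: 1, so a node needs at least two allocatable Gaudi devices for this values file to schedule. A minimal sketch for confirming what the device plugin advertises (output format depends on the plugin version):

```
# show Gaudi capacity/allocatable as reported by the Habana device plugin
kubectl describe nodes | grep -i "habana.ai/gaudi"
```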
18
Text2Image/kubernetes/helm/README.md
Normal file
@@ -0,0 +1,18 @@
# Deploy txt2img on Kubernetes cluster

- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).

## Deploy on Xeon

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
```

## Deploy on Gaudi

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
```
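As with SearchQnA, a smoke test can be sketched as follows; the 9379 port, /v1/text2image route and payload shape follow common OPEA text2image microservice defaults and are assumptions here, as is the service name:

```
kubectl port-forward svc/txt2img 9379:9379 &
# request a single image for a test prompt (endpoint and payload assumed, adjust to the deployed chart)
curl http://localhost:9379/v1/text2image \
  -H "Content-Type: application/json" \
  -d '{"prompt": "An astronaut riding a green horse", "num_images_per_prompt": 1}'
```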
6
Text2Image/kubernetes/helm/cpu-values.yaml
Normal file
@@ -0,0 +1,6 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

text2image:
  image:
    repository: opea/text2image
30
Text2Image/kubernetes/helm/gaudi-values.yaml
Normal file
@@ -0,0 +1,30 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

text2image:
  accelDevice: "gaudi"
  image:
    repository: opea/text2image-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
      # The following hugepage settings are for the default model stable-diffusion-v1-5/stable-diffusion-v1-5.
      # Users should adjust the resource limits for other models.
      hugepages-2Mi: 256Mi
  volumes:
    - name: hugepage-2mi
      emptyDir:
        medium: HugePages-2Mi
  volumeMounts:
    - name: hugepage-2mi
      mountPath: /hugepages-2Mi
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
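The hugepages-2Mi: 256Mi limit above only schedules if the node pre-allocates 2Mi hugepages. A short sketch for checking and, if needed, reserving them (128 pages x 2Mi = 256Mi; run the second command on the node itself, and note the kubelet may need a restart before the new capacity is advertised):

```
# check what the node currently advertises
kubectl describe nodes | grep -i hugepages-2Mi
# reserve 128 x 2Mi pages on the node (example value matching the 256Mi limit above)
echo 128 | sudo tee /proc/sys/vm/nr_hugepages
```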
@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}

function init_translation() {
    # executed under path manifest/translation/xeon
    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
    # replace microservice image tag
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
    # set huggingface token
    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}

function install_translation {
    echo "namespace is $NAMESPACE"
    kubectl apply -f translation.yaml -n $NAMESPACE
    sleep 50s
}

function validate_translation() {
    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
    echo "try to curl http://${ip_address}:${port}/v1/translation..."

    # generate a random logfile name to avoid conflict among multiple runners
    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
    # Curl the Mega Service
    curl http://${ip_address}:${port}/v1/translation \
        -H 'Content-Type: application/json' \
        -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        echo "Megaservice translation failed, please check the logs in $LOGFILE!"
        exit 1
    fi

    echo "Checking response results, make sure the output is reasonable."
    local status=false
    if [[ -f $LOGFILE ]] && \
       [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
        status=true
    fi

    if [ $status == false ]; then
        echo "Response check failed, please check the logs in artifacts!"
    else
        echo "Response check succeeded!"
    fi
}

if [ $# -eq 0 ]; then
    echo "Usage: $0 <function_name>"
    exit 1
fi

case "$1" in
    init_Translation)
        pushd Translation/kubernetes/intel/hpu/gaudi/manifest
        init_translation
        popd
        ;;
    install_Translation)
        pushd Translation/kubernetes/intel/hpu/gaudi/manifest
        NAMESPACE=$2
        install_translation
        popd
        ;;
    validate_Translation)
        NAMESPACE=$2
        SERVICE_NAME=translation
        validate_translation
        ;;
    *)
        echo "Unknown function: $1"
        ;;
esac
@@ -1,85 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}

function init_translation() {
    # executed under path manifest/translation/xeon
    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
    # replace microservice image tag
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
    # set huggingface token
    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}

function install_translation {
    echo "namespace is $NAMESPACE"
    kubectl apply -f translation.yaml -n $NAMESPACE
}

function validate_translation() {
    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
    echo "try to curl http://${ip_address}:${port}/v1/translation..."

    # generate a random logfile name to avoid conflict among multiple runners
    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
    # Curl the Mega Service
    curl http://${ip_address}:${port}/v1/translation \
        -H 'Content-Type: application/json' \
        -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        echo "Megaservice translation failed, please check the logs in $LOGFILE!"
        exit 1
    fi

    echo "Checking response results, make sure the output is reasonable."
    local status=false
    if [[ -f $LOGFILE ]] && \
       [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
        status=true
    fi

    if [ $status == false ]; then
        echo "Response check failed, please check the logs in artifacts!"
    else
        echo "Response check succeeded!"
    fi
}

if [ $# -eq 0 ]; then
    echo "Usage: $0 <function_name>"
    exit 1
fi

case "$1" in
    init_Translation)
        pushd Translation/kubernetes/intel/cpu/xeon/manifest
        init_translation
        popd
        ;;
    install_Translation)
        pushd Translation/kubernetes/intel/cpu/xeon/manifest
        NAMESPACE=$2
        install_translation
        popd
        ;;
    validate_Translation)
        NAMESPACE=$2
        SERVICE_NAME=translation
        validate_translation
        ;;
    *)
        echo "Unknown function: $1"
        ;;
esac
@@ -9,13 +9,18 @@ tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.0.6"
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: "4096"
  MAX_TOTAL_TOKENS: "8192"
  CUDA_GRAPHS: ""
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5