Add support for latest deepseek models on Gaudi (#1491)

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

Author: Liang Lv (committed by GitHub)
Date: 2025-02-05 08:30:04 +08:00
Commit: 9adf7a6af0 (parent: a4d028e8ea)
10 changed files with 14 additions and 5 deletions

@@ -10,7 +10,7 @@ Quick Start:
 2. Run Docker Compose.
 3. Consume the ChatQnA Service.
-Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). We now support running the latest DeepSeek models on Gaudi accelerators, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B). To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, update `LLM_MODEL_ID` and set `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script; to run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update `LLM_MODEL_ID` and set `NUM_CARDS` to 4.
 ## Quick Start: 1.Setup Environment Variable
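For readers following that note, the resulting edit to set_env.sh would look roughly like this (a hypothetical sketch for the 70B distill; the rest of the file is unchanged):

    # Hypothetical set_env.sh override for DeepSeek-R1-Distill-Llama-70B
    export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
    export NUM_CARDS=8   # use 4 instead for DeepSeek-R1-Distill-Qwen-32B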

@@ -92,6 +92,7 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
@@ -102,7 +103,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server
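The point of the compose change above is that NUM_CARDS flows straight into vLLM's --tensor-parallel-size, which controls how many Gaudi cards the model is sharded across. As an illustrative expansion (not captured from a running container), with the Qwen-32B distill and NUM_CARDS=4 the service command resolves to:

    # Illustrative resolved vLLM arguments, assuming LLM_MODEL_ID and NUM_CARDS from set_env.sh
    --model deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --tensor-parallel-size 4 \
      --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048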

@@ -133,12 +133,13 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
     container_name: chatqna-gaudi-guardrails-server

@@ -101,11 +101,12 @@ services:
       LIMIT_HPU_GRAPH: true
       USE_FLASH_ATTENTION: true
       FLASH_ATTENTION_RECOMPUTE: true
+      NUM_CARDS: ${NUM_CARDS}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    command: --model-id ${LLM_MODEL_ID} --num-shard ${NUM_CARDS} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
   jaeger:
     image: jaegertracing/all-in-one:latest
     container_name: jaeger
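Note that TGI shards a model via its --num-shard launcher flag rather than vLLM's --tensor-parallel-size, so the same NUM_CARDS variable feeds a different option in this file. As an illustrative expansion with the 70B distill on 8 cards:

    # Illustrative resolved TGI launcher arguments, assuming NUM_CARDS=8
    --model-id deepseek-ai/DeepSeek-R1-Distill-Llama-70B --num-shard 8 \
      --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT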

@@ -73,12 +73,13 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server

@@ -11,6 +11,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
 export RERANK_MODEL_ID="BAAI/bge-reranker-base"
 export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
 export INDEX_NAME="rag-redis"
+export NUM_CARDS=1
 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
 export LOGFLAG=""
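The default of 1 keeps the stock Meta-Llama-3-8B-Instruct deployment on a single card, so existing setups behave exactly as before. A typical workflow after editing the file would be along these lines (a sketch, assuming the compose file lives in the same directory):

    # Re-source the environment and restart the stack to pick up NUM_CARDS
    source ./set_env.sh
    docker compose up -d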

@@ -47,6 +47,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"

@@ -45,6 +45,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export host_ip=${ip_address}

@@ -46,6 +46,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')

@@ -45,6 +45,7 @@ function start_services() {
     cd $WORKPATH/docker_compose/intel/hpu/gaudi
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}