Add support for latest deepseek models on Gaudi (#1491)
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
This commit is contained in:
@@ -10,7 +10,7 @@ Quick Start:
|
|||||||
2. Run Docker Compose.
|
2. Run Docker Compose.
|
||||||
3. Consume the ChatQnA Service.
|
3. Consume the ChatQnA Service.
|
||||||
|
|
||||||
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure that you have either requested and been granted access to it on [Hugging Face](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
|
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure that you have either requested and been granted access to it on [Hugging Face](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). The latest DeepSeek models are also supported on Gaudi accelerators, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B). To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, set `LLM_MODEL_ID` to that model name and set `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, set `LLM_MODEL_ID` to that model name and set `NUM_CARDS` to 4 in the [set_env.sh](./set_env.sh) script.
|
||||||
|
|
||||||
## Quick Start: 1. Set Up Environment Variables
|
## Quick Start: 1. Set Up Environment Variables
|
||||||
|
|
||||||
|
|||||||
@@ -92,6 +92,7 @@ services:
|
|||||||
HABANA_VISIBLE_DEVICES: all
|
HABANA_VISIBLE_DEVICES: all
|
||||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||||
|
NUM_CARDS: ${NUM_CARDS}
|
||||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
|
test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
|
||||||
@@ -102,7 +103,7 @@ services:
|
|||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||||
chatqna-gaudi-backend-server:
|
chatqna-gaudi-backend-server:
|
||||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||||
container_name: chatqna-gaudi-backend-server
|
container_name: chatqna-gaudi-backend-server
|
||||||
|
|||||||
@@ -133,12 +133,13 @@ services:
|
|||||||
HABANA_VISIBLE_DEVICES: all
|
HABANA_VISIBLE_DEVICES: all
|
||||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||||
|
NUM_CARDS: ${NUM_CARDS}
|
||||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||||
runtime: habana
|
runtime: habana
|
||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||||
chatqna-gaudi-backend-server:
|
chatqna-gaudi-backend-server:
|
||||||
image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
|
image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
|
||||||
container_name: chatqna-gaudi-guardrails-server
|
container_name: chatqna-gaudi-guardrails-server
|
||||||
|
|||||||
@@ -101,11 +101,12 @@ services:
|
|||||||
LIMIT_HPU_GRAPH: true
|
LIMIT_HPU_GRAPH: true
|
||||||
USE_FLASH_ATTENTION: true
|
USE_FLASH_ATTENTION: true
|
||||||
FLASH_ATTENTION_RECOMPUTE: true
|
FLASH_ATTENTION_RECOMPUTE: true
|
||||||
|
NUM_CARDS: ${NUM_CARDS}
|
||||||
runtime: habana
|
runtime: habana
|
||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
command: --model-id ${LLM_MODEL_ID} --num-shard ${NUM_CARDS} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||||
jaeger:
|
jaeger:
|
||||||
image: jaegertracing/all-in-one:latest
|
image: jaegertracing/all-in-one:latest
|
||||||
container_name: jaeger
|
container_name: jaeger
|
||||||
|
|||||||
@@ -73,12 +73,13 @@ services:
|
|||||||
HABANA_VISIBLE_DEVICES: all
|
HABANA_VISIBLE_DEVICES: all
|
||||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||||
|
NUM_CARDS: ${NUM_CARDS}
|
||||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||||
runtime: habana
|
runtime: habana
|
||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||||
chatqna-gaudi-backend-server:
|
chatqna-gaudi-backend-server:
|
||||||
image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
|
image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
|
||||||
container_name: chatqna-gaudi-backend-server
|
container_name: chatqna-gaudi-backend-server
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
|||||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
export INDEX_NAME="rag-redis"
|
export INDEX_NAME="rag-redis"
|
||||||
|
export NUM_CARDS=1
|
||||||
# Set it as a non-null string, such as true, if you want to enable logging facility,
|
# Set it as a non-null string, such as true, if you want to enable logging facility,
|
||||||
# otherwise, keep it as "" to disable it.
|
# otherwise, keep it as "" to disable it.
|
||||||
export LOGFLAG=""
|
export LOGFLAG=""
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ function start_services() {
|
|||||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
export NUM_CARDS=1
|
||||||
export INDEX_NAME="rag-redis"
|
export INDEX_NAME="rag-redis"
|
||||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||||
export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"
|
export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ function start_services() {
|
|||||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
export NUM_CARDS=1
|
||||||
export INDEX_NAME="rag-redis"
|
export INDEX_NAME="rag-redis"
|
||||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||||
export host_ip=${ip_address}
|
export host_ip=${ip_address}
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ function start_services() {
|
|||||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
export NUM_CARDS=1
|
||||||
export INDEX_NAME="rag-redis"
|
export INDEX_NAME="rag-redis"
|
||||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ function start_services() {
|
|||||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
||||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
export NUM_CARDS=1
|
||||||
export INDEX_NAME="rag-redis"
|
export INDEX_NAME="rag-redis"
|
||||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user