Add support for latest deepseek models on Gaudi (#1491)

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
2025-02-05 08:30:04 +08:00
parent a4d028e8ea
commit 9adf7a6af0
10 changed files with 14 additions and 5 deletions
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -10,7 +10,7 @@ Quick Start:
 2. Run Docker Compose.
 3. Consume the ChatQnA Service.

-Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). We now support running the latest DeepSeek models, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on Gaudi accelerators. To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 4 in the [set_env.sh](./set_env.sh) script.

 ## Quick Start: 1.Setup Environment Variable

--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -92,6 +92,7 @@ services:
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
@@ -102,7 +103,7 @@ services:
    cap_add:
      - SYS_NICE
    ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
  chatqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-gaudi-backend-server
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
@@ -133,12 +133,13 @@ services:
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
  chatqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
    container_name: chatqna-gaudi-guardrails-server
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
@@ -101,11 +101,12 @@ services:
      LIMIT_HPU_GRAPH: true
      USE_FLASH_ATTENTION: true
      FLASH_ATTENTION_RECOMPUTE: true
+      NUM_CARDS: ${NUM_CARDS}
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
-    command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    command: --model-id ${LLM_MODEL_ID} --num-shard ${NUM_CARDS} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: jaeger
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
@@ -73,12 +73,13 @@ services:
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
  chatqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
    container_name: chatqna-gaudi-backend-server
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -11,6 +11,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
 export RERANK_MODEL_ID="BAAI/bge-reranker-base"
 export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
 export INDEX_NAME="rag-redis"
+export NUM_CARDS=1
 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
 export LOGFLAG=""