Use the official TGI release Docker image for Intel CPU (#581)
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
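Two edits repeat throughout the hunks below: the pinned TGI tags (1.4, 2.0, 2.1.0) are replaced by the official `latest-intel-cpu` release image, and `--cuda-graphs 0` is appended to the launcher command. As a sketch only (not part of the commit), the new image can be pre-pulled before bringing any of the updated stacks up:

```bash
# Pre-pull the official Intel-CPU TGI release that the updated files reference,
# then list the locally cached tags for that repository.
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
docker images ghcr.io/huggingface/text-generation-inference
```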
@@ -41,7 +41,7 @@ services:
 environment:
 TTS_ENDPOINT: ${TTS_ENDPOINT}
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:1.4
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-service
 ports:
 - "3006:80"
@@ -53,7 +53,7 @@ services:
 http_proxy: ${http_proxy}
 https_proxy: ${https_proxy}
 HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm:
 image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
 container_name: llm-tgi-server
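The second recurring edit, first seen here, appends `--cuda-graphs 0` to the TGI launcher command; the flag disables CUDA graph capture, which is not applicable on a CPU-only Xeon host. A rough `docker run` equivalent of the updated `tgi-service`, as a sketch only — the port and token variable come from the hunks above, while the data volume and shm size are assumptions not shown in this excerpt of the compose file:

```bash
# Hand-run equivalent of the updated tgi-service (illustrative; volume and
# shm-size are assumptions, port 3006 and HF_TOKEN come from the hunks above).
docker run --rm -p 3006:80 \
  -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
  -v $(pwd)/data:/data --shm-size 1g \
  ghcr.io/huggingface/text-generation-inference:latest-intel-cpu \
  --model-id ${LLM_MODEL_ID} --cuda-graphs 0
```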
@@ -116,7 +116,7 @@ services:
 HF_HUB_ENABLE_HF_TRANSFER: 0
 restart: unless-stopped
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:2.0
+image: ghcr.io/huggingface/text-generation-inference:2.2.0
 container_name: tgi-server
 ports:
 - "8008:80"
@@ -102,7 +102,7 @@ services:
 HF_HUB_ENABLE_HF_TRANSFER: 0
 restart: unless-stopped
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:2.1.0
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-service
 ports:
 - "9009:80"
@@ -116,7 +116,7 @@ services:
 HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
 HF_HUB_DISABLE_PROGRESS_BARS: 1
 HF_HUB_ENABLE_HF_TRANSFER: 0
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm:
 image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
 container_name: llm-tgi-server
@@ -102,7 +102,7 @@ services:
 HF_HUB_ENABLE_HF_TRANSFER: 0
 restart: unless-stopped
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:2.1.0
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-service
 ports:
 - "6042:80"
@@ -116,7 +116,7 @@ services:
 HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
 HF_HUB_DISABLE_PROGRESS_BARS: 1
 HF_HUB_ENABLE_HF_TRANSFER: 0
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm:
 image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
 container_name: llm-tgi-server
@@ -20,7 +20,7 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment
 - retriever: opea/retriever-redis:latest
 - tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
 - reranking: opea/reranking-tei:latest
-- tgi-service: ghcr.io/huggingface/text-generation-inference:1.4
+- tgi-service: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 - llm: opea/llm-tgi:latest
 - chaqna-xeon-backend-server: opea/chatqna:latest
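If you want the Xeon images listed above cached locally before deploying, a pull loop along these lines works; the list simply mirrors the bullet items in the hunk and is illustrative only:

```bash
# Pre-pull the prebuilt ChatQnA Xeon images named in the list above (illustrative).
for img in \
  opea/retriever-redis:latest \
  ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 \
  opea/reranking-tei:latest \
  ghcr.io/huggingface/text-generation-inference:latest-intel-cpu \
  opea/llm-tgi:latest \
  opea/chatqna:latest; do
  docker pull "$img"
done
```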
@@ -190,6 +190,7 @@ metadata:
 data:
 MODEL_ID: "Intel/neural-chat-7b-v3-3"
 PORT: "2080"
+CUDA_GRAPHS: "0"
 HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
 HF_TOKEN: "insert-your-huggingface-token-here"
 MAX_INPUT_TOKENS: "1024"
@@ -993,7 +994,7 @@ spec:
 name: chatqna-tgi-config
 securityContext:
 {}
-image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
+image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
 imagePullPolicy: IfNotPresent
 volumeMounts:
 - mountPath: /data
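After the updated manifest is applied, the new `CUDA_GRAPHS` key and the image swap can be spot-checked with `kubectl`; the namespace below is an assumption, use whichever one the manifest targets:

```bash
# Confirm the ConfigMap now carries CUDA_GRAPHS=0 (namespace "default" is an assumption).
kubectl -n default get configmap chatqna-tgi-config -o jsonpath='{.data.CUDA_GRAPHS}'; echo

# Confirm the TGI deployment picked up the Intel-CPU image.
kubectl -n default get deployments -o wide | grep text-generation-inference
```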
@@ -3,7 +3,7 @@
 
 services:
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:1.4
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-service
 ports:
 - "8028:80"
@@ -15,7 +15,7 @@ services:
 http_proxy: ${http_proxy}
 https_proxy: ${https_proxy}
 HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm:
 image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
 container_name: llm-tgi-server
@@ -41,6 +41,7 @@ metadata:
 data:
 MODEL_ID: "meta-llama/CodeLlama-7b-hf"
 PORT: "2080"
+CUDA_GRAPHS: "0"
 HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
 HF_TOKEN: "insert-your-huggingface-token-here"
 MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
 name: codegen-tgi-config
 securityContext:
 {}
-image: "ghcr.io/huggingface/text-generation-inference:1.4"
+image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
 imagePullPolicy: IfNotPresent
 volumeMounts:
 - mountPath: /data
@@ -117,6 +117,8 @@ spec:
 value: ise-uiuc/Magicoder-S-DS-6.7B
 - name: PORT
 value: "80"
+- name: CUDA_GRAPHS
+value: "0"
 - name: http_proxy
 value:
 - name: https_proxy
@@ -124,7 +126,7 @@ spec:
 - name: no_proxy
 value:
 securityContext: {}
-image: "ghcr.io/huggingface/text-generation-inference:1.4"
+image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
 imagePullPolicy: IfNotPresent
 volumeMounts:
 - mountPath: /data
@@ -22,7 +22,7 @@ function build_docker_images() {
 service_list="codegen codegen-ui llm-tgi"
 docker compose -f docker_build_compose.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
-docker pull ghcr.io/huggingface/text-generation-inference:1.4
+docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 docker images
 }
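The same pull can be sanity-checked outside the test harness; a minimal sketch, assuming Docker is available on the runner:

```bash
# Quick check that the image the test script now pulls is present locally.
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
docker images --format '{{.Repository}}:{{.Tag}}' | grep 'text-generation-inference:latest-intel-cpu'
```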
@@ -3,7 +3,7 @@
 
 services:
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:1.4
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: codetrans-tgi-service
 ports:
 - "8008:80"
@@ -15,7 +15,7 @@ services:
 http_proxy: ${http_proxy}
 https_proxy: ${https_proxy}
 HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm:
 image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
 container_name: llm-tgi-server
@@ -41,6 +41,7 @@ metadata:
 data:
 MODEL_ID: "HuggingFaceH4/mistral-7b-grok"
 PORT: "2080"
+CUDA_GRAPHS: "0"
 HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
 HF_TOKEN: "insert-your-huggingface-token-here"
 MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
 name: codetrans-tgi-config
 securityContext:
 {}
-image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
+image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
 imagePullPolicy: IfNotPresent
 volumeMounts:
 - mountPath: /data
@@ -3,7 +3,7 @@
 
 services:
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:1.4
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-service
 ports:
 - "8008:80"
@@ -16,7 +16,7 @@ services:
 volumes:
 - "./data:/data"
 shm_size: 1g
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm:
 image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
 container_name: llm-docsum-server
@@ -8,7 +8,7 @@ Install GMC in your Kubernetes cluster, if you have not already done so, by foll
 The DocSum application is defined as a Custom Resource (CR) file that the above GMC operator acts upon. It first checks if the microservices listed in the CR yaml file are running, if not it starts them and then proceeds to connect them. When the DocSum RAG pipeline is ready, the service endpoint details are returned, letting you use the application. Should you use "kubectl get pods" commands you will see all the component microservices, in particular embedding, retriever, rerank, and llm.
 
 The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image llm-docsum-tgi:latest which internally leverages the
-the image ghcr.io/huggingface/text-generation-inference:1.4. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
+the image ghcr.io/huggingface/text-generation-inference:latest-intel-cpu. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
 service tgi-gaudi-svc, which uses the image ghcr.io/huggingface/tgi-gaudi:1.2.1. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use Intel/neural-chat-7b-v3-3.
 
 [NOTE]
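As the paragraph above notes, both TGI services serve whatever `LLM_MODEL_ID` you export, and `kubectl get pods` shows the component microservices once the CR is applied; a short sketch, with the namespace as an assumption:

```bash
# Export the model both tgi-svc and tgi-gaudi-svc will serve, as in the example above.
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"

# Once the DocSum CR has been applied, list the component microservices.
kubectl get pods -n docsum   # namespace is an assumption; use the one the CR targets
```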
@@ -41,6 +41,7 @@ metadata:
 data:
 MODEL_ID: "Intel/neural-chat-7b-v3-3"
 PORT: "2080"
+CUDA_GRAPHS: "0"
 HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
 HF_TOKEN: "insert-your-huggingface-token-here"
 MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
 name: docsum-tgi-config
 securityContext:
 {}
-image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
+image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
 imagePullPolicy: IfNotPresent
 volumeMounts:
 - mountPath: /data
@@ -117,6 +117,8 @@ spec:
 value: Intel/neural-chat-7b-v3-3
 - name: PORT
 value: "80"
+- name: CUDA_GRAPHS
+value: "0"
 - name: http_proxy
 value:
 - name: https_proxy
@@ -124,7 +126,7 @@ spec:
 - name: no_proxy
 value:
 securityContext: {}
-image: "ghcr.io/huggingface/text-generation-inference:1.4"
+image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
 imagePullPolicy: IfNotPresent
 volumeMounts:
 - mountPath: /data
@@ -3,7 +3,7 @@
 
 services:
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:1.4
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-xeon-server
 ports:
 - "8008:80"
@@ -16,7 +16,7 @@ services:
 volumes:
 - "./data:/data"
 shm_size: 1g
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm_faqgen:
 image: ${REGISTRY:-opea}/llm-faqgen-tgi:${TAG:-latest}
 container_name: llm-faqgen-server
@@ -48,6 +48,8 @@ spec:
 args:
 - --model-id
 - 'meta-llama/Meta-Llama-3-8B-Instruct'
+- --cuda_graphs
+- '0'
 - --max-input-length
 - '3096'
 - --max-total-tokens
@@ -34,6 +34,8 @@ spec:
 args:
 - --model-id
 - 'meta-llama/Meta-Llama-3-8B-Instruct'
+- --cuda_graphs
+- '0'
 - --max-input-length
 - '3096'
 - --max-total-tokens
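In these two deployments the new flag and its value are added as separate items under `args`, so they reach the container as two consecutive command-line arguments appended to the image's launcher. One way to inspect the rendered argument list on a running pod, with the label selector as an assumption to be replaced by the chart's real labels or a pod name:

```bash
# Print the TGI container's args to verify the new flag made it in
# (the selector below is an assumption; substitute the real pod name or labels).
kubectl get pods -l app=tgi -o jsonpath='{.items[0].spec.containers[0].args}'; echo
```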
@@ -117,6 +117,8 @@ spec:
 value: Intel/neural-chat-7b-v3-3
 - name: PORT
 value: "80"
+- name: CUDA_GRAPHS
+value: "0"
 - name: http_proxy
 value:
 - name: https_proxy
@@ -124,7 +126,7 @@ spec:
 - name: no_proxy
 value:
 securityContext: {}
-image: "ghcr.io/huggingface/text-generation-inference:1.4"
+image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
 imagePullPolicy: IfNotPresent
 volumeMounts:
 - mountPath: /data
@@ -73,7 +73,7 @@ services:
 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
 restart: unless-stopped
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:1.4
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-service
 ports:
 - "3006:80"
@@ -85,7 +85,7 @@ services:
 http_proxy: ${http_proxy}
 https_proxy: ${https_proxy}
 HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm:
 image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
 container_name: llm-tgi-server
@@ -23,7 +23,7 @@ function build_docker_images() {
 docker compose -f docker_build_compose.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-docker pull ghcr.io/huggingface/text-generation-inference:1.4
+docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 docker images
 }
@@ -3,7 +3,7 @@
 
 services:
 tgi-service:
-image: ghcr.io/huggingface/text-generation-inference:1.4
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-service
 ports:
 - "8008:80"
@@ -15,7 +15,7 @@ services:
 volumes:
 - "./data:/data"
 shm_size: 1g
-command: --model-id ${LLM_MODEL_ID}
+command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
 llm:
 image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
 container_name: llm-tgi-server
@@ -68,20 +68,15 @@ docker build --no-cache -t opea/visualqna-ui:latest --build-arg https_proxy=$htt
 cd ../../../..
 ```
 
-### 4. Build TGI Xeon Image
+### 4. Pull TGI Xeon Image
 
-Since TGI official image has not supported llava-next for CPU, we'll need to build it based on Dockerfile_intel.
-
 ```bash
-git clone https://github.com/huggingface/text-generation-inference
-cd text-generation-inference/
-docker build -t opea/llava-tgi-xeon:latest --build-arg PLATFORM=cpu --build-arg http_proxy=${http_proxy} --build-arg https_proxy=${https_proxy} . -f Dockerfile_intel
-cd ../
+docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 ```
 
 Then run the command `docker images`, you will have the following 4 Docker Images:
 
-1. `opea/llava-tgi-xeon:latest`
+1. `ghcr.io/huggingface/text-generation-inference:latest-intel-cpu`
 2. `opea/lvm-tgi:latest`
 3. `opea/visualqna:latest`
 4. `opea/visualqna-ui:latest`
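To confirm the four images the updated section lists, a filtered `docker images` call along these lines is enough; the grep pattern is purely illustrative:

```bash
# List only the four images the VisualQnA guide expects after step 4.
docker images --format '{{.Repository}}:{{.Tag}}' | \
  grep -E 'text-generation-inference:latest-intel-cpu|lvm-tgi|visualqna'
```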
@@ -3,7 +3,7 @@
 
 services:
 llava-tgi-service:
-image: ${REGISTRY:-opea}/llava-tgi-xeon:${TAG:-latest}
+image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 container_name: tgi-llava-xeon-server
 ports:
 - "9399:80"