Update TGI image versions (#1625)

Signed-off-by: xiaotia3 <xiaotian.chen@intel.com>
Xiaotian Chen
2025-04-01 11:27:51 +08:00
committed by GitHub
parent 583428c6a7
commit 1bd56af994
36 changed files with 54 additions and 52 deletions

View File

@@ -4,7 +4,7 @@
WORKPATH=$(dirname "$PWD")/..
export ip_address=${host_ip}
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
export AGENTQNA_TGI_SERVICE_PORT="8085"
# LLM related environment variables

View File

@@ -6,7 +6,7 @@
WORKPATH=$(dirname "$PWD")/..
export ip_address=${host_ip}
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
export AGENTQNA_TGI_SERVICE_PORT="19001"
# LLM related environment variables

View File

@@ -3,7 +3,7 @@
services:
tgi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-server
ports:
- "8085:80"

View File

@@ -25,7 +25,7 @@ services:
https_proxy: ${https_proxy}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: tgi-service
ports:
- "3006:80"

View File

@@ -14,7 +14,7 @@ The AudioQnA application is defined as a Custom Resource (CR) file that the abov
The AudioQnA uses the below prebuilt images if you choose a Xeon deployment
- tgi-service: ghcr.io/huggingface/text-generation-inference:1.4
- tgi-service: ghcr.io/huggingface/text-generation-inference:2.4.1
- llm: opea/llm-textgen:latest
- asr: opea/asr:latest
- whisper: opea/whisper:latest
@@ -25,7 +25,7 @@ The AudioQnA uses the below prebuilt images if you choose a Xeon deployment
Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services.
For Gaudi:
- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.6
- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.3.1
- whisper-gaudi: opea/whisper-gaudi:latest
- speecht5-gaudi: opea/speecht5-gaudi:latest
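To double-check that a deployed pipeline picked up these images, a kubectl query along these lines can help (illustrative; assumes the pods run in the current namespace):

```bash
# List each AudioQnA pod together with the container images it is running
kubectl get pods -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].image}{"\n"}{end}'
```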

View File

@@ -34,8 +34,8 @@ function build_docker_images() {
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper speecht5"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
echo "docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
echo "docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -45,10 +45,10 @@ To setup a LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-
```
# please set your llm_port and hf_token
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.3.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2
# for better performance, set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `max-batch-total-tokens`, `max-batch-prefill-tokens`
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.3.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
```
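Once the server reports ready, a minimal sanity check against TGI's `/generate` endpoint could look like the following (port and prompt are placeholders):

```bash
# Replace {your_llm_port} with the port mapped in the docker run command above
curl http://localhost:{your_llm_port}/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 32}}'
```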
### Prepare Dataset

View File

@@ -19,7 +19,7 @@ docker run -it --rm \
--ipc=host \
-e HTTPS_PROXY=$https_proxy \
-e HTTP_PROXY=$https_proxy \
ghcr.io/huggingface/tgi-gaudi:2.0.6 \
ghcr.io/huggingface/tgi-gaudi:2.3.1 \
--model-id $model_name \
--max-input-tokens $max_input_tokens \
--max-total-tokens $max_total_tokens \

View File

@@ -190,7 +190,7 @@ Change the `xxx_MODEL_ID` below for your needs.
# Example: NGINX_PORT=80
export HOST_IP=${host_ip}
export NGINX_PORT=${your_nginx_port}
export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
export CHATQNA_EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export CHATQNA_RERANK_MODEL_ID="BAAI/bge-reranker-base"
export CHATQNA_LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"

View File

@@ -158,7 +158,7 @@ The default deployment utilizes Gaudi devices primarily for the `vllm-service`,
### compose_tgi.yaml - TGI Deployment
The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
| Service Name | Image Name | Gaudi Specific |
| ---------------------------- | ----------------------------------------------------- | -------------- |
@@ -167,7 +167,7 @@ The TGI (Text Generation Inference) deployment and the default deployment differ
| tei-embedding-service | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No |
| retriever | opea/retriever:latest | No |
| tei-reranking-service | ghcr.io/huggingface/tei-gaudi:1.5.0 | 1 card |
| **tgi-service** | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Configurable |
| **tgi-service** | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Configurable |
| chatqna-gaudi-backend-server | opea/chatqna:latest | No |
| chatqna-gaudi-ui-server | opea/chatqna-ui:latest | No |
| chatqna-gaudi-nginx-server | opea/nginx:latest | No |
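For reference, bringing up this TGI variant is a matter of pointing Docker Compose at the alternate file; the directory below assumes the standard GenAIExamples checkout layout.

```bash
# Sketch: start the TGI-based ChatQnA stack instead of the default vLLM one
cd ChatQnA/docker_compose/intel/hpu/gaudi
docker compose -f compose_tgi.yaml up -d
```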
@@ -178,7 +178,7 @@ This deployment may allocate more Gaudi resources to the tgi-service to optimize
The FAQ (frequently asked questions and answers) generation deployment generates FAQs instead of normal text generation. It adds a new microservice called `llm-faqgen`, which interacts with the TGI/vLLM LLM server to generate FAQs from input text.
The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
| Service Name | Image Name | Gaudi Use |
| ---------------------------- | ----------------------------------------------------- | ------------ |
@@ -214,13 +214,13 @@ This setup might allow for more Gaudi devices to be dedicated to the `vllm-servi
### compose_guardrails.yaml - Guardrails Deployment
The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `tgi-guardrails-service` and `guardrails` services. The `tgi-guardrails-service` uses the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `tgi-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). This backend server now depends on the `tgi-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.
The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `tgi-guardrails-service` and `guardrails` services. The `tgi-guardrails-service` uses the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `tgi-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). This backend server now depends on the `tgi-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.
| Service Name | Image Name | Gaudi Specific | Uses LLM |
| ---------------------------- | ----------------------------------------------------- | -------------- | -------- |
| redis-vector-db | redis/redis-stack:7.2.0-v9 | No | No |
| dataprep-redis-service | opea/dataprep:latest | No | No |
| _tgi-guardrails-service_ | ghcr.io/huggingface/tgi-gaudi:2.0.6 | 1 card | Yes |
| _tgi-guardrails-service_ | ghcr.io/huggingface/tgi-gaudi:2.3.1 | 1 card | Yes |
| _guardrails_ | opea/guardrails:latest | No | No |
| tei-embedding-service | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No | No |
| retriever | opea/retriever:latest | No | No |
@@ -262,8 +262,8 @@ The table provides a comprehensive overview of the ChatQnA services utilized acr
| retriever | opea/retriever:latest | No | Retrieves data from the Redis database and interacts with embedding services. |
| tei-reranking-service | ghcr.io/huggingface/tei-gaudi:1.5.0 | Yes | Reranks text embeddings, typically using Gaudi hardware for enhanced performance. |
| vllm-service | opea/vllm-gaudi:latest | No | Handles large language model (LLM) tasks, utilizing Gaudi hardware. |
| tgi-service | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Yes | Specific to the TGI deployment, focuses on text generation inference using Gaudi hardware. |
| tgi-guardrails-service | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Yes | Provides guardrails functionality, ensuring safe operations within defined limits. |
| tgi-service | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Yes | Specific to the TGI deployment, focuses on text generation inference using Gaudi hardware. |
| tgi-guardrails-service | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Yes | Provides guardrails functionality, ensuring safe operations within defined limits. |
| guardrails | opea/guardrails:latest | Yes | Acts as a safety layer, interfacing with the `tgi-guardrails-service` to enforce safety protocols. |
| chatqna-gaudi-backend-server | opea/chatqna:latest | No | Serves as the backend for the ChatQnA application, with variations depending on the deployment. |
| chatqna-gaudi-ui-server | opea/chatqna-ui:latest | No | Provides the user interface for the ChatQnA application. |
@@ -288,7 +288,7 @@ The `ghcr.io/huggingface/text-embeddings-inference:cpu-1.6` image supporting `te
### tgi-guardrails-service
The `tgi-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://github.com/huggingface/tgi-gaudi?tab=readme-ov-file#tested-models-and-configurations) for the associated `ghcr.io/huggingface/tgi-gaudi:2.0.6` image. Like the `tei-embedding-service` and `tei-reranking-service` services, it doesn't use the `NUM_CARDS` parameter.
The `tgi-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://github.com/huggingface/tgi-gaudi?tab=readme-ov-file#tested-models-and-configurations) for the associated `ghcr.io/huggingface/tgi-gaudi:2.3.1` image. Like the `tei-embedding-service` and `tei-reranking-service` services, it doesn't use the `NUM_CARDS` parameter.
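For illustration, the parameter is simply exported before bringing the stack up; the model ID below is an example guard model, not necessarily the project default:

```bash
# Example guard model; substitute any model from the tgi-gaudi supported list
export GUARDRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"
```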
## Conclusion

View File

@@ -26,7 +26,7 @@ services:
TEI_ENDPOINT: http://tei-embedding-service:80
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tgi-guardrails-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-guardrails-server
ports:
- "8088:80"

View File

@@ -80,7 +80,7 @@ services:
MAX_WARMUP_SEQUENCE_LENGTH: 512
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
ports:
- "8005:80"

View File

@@ -49,15 +49,15 @@ f810f3b4d329 opea/embedding:latest "python embed
69e1fb59e92c opea/retriever:latest "/home/user/comps/re…" 2 minutes ago Up 2 minutes 0.0.0.0:7000->7000/tcp, :::7000->7000/tcp retriever-redis-server
313b9d14928a opea/reranking-tei:latest "python reranking_te…" 2 minutes ago Up 2 minutes 0.0.0.0:8000->8000/tcp, :::8000->8000/tcp reranking-tei-gaudi-server
174bd43fa6b5 ghcr.io/huggingface/tei-gaudi:1.5.0 "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8090->80/tcp, :::8090->80/tcp tei-embedding-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.3.1 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
74084469aa33 redis/redis-stack:7.2.0-v9 "/entrypoint.sh" 2 minutes ago Up 2 minutes 0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp redis-vector-db
88399dbc9e43 ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8808->80/tcp, :::8808->80/tcp tei-reranking-gaudi-server
```
In this case, `ghcr.io/huggingface/tgi-gaudi:2.0.6` exited.
In this case, `ghcr.io/huggingface/tgi-gaudi:2.3.1` exited.
```
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.3.1 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
```
Next, we can check the container logs to find out what happened when the container started.
@@ -68,7 +68,7 @@ Check the log of container by:
`docker logs <CONTAINER ID> -t`
View the logs of `ghcr.io/huggingface/tgi-gaudi:2.0.6`
View the logs of `ghcr.io/huggingface/tgi-gaudi:2.3.1`
`docker logs 05c40b636239 -t`
@@ -97,7 +97,7 @@ So just make sure the devices are available.
Here is another failure example:
```
f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server
f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:2.3.1 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server
```
Check the log by `docker logs f7a08f9867f9 -t`.
@@ -114,7 +114,7 @@ View the docker input parameters in `./ChatQnA/docker_compose/intel/hpu/gaudi/co
```
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
ports:
- "8008:80"

View File

@@ -82,7 +82,7 @@ services:
count: 1
capabilities: [gpu]
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.2.0
image: ghcr.io/huggingface/text-generation-inference:2.4.1
container_name: tgi-server
ports:
- "8008:80"

View File

@@ -25,7 +25,7 @@ Should you desire to use the Gaudi accelerator, two alternate images are used fo
For Gaudi:
tei-embedding-service: ghcr.io/huggingface/tei-gaudi:1.5.0
tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.6
tgi-service: ghcr.io/huggingface/tgi-gaudi:2.3.1
> [NOTE]

View File

@@ -37,7 +37,7 @@ function build_docker_images() {
service_list="chatqna chatqna-ui dataprep retriever vllm-gaudi guardrails nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
docker pull ghcr.io/huggingface/tei-gaudi:1.5.0

View File

@@ -35,7 +35,7 @@ function build_docker_images() {
service_list="chatqna chatqna-ui dataprep retriever nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
docker pull ghcr.io/huggingface/tei-gaudi:1.5.0

View File

@@ -4,7 +4,7 @@
services:
codegen-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: codegen-tgi-service
ports:
- "${CODEGEN_TGI_SERVICE_PORT:-8028}:80"

View File

@@ -4,7 +4,7 @@
services:
codetrans-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: codetrans-tgi-service
ports:
- "${CODETRANS_TGI_SERVICE_PORT:-8008}:80"

View File

@@ -35,7 +35,7 @@ function build_docker_images() {
service_list="codetrans codetrans-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -7,7 +7,7 @@ version: "3.8"
services:
dbqna-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: dbqna-tgi-service
ports:
- "${DBQNA_TGI_SERVICE_PORT:-8008}:80"

View File

@@ -108,7 +108,7 @@ docker run --name test-text2sql-postgres --ipc=host -e POSTGRES_USER=${POSTGRES_
```bash
docker run -d --name="test-text2sql-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HF_TOKEN} -e model=${model} ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id $model
docker run -d --name="test-text2sql-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HF_TOKEN} -e model=${model} ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
```
- Start Text-to-SQL Service

View File

@@ -31,7 +31,7 @@ function build_docker_images() {
service_list="text2sql text2sql-react-ui"
docker compose -f build.yaml build ${service_list} --no-cache > "${LOG_PATH}"/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -72,7 +72,7 @@ For gated models, you also need to provide [HuggingFace token](https://huggingfa
Since the `compose.yaml` consumes several environment variables, you need to set them up in advance as below.
```bash
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export HOST_IP=${host_ip}
export DOCSUM_TGI_SERVICE_PORT="18882"

View File

@@ -3,7 +3,7 @@
services:
docsum-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: docsum-tgi-service
ports:
- "${DOCSUM_TGI_SERVICE_PORT}:80"

View File

@@ -5,7 +5,7 @@
export MAX_INPUT_TOKENS=2048
export MAX_TOTAL_TOKENS=4096
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export HOST_IP=${host_ip}
export DOCSUM_TGI_SERVICE_PORT="8008"

View File

@@ -9,7 +9,7 @@ The DocSum application is defined as a Custom Resource (CR) file that the above
The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image `llm-docsum-tgi:latest`, which internally leverages
the image `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu`. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.0.6`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use `Intel/neural-chat-7b-v3-3`.
service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.3.1`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use `Intel/neural-chat-7b-v3-3`.
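A minimal sketch of that export, using the model named above:

```bash
# Model served by the TGI service in this example
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
```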
[NOTE]
Refer to [Docker Xeon README](https://github.com/opea-project/GenAIExamples/blob/main/DocSum/docker_compose/intel/cpu/xeon/README.md) or

View File

@@ -15,7 +15,7 @@ export MAX_INPUT_TOKENS=1024
export MAX_TOTAL_TOKENS=2048
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export HOST_IP=${ip_address}
export host_ip=${ip_address}
@@ -45,7 +45,7 @@ function build_docker_images() {
service_list="docsum docsum-gradio-ui whisper llm-docsum"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:1.4
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1
docker images && sleep 1s
}

View File

@@ -67,6 +67,8 @@ function build_docker_images() {
service_list="docsum docsum-gradio-ui whisper llm-docsum vllm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1
docker images && sleep 1s
}

View File

@@ -93,7 +93,7 @@ services:
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
restart: unless-stopped
tgi-rocm:
image: ghcr.io/huggingface/text-generation-inference:3.0.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: tgi-llava-rocm-server
ports:
- "8399:80"

View File

@@ -86,7 +86,7 @@ services:
restart: unless-stopped
search-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: search-tgi-service
ports:
- "${SEARCH_TGI_SERVICE_PORT:-3006}:80"

View File

@@ -37,7 +37,7 @@ function build_docker_images() {
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
docker images && sleep 1s
}

View File

@@ -24,7 +24,7 @@ function build_docker_images() {
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -3,7 +3,7 @@
services:
translation-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: translation-tgi-service
ports:
- "${TRANSLATION_TGI_SERVICE_PORT:-8008}:80"

View File

@@ -34,7 +34,7 @@ function build_docker_images() {
service_list="translation translation-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -32,7 +32,7 @@ rm -rf vllm-fork
```bash
# TGI (Optional)
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
```
### 3. Build MegaService Docker Image
@@ -58,7 +58,7 @@ docker build --no-cache -t opea/visualqna-ui:latest --build-arg https_proxy=$htt
Then run the command `docker images`; you should see the following 5 Docker images:
1. `opea/vllm-gaudi:latest`
2. `ghcr.io/huggingface/tgi-gaudi:2.0.6` (Optional)
2. `ghcr.io/huggingface/tgi-gaudi:2.3.1` (Optional)
3. `opea/lvm:latest`
4. `opea/visualqna:latest`
5. `opea/visualqna-ui:latest`
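Optionally, a quick filter can confirm the list (illustrative pattern; adjust if your image names differ):

```bash
# Show only repository:tag and keep the images listed above
docker images --format '{{.Repository}}:{{.Tag}}' | grep -E 'opea/|tgi-gaudi'
```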