Update TGI image versions (#1625)

Signed-off-by: xiaotia3 <xiaotian.chen@intel.com>
Xiaotian Chen
2025-04-01 11:27:51 +08:00
committed by GitHub
parent 583428c6a7
commit 1bd56af994
36 changed files with 54 additions and 52 deletions

View File

@@ -4,7 +4,7 @@
WORKPATH=$(dirname "$PWD")/..
export ip_address=${host_ip}
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
export AGENTQNA_TGI_SERVICE_PORT="8085"
# LLM related environment variables

View File

@@ -6,7 +6,7 @@
WORKPATH=$(dirname "$PWD")/..
export ip_address=${host_ip}
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
export AGENTQNA_TGI_IMAGE=ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
export AGENTQNA_TGI_SERVICE_PORT="19001"
# LLM related environment variables

View File

@@ -3,7 +3,7 @@
services:
tgi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-server
ports:
- "8085:80"

View File

@@ -25,7 +25,7 @@ services:
https_proxy: ${https_proxy}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: tgi-service
ports:
- "3006:80"

View File

@@ -14,7 +14,7 @@ The AudioQnA application is defined as a Custom Resource (CR) file that the abov
The AudioQnA uses the below prebuilt images if you choose a Xeon deployment
- tgi-service: ghcr.io/huggingface/text-generation-inference:1.4
- tgi-service: ghcr.io/huggingface/text-generation-inference:2.4.1
- llm: opea/llm-textgen:latest
- asr: opea/asr:latest
- whisper: opea/whisper:latest
@@ -25,7 +25,7 @@ The AudioQnA uses the below prebuilt images if you choose a Xeon deployment
Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services.
For Gaudi:
- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.6
- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.3.1
- whisper-gaudi: opea/whisper-gaudi:latest
- speecht5-gaudi: opea/speecht5-gaudi:latest
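To double-check that a deployed pipeline picked up these images, a kubectl query along these lines can help (illustrative; assumes the pods run in the current namespace):

```bash
# List each AudioQnA pod together with the container images it is running
kubectl get pods -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].image}{"\n"}{end}'
```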

View File

@@ -34,8 +34,8 @@ function build_docker_images() {
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper speecht5"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
echo "docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
echo "docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -45,10 +45,10 @@ To setup a LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-
```
# please set your llm_port and hf_token
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.3.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2
# for better performance, set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `max-batch-total-tokens`, `max-batch-prefill-tokens`
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.3.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
```
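Once the server reports ready, a minimal sanity check against TGI's `/generate` endpoint could look like the following (port and prompt are placeholders):

```bash
# Replace {your_llm_port} with the port mapped in the docker run command above
curl http://localhost:{your_llm_port}/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 32}}'
```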
### Prepare Dataset

View File

@@ -19,7 +19,7 @@ docker run -it --rm \
--ipc=host \
-e HTTPS_PROXY=$https_proxy \
-e HTTP_PROXY=$https_proxy \
ghcr.io/huggingface/tgi-gaudi:2.0.6 \
ghcr.io/huggingface/tgi-gaudi:2.3.1 \
--model-id $model_name \
--max-input-tokens $max_input_tokens \
--max-total-tokens $max_total_tokens \

View File

@@ -190,7 +190,7 @@ Change the `xxx_MODEL_ID` below for your needs.
# Example: NGINX_PORT=80
export HOST_IP=${host_ip}
export NGINX_PORT=${your_nginx_port}
export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
export CHATQNA_EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export CHATQNA_RERANK_MODEL_ID="BAAI/bge-reranker-base"
export CHATQNA_LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"

View File

@@ -158,7 +158,7 @@ The default deployment utilizes Gaudi devices primarily for the `vllm-service`,
### compose_tgi.yaml - TGI Deployment
The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
| Service Name | Image Name | Gaudi Specific |
| ---------------------------- | ----------------------------------------------------- | -------------- |
@@ -167,7 +167,7 @@ The TGI (Text Generation Inference) deployment and the default deployment differ
| tei-embedding-service | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No |
| retriever | opea/retriever:latest | No |
| tei-reranking-service | ghcr.io/huggingface/tei-gaudi:1.5.0 | 1 card |
| **tgi-service** | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Configurable |
| **tgi-service** | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Configurable |
| chatqna-gaudi-backend-server | opea/chatqna:latest | No |
| chatqna-gaudi-ui-server | opea/chatqna-ui:latest | No |
| chatqna-gaudi-nginx-server | opea/nginx:latest | No |
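For reference, bringing up this TGI variant is a matter of pointing Docker Compose at the alternate file; the directory below assumes the standard GenAIExamples checkout layout.

```bash
# Sketch: start the TGI-based ChatQnA stack instead of the default vLLM one
cd ChatQnA/docker_compose/intel/hpu/gaudi
docker compose -f compose_tgi.yaml up -d
```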
@@ -178,7 +178,7 @@ This deployment may allocate more Gaudi resources to the tgi-service to optimize
The FAQ (frequently asked questions and answers) generation deployment generates FAQs instead of normal text generation. It adds a new microservice called `llm-faqgen`, which interacts with the TGI/vLLM LLM server to generate FAQs from input text.
The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.
| Service Name | Image Name | Gaudi Use |
| ---------------------------- | ----------------------------------------------------- | ------------ |
@@ -214,13 +214,13 @@ This setup might allow for more Gaudi devices to be dedicated to the `vllm-servi
### compose_guardrails.yaml - Guardrails Deployment
The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `tgi-guardrails-service` and `guardrails` services. The `tgi-guardrails-service` uses the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `tgi-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). This backend server now depends on the `tgi-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.
The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `tgi-guardrails-service` and `guardrails` services. The `tgi-guardrails-service` uses the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `tgi-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). This backend server now depends on the `tgi-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.
| Service Name | Image Name | Gaudi Specific | Uses LLM |
| ---------------------------- | ----------------------------------------------------- | -------------- | -------- |
| redis-vector-db | redis/redis-stack:7.2.0-v9 | No | No |
| dataprep-redis-service | opea/dataprep:latest | No | No |
| _tgi-guardrails-service_ | ghcr.io/huggingface/tgi-gaudi:2.0.6 | 1 card | Yes |
| _tgi-guardrails-service_ | ghcr.io/huggingface/tgi-gaudi:2.3.1 | 1 card | Yes |
| _guardrails_ | opea/guardrails:latest | No | No |
| tei-embedding-service | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No | No |
| retriever | opea/retriever:latest | No | No |
@@ -262,8 +262,8 @@ The table provides a comprehensive overview of the ChatQnA services utilized acr
| retriever | opea/retriever:latest | No | Retrieves data from the Redis database and interacts with embedding services. |
| tei-reranking-service | ghcr.io/huggingface/tei-gaudi:1.5.0 | Yes | Reranks text embeddings, typically using Gaudi hardware for enhanced performance. |
| vllm-service | opea/vllm-gaudi:latest | No | Handles large language model (LLM) tasks, utilizing Gaudi hardware. |
| tgi-service | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Yes | Specific to the TGI deployment, focuses on text generation inference using Gaudi hardware. |
| tgi-guardrails-service | ghcr.io/huggingface/tgi-gaudi:2.0.6 | Yes | Provides guardrails functionality, ensuring safe operations within defined limits. |
| tgi-service | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Yes | Specific to the TGI deployment, focuses on text generation inference using Gaudi hardware. |
| tgi-guardrails-service | ghcr.io/huggingface/tgi-gaudi:2.3.1 | Yes | Provides guardrails functionality, ensuring safe operations within defined limits. |
| guardrails | opea/guardrails:latest | Yes | Acts as a safety layer, interfacing with the `tgi-guardrails-service` to enforce safety protocols. |
| chatqna-gaudi-backend-server | opea/chatqna:latest | No | Serves as the backend for the ChatQnA application, with variations depending on the deployment. |
| chatqna-gaudi-ui-server | opea/chatqna-ui:latest | No | Provides the user interface for the ChatQnA application. |
@@ -288,7 +288,7 @@ The `ghcr.io/huggingface/text-embeddings-inference:cpu-1.6` image supporting `te
### tgi-guardrails-service
The `tgi-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://github.com/huggingface/tgi-gaudi?tab=readme-ov-file#tested-models-and-configurations) for the associated `ghcr.io/huggingface/tgi-gaudi:2.0.6` image. Like the `tei-embedding-service` and `tei-reranking-service` services, it doesn't use the `NUM_CARDS` parameter.
The `tgi-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://github.com/huggingface/tgi-gaudi?tab=readme-ov-file#tested-models-and-configurations) for the associated `ghcr.io/huggingface/tgi-gaudi:2.3.1` image. Like the `tei-embedding-service` and `tei-reranking-service` services, it doesn't use the `NUM_CARDS` parameter.
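For illustration, the parameter is simply exported before bringing the stack up; the model ID below is an example guard model, not necessarily the project default:

```bash
# Example guard model; substitute any model from the tgi-gaudi supported list
export GUARDRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"
```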
## Conclusion

View File

@@ -26,7 +26,7 @@ services:
TEI_ENDPOINT: http://tei-embedding-service:80
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tgi-guardrails-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-guardrails-server
ports:
- "8088:80"

View File

@@ -80,7 +80,7 @@ services:
MAX_WARMUP_SEQUENCE_LENGTH: 512
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
ports:
- "8005:80"

View File

@@ -49,15 +49,15 @@ f810f3b4d329 opea/embedding:latest "python embed
69e1fb59e92c opea/retriever:latest "/home/user/comps/re…" 2 minutes ago Up 2 minutes 0.0.0.0:7000->7000/tcp, :::7000->7000/tcp retriever-redis-server
313b9d14928a opea/reranking-tei:latest "python reranking_te…" 2 minutes ago Up 2 minutes 0.0.0.0:8000->8000/tcp, :::8000->8000/tcp reranking-tei-gaudi-server
174bd43fa6b5 ghcr.io/huggingface/tei-gaudi:1.5.0 "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8090->80/tcp, :::8090->80/tcp tei-embedding-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.3.1 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
74084469aa33 redis/redis-stack:7.2.0-v9 "/entrypoint.sh" 2 minutes ago Up 2 minutes 0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp redis-vector-db
88399dbc9e43 ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8808->80/tcp, :::8808->80/tcp tei-reranking-gaudi-server
```
In this case, `ghcr.io/huggingface/tgi-gaudi:2.0.6` exited.
In this case, `ghcr.io/huggingface/tgi-gaudi:2.3.1` exited.
```
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.3.1 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server
```
Next, we can check the container logs to find out what happened when the container started.
@@ -68,7 +68,7 @@ Check the log of container by:
`docker logs <CONTAINER ID> -t`
View the logs of `ghcr.io/huggingface/tgi-gaudi:2.0.6`
View the logs of `ghcr.io/huggingface/tgi-gaudi:2.3.1`
`docker logs 05c40b636239 -t`
@@ -97,7 +97,7 @@ So just make sure the devices are available.
Here is another failure example:
```
f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server
f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:2.3.1 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server
```
Check the log by `docker logs f7a08f9867f9 -t`.
@@ -114,7 +114,7 @@ View the docker input parameters in `./ChatQnA/docker_compose/intel/hpu/gaudi/co
```
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
ports:
- "8008:80"

View File

@@ -82,7 +82,7 @@ services:
count: 1
capabilities: [gpu]
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.2.0
image: ghcr.io/huggingface/text-generation-inference:2.4.1
container_name: tgi-server
ports:
- "8008:80"

View File

@@ -25,7 +25,7 @@ Should you desire to use the Gaudi accelerator, two alternate images are used fo
For Gaudi:
tei-embedding-service: ghcr.io/huggingface/tei-gaudi:1.5.0
tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.6
tgi-service: ghcr.io/huggingface/tgi-gaudi:2.3.1
> [NOTE]

View File

@@ -37,7 +37,7 @@ function build_docker_images() {
service_list="chatqna chatqna-ui dataprep retriever vllm-gaudi guardrails nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
docker pull ghcr.io/huggingface/tei-gaudi:1.5.0

View File

@@ -35,7 +35,7 @@ function build_docker_images() {
service_list="chatqna chatqna-ui dataprep retriever nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
docker pull ghcr.io/huggingface/tei-gaudi:1.5.0

View File

@@ -4,7 +4,7 @@
services:
codegen-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: codegen-tgi-service
ports:
- "${CODEGEN_TGI_SERVICE_PORT:-8028}:80"

View File

@@ -4,7 +4,7 @@
services:
codetrans-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: codetrans-tgi-service
ports:
- "${CODETRANS_TGI_SERVICE_PORT:-8008}:80"

View File

@@ -35,7 +35,7 @@ function build_docker_images() {
service_list="codetrans codetrans-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -7,7 +7,7 @@ version: "3.8"
services:
dbqna-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: dbqna-tgi-service
ports:
- "${DBQNA_TGI_SERVICE_PORT:-8008}:80"

View File

@@ -108,7 +108,7 @@ docker run --name test-text2sql-postgres --ipc=host -e POSTGRES_USER=${POSTGRES_
```bash
docker run -d --name="test-text2sql-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HF_TOKEN} -e model=${model} ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id $model
docker run -d --name="test-text2sql-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HF_TOKEN} -e model=${model} ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
```
- Start Text-to-SQL Service

View File

@@ -31,7 +31,7 @@ function build_docker_images() {
service_list="text2sql text2sql-react-ui"
docker compose -f build.yaml build ${service_list} --no-cache > "${LOG_PATH}"/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -72,7 +72,7 @@ For gated models, you also need to provide [HuggingFace token](https://huggingfa
Since the `compose.yaml` consumes several environment variables, you need to set them up in advance as below.
```bash
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export HOST_IP=${host_ip}
export DOCSUM_TGI_SERVICE_PORT="18882"

View File

@@ -3,7 +3,7 @@
services:
docsum-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: docsum-tgi-service
ports:
- "${DOCSUM_TGI_SERVICE_PORT}:80"

View File

@@ -5,7 +5,7 @@
export MAX_INPUT_TOKENS=2048
export MAX_TOTAL_TOKENS=4096
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export HOST_IP=${host_ip}
export DOCSUM_TGI_SERVICE_PORT="8008"

View File

@@ -9,7 +9,7 @@ The DocSum application is defined as a Custom Resource (CR) file that the above
The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image `llm-docsum-tgi:latest`, which internally leverages
the image `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu`. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.0.6`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use `Intel/neural-chat-7b-v3-3`.
service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.3.1`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use `Intel/neural-chat-7b-v3-3`.
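A minimal sketch of that export, using the model named above:

```bash
# Model served by the TGI service in this example
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
```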
[NOTE]
Refer to [Docker Xeon README](https://github.com/opea-project/GenAIExamples/blob/main/DocSum/docker_compose/intel/cpu/xeon/README.md) or

View File

@@ -15,7 +15,7 @@ export MAX_INPUT_TOKENS=1024
export MAX_TOTAL_TOKENS=2048
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export HOST_IP=${ip_address}
export host_ip=${ip_address}
@@ -45,7 +45,7 @@ function build_docker_images() {
service_list="docsum docsum-gradio-ui whisper llm-docsum"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:1.4
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1
docker images && sleep 1s
}

View File

@@ -67,6 +67,8 @@ function build_docker_images() {
service_list="docsum docsum-gradio-ui whisper llm-docsum vllm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1
docker images && sleep 1s
}

View File

@@ -93,7 +93,7 @@ services:
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
restart: unless-stopped
tgi-rocm:
image: ghcr.io/huggingface/text-generation-inference:3.0.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: tgi-llava-rocm-server
ports:
- "8399:80"

View File

@@ -86,7 +86,7 @@ services:
restart: unless-stopped
search-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: search-tgi-service
ports:
- "${SEARCH_TGI_SERVICE_PORT:-3006}:80"

View File

@@ -37,7 +37,7 @@ function build_docker_images() {
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
docker images && sleep 1s
}

View File

@@ -24,7 +24,7 @@ function build_docker_images() {
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -3,7 +3,7 @@
services:
translation-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
container_name: translation-tgi-service
ports:
- "${TRANSLATION_TGI_SERVICE_PORT:-8008}:80"

View File

@@ -34,7 +34,7 @@ function build_docker_images() {
service_list="translation translation-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
docker images && sleep 1s
}

View File

@@ -32,7 +32,7 @@ rm -rf vllm-fork
```bash
# TGI (Optional)
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
```
### 3. Build MegaService Docker Image
@@ -58,7 +58,7 @@ docker build --no-cache -t opea/visualqna-ui:latest --build-arg https_proxy=$htt
Then run the command `docker images`; you should see the following 5 Docker images:
1. `opea/vllm-gaudi:latest`
2. `ghcr.io/huggingface/tgi-gaudi:2.0.6` (Optional)
2. `ghcr.io/huggingface/tgi-gaudi:2.3.1` (Optional)
3. `opea/lvm:latest`
4. `opea/visualqna:latest`
5. `opea/visualqna-ui:latest`
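Optionally, a quick filter can confirm the list (illustrative pattern; adjust if your image names differ):

```bash
# Show only repository:tag and keep the images listed above
docker images --format '{{.Repository}}:{{.Tag}}' | grep -E 'opea/|tgi-gaudi'
```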