From 3d3ac59bfb38b1e593bed0a87825bca1f8c9daad Mon Sep 17 00:00:00 2001 From: "Wang, Kai Lawrence" <109344418+wangkl2@users.noreply.github.com> Date: Mon, 20 Jan 2025 22:47:56 +0800 Subject: [PATCH] [ChatQnA] Update the default LLM to llama3-8B on cpu/gpu/hpu (#1430) Update the default LLM to llama3-8B on cpu/nvgpu/amdgpu/gaudi for docker-compose deployment to avoid the potential model serving issue or the missing chat-template issue using neural-chat-7b. Slow serving issue of neural-chat-7b on ICX: #1420 Signed-off-by: Wang, Kai Lawrence --- ChatQnA/README.md | 20 ++++++++++------- ChatQnA/chatqna.py | 2 +- ChatQnA/docker_compose/amd/gpu/rocm/README.md | 14 +++++++----- .../docker_compose/amd/gpu/rocm/set_env.sh | 2 +- .../docker_compose/intel/cpu/xeon/README.md | 22 ++++++++++--------- .../intel/cpu/xeon/README_pinecone.md | 18 ++++++++------- .../intel/cpu/xeon/README_qdrant.md | 16 ++++++++------ .../docker_compose/intel/cpu/xeon/set_env.sh | 2 +- .../docker_compose/intel/hpu/gaudi/README.md | 16 ++++++++------ .../hpu/gaudi/how_to_validate_service.md | 4 ++-- .../docker_compose/intel/hpu/gaudi/set_env.sh | 2 +- ChatQnA/docker_compose/nvidia/gpu/README.md | 14 +++++++----- ChatQnA/docker_compose/nvidia/gpu/set_env.sh | 2 +- .../tests/test_compose_guardrails_on_gaudi.sh | 4 ++-- ChatQnA/tests/test_compose_on_gaudi.sh | 4 ++-- ChatQnA/tests/test_compose_on_rocm.sh | 2 +- ChatQnA/tests/test_compose_on_xeon.sh | 4 ++-- .../tests/test_compose_pinecone_on_xeon.sh | 4 ++-- ChatQnA/tests/test_compose_qdrant_on_xeon.sh | 4 ++-- ChatQnA/tests/test_compose_tgi_on_gaudi.sh | 4 ++-- ChatQnA/tests/test_compose_tgi_on_xeon.sh | 4 ++-- .../test_compose_without_rerank_on_gaudi.sh | 4 ++-- .../test_compose_without_rerank_on_xeon.sh | 4 ++-- .../components/Conversation/Conversation.tsx | 2 +- .../ui/svelte/src/lib/network/chat/Network.ts | 2 +- 25 files changed, 96 insertions(+), 80 deletions(-) diff --git a/ChatQnA/README.md b/ChatQnA/README.md index 37fafc358..c605e883c 100644 --- a/ChatQnA/README.md +++ b/ChatQnA/README.md @@ -8,7 +8,7 @@ RAG bridges the knowledge gap by dynamically fetching relevant information from | Cloud Provider | Intel Architecture | Intel Optimized Cloud Module for Terraform | Comments | | -------------------- | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- | -| AWS | 4th Gen Intel Xeon with Intel AMX | [AWS Module](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna) | Uses Intel/neural-chat-7b-v3-3 by default | +| AWS | 4th Gen Intel Xeon with Intel AMX | [AWS Module](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna) | Uses meta-llama/Meta-Llama-3-8B-Instruct by default | | AWS Falcon2-11B | 4th Gen Intel Xeon with Intel AMX | [AWS Module with Falcon11B](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna-falcon11B) | Uses TII Falcon2-11B LLM Model | | GCP | 5th Gen Intel Xeon with Intel AMX | [GCP Module](https://github.com/intel/terraform-intel-gcp-vm/tree/main/examples/gen-ai-xeon-opea-chatqna) | Also supports Confidential AI by using Intel® TDX with 4th Gen Xeon | | Azure | 5th Gen Intel Xeon with Intel AMX | Work-in-progress | Work-in-progress | @@ -25,7 +25,7 @@ Use this if you are not using Terraform and have provisioned your system with an ## Manually Deploy 
ChatQnA Service -The ChatQnA service can be effortlessly deployed on Intel Gaudi2, Intel Xeon Scalable Processors and Nvidia GPU. +The ChatQnA service can be effortlessly deployed on Intel Gaudi2, Intel Xeon Scalable Processors,Nvidia GPU and AMD GPU. Two types of ChatQnA pipeline are supported now: `ChatQnA with/without Rerank`. And the `ChatQnA without Rerank` pipeline (including Embedding, Retrieval, and LLM) is offered for Xeon customers who can not run rerank service on HPU yet require high performance and accuracy. @@ -35,7 +35,11 @@ Quick Start Deployment Steps: 2. Run Docker Compose. 3. Consume the ChatQnA Service. -Note: If you do not have docker installed you can run this script to install docker : `bash docker_compose/install_docker.sh` +Note: + +1. If you do not have docker installed you can run this script to install docker : `bash docker_compose/install_docker.sh`. + +2. The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). ### Quick Start: 1.Setup Environment Variable @@ -209,11 +213,11 @@ Gaudi default compose.yaml By default, the embedding, reranking and LLM models are set to a default value as listed below: -| Service | Model | -| --------- | ------------------------- | -| Embedding | BAAI/bge-base-en-v1.5 | -| Reranking | BAAI/bge-reranker-base | -| LLM | Intel/neural-chat-7b-v3-3 | +| Service | Model | +| --------- | ----------------------------------- | +| Embedding | BAAI/bge-base-en-v1.5 | +| Reranking | BAAI/bge-reranker-base | +| LLM | meta-llama/Meta-Llama-3-8B-Instruct | Change the `xxx_MODEL_ID` in `docker_compose/xxx/set_env.sh` for your needs. diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py index 30e154c9e..104c6fdb1 100644 --- a/ChatQnA/chatqna.py +++ b/ChatQnA/chatqna.py @@ -57,7 +57,7 @@ RERANK_SERVER_HOST_IP = os.getenv("RERANK_SERVER_HOST_IP", "0.0.0.0") RERANK_SERVER_PORT = int(os.getenv("RERANK_SERVER_PORT", 80)) LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0") LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 80)) -LLM_MODEL = os.getenv("LLM_MODEL", "Intel/neural-chat-7b-v3-3") +LLM_MODEL = os.getenv("LLM_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct") def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs): diff --git a/ChatQnA/docker_compose/amd/gpu/rocm/README.md b/ChatQnA/docker_compose/amd/gpu/rocm/README.md index eadfc2f5d..cfd924554 100644 --- a/ChatQnA/docker_compose/amd/gpu/rocm/README.md +++ b/ChatQnA/docker_compose/amd/gpu/rocm/README.md @@ -10,6 +10,8 @@ Quick Start Deployment Steps: 2. Run Docker Compose. 3. Consume the ChatQnA Service. +Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). 
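If you plan to pull the model from Hugging Face, it is worth confirming up front that your token has actually been granted access to the gated repository. A minimal sketch of such a check, assuming the `huggingface-cli` that ships with the `huggingface_hub` package is installed:

```bash
# Sketch: verify gated-repo access before starting the stack.
# HF_TOKEN must belong to the account that requested access to the model.
export HF_TOKEN=${your_hf_token}
pip install -U huggingface_hub            # provides the huggingface-cli entry point
huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct config.json --token "$HF_TOKEN"
# A 401/403 error here means the token has not (yet) been granted access to the repo.
```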
+ ## Quick Start: 1.Setup Environment Variable To set up environment variables for deploying ChatQnA services, follow these steps: @@ -155,11 +157,11 @@ Then run the command `docker images`, you will have the following 5 Docker Image By default, the embedding, reranking and LLM models are set to a default value as listed below: -| Service | Model | -| --------- | ------------------------- | -| Embedding | BAAI/bge-base-en-v1.5 | -| Reranking | BAAI/bge-reranker-base | -| LLM | Intel/neural-chat-7b-v3-3 | +| Service | Model | +| --------- | ----------------------------------- | +| Embedding | BAAI/bge-base-en-v1.5 | +| Reranking | BAAI/bge-reranker-base | +| LLM | meta-llama/Meta-Llama-3-8B-Instruct | Change the `xxx_MODEL_ID` below for your needs. @@ -179,7 +181,7 @@ Change the `xxx_MODEL_ID` below for your needs. export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" export CHATQNA_EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export CHATQNA_RERANK_MODEL_ID="BAAI/bge-reranker-base" - export CHATQNA_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export CHATQNA_LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export CHATQNA_TGI_SERVICE_PORT=8008 export CHATQNA_TEI_EMBEDDING_PORT=8090 export CHATQNA_TEI_EMBEDDING_ENDPOINT="http://${HOST_IP}:${CHATQNA_TEI_EMBEDDING_PORT}" diff --git a/ChatQnA/docker_compose/amd/gpu/rocm/set_env.sh b/ChatQnA/docker_compose/amd/gpu/rocm/set_env.sh index 8071ebdd9..6d6480ac6 100644 --- a/ChatQnA/docker_compose/amd/gpu/rocm/set_env.sh +++ b/ChatQnA/docker_compose/amd/gpu/rocm/set_env.sh @@ -6,7 +6,7 @@ export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" export CHATQNA_EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export CHATQNA_RERANK_MODEL_ID="BAAI/bge-reranker-base" -export CHATQNA_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export CHATQNA_LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export CHATQNA_TGI_SERVICE_PORT=18008 export CHATQNA_TEI_EMBEDDING_PORT=18090 export CHATQNA_TEI_EMBEDDING_ENDPOINT="http://${HOST_IP}:${CHATQNA_TEI_EMBEDDING_PORT}" diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md index 40610ad73..01a00a819 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md @@ -10,6 +10,8 @@ Quick Start: 2. Run Docker Compose. 3. Consume the ChatQnA Service. +Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). + ## Quick Start: 1.Setup Environment Variable To set up environment variables for deploying ChatQnA services, follow these steps: @@ -180,11 +182,11 @@ Then run the command `docker images`, you will have the following 5 Docker Image By default, the embedding, reranking and LLM models are set to a default value as listed below: -| Service | Model | -| --------- | ------------------------- | -| Embedding | BAAI/bge-base-en-v1.5 | -| Reranking | BAAI/bge-reranker-base | -| LLM | Intel/neural-chat-7b-v3-3 | +| Service | Model | +| --------- | ----------------------------------- | +| Embedding | BAAI/bge-base-en-v1.5 | +| Reranking | BAAI/bge-reranker-base | +| LLM | meta-llama/Meta-Llama-3-8B-Instruct | Change the `xxx_MODEL_ID` below for your needs. 
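For example, a minimal sketch of overriding the LLM for the Xeon deployment without editing `set_env.sh` (the compose files resolve `LLM_MODEL_ID` and the other `*_MODEL_ID` variables from the shell environment):

```bash
cd ChatQnA/docker_compose/intel/cpu/xeon
source ./set_env.sh
export LLM_MODEL_ID="<your-model-id>"   # override the default after sourcing set_env.sh
docker compose up -d
```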
@@ -195,7 +197,7 @@ For users in China who are unable to download models directly from Huggingface, ```bash export HF_TOKEN=${your_hf_token} export HF_ENDPOINT="https://hf-mirror.com" - model_name="Intel/neural-chat-7b-v3-3" + model_name="meta-llama/Meta-Llama-3-8B-Instruct" # Start vLLM LLM Service docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 # Start TGI LLM Service @@ -204,7 +206,7 @@ For users in China who are unable to download models directly from Huggingface, 2. Offline - - Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/ai-modelscope/neural-chat-7b-v3-1/files) for model `neural-chat-7b-v3-1`. + - Search your model name in ModelScope. For example, check [this page](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/files) for model `Meta-Llama-3-8B-Instruct`. - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`. @@ -337,7 +339,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v # either vLLM or TGI service curl http://${host_ip}:9009/v1/chat/completions \ -X POST \ - -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ + -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` @@ -450,7 +452,7 @@ Users could follow previous section to testing vLLM microservice or ChatQnA Mega ```bash curl http://${host_ip}:9009/start_profile \ -H "Content-Type: application/json" \ - -d '{"model": "Intel/neural-chat-7b-v3-3"}' + -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct"}' ``` Users would see below docker logs from vllm-service if profiling is started correctly. @@ -473,7 +475,7 @@ By following command, users could stop vLLM profliing and generate a \*.pt.trace # vLLM Service curl http://${host_ip}:9009/stop_profile \ -H "Content-Type: application/json" \ - -d '{"model": "Intel/neural-chat-7b-v3-3"}' + -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct"}' ``` Users would see below docker logs from vllm-service if profiling is stopped correctly. diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md index e7b564db2..8e8a9cd44 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md @@ -10,6 +10,8 @@ Quick Start: 2. Run Docker Compose. 3. Consume the ChatQnA Service. +Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). 
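The ModelScope download mentioned above can also be scripted instead of using the web page. A rough sketch, assuming the `modelscope` Python package is available (the target path is a placeholder):

```bash
pip install modelscope
python -c "from modelscope import snapshot_download; \
print(snapshot_download('LLM-Research/Meta-Llama-3-8B-Instruct', cache_dir='/path/to/model'))"
# The printed path is the local model directory to mount into the serving container.
```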
+ ## Quick Start: 1.Setup Environment Variable To set up environment variables for deploying ChatQnA services, follow these steps: @@ -183,11 +185,11 @@ Then run the command `docker images`, you will have the following 5 Docker Image By default, the embedding, reranking and LLM models are set to a default value as listed below: -| Service | Model | -| --------- | ------------------------- | -| Embedding | BAAI/bge-base-en-v1.5 | -| Reranking | BAAI/bge-reranker-base | -| LLM | Intel/neural-chat-7b-v3-3 | +| Service | Model | +| --------- | ----------------------------------- | +| Embedding | BAAI/bge-base-en-v1.5 | +| Reranking | BAAI/bge-reranker-base | +| LLM | meta-llama/Meta-Llama-3-8B-Instruct | Change the `xxx_MODEL_ID` below for your needs. @@ -198,13 +200,13 @@ For users in China who are unable to download models directly from Huggingface, ```bash export HF_TOKEN=${your_hf_token} export HF_ENDPOINT="https://hf-mirror.com" - model_name="Intel/neural-chat-7b-v3-3" + model_name="meta-llama/Meta-Llama-3-8B-Instruct" docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 ``` 2. Offline - - Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/ai-modelscope/neural-chat-7b-v3-1/files) for model `neural-chat-7b-v3-1`. + - Search your model name in ModelScope. For example, check [this page](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/files) for model `Meta-Llama-3-8B-Instruct`. - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`. @@ -324,7 +326,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v ```bash curl http://${host_ip}:9009/v1/chat/completions \ -X POST \ - -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ + -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md index c0b173436..9ca08fc24 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md @@ -4,6 +4,8 @@ This document outlines the deployment process for a ChatQnA application utilizin The default pipeline deploys with vLLM as the LLM serving component and leverages rerank component. +Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). + ## 🚀 Apply Xeon Server on AWS To apply a Xeon server on AWS, start by creating an AWS account if you don't have one already. Then, head to the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home) to begin the process. Within the EC2 service, select the Amazon EC2 M7i or M7i-flex instance type to leverage the power of 4th Generation Intel Xeon Scalable processors. These instances are optimized for high-performance computing and demanding workloads. 
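Once the instance is up, an optional sanity check is to confirm that Intel AMX is actually exposed to the VM before deploying:

```bash
# On 4th Gen Xeon (Sapphire Rapids) instances you should see amx_bf16, amx_int8 and amx_tile.
lscpu | grep -o 'amx[^ ]*' | sort -u
```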
@@ -141,11 +143,11 @@ Then run the command `docker images`, you will have the following 5 Docker Image By default, the embedding, reranking and LLM models are set to a default value as listed below: -| Service | Model | -| --------- | ------------------------- | -| Embedding | BAAI/bge-base-en-v1.5 | -| Reranking | BAAI/bge-reranker-base | -| LLM | Intel/neural-chat-7b-v3-3 | +| Service | Model | +| --------- | ----------------------------------- | +| Embedding | BAAI/bge-base-en-v1.5 | +| Reranking | BAAI/bge-reranker-base | +| LLM | meta-llama/Meta-Llama-3-8B-Instruct | Change the `xxx_MODEL_ID` below for your needs. @@ -181,7 +183,7 @@ export http_proxy=${your_http_proxy} export https_proxy=${your_http_proxy} export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" -export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-qdrant" ``` @@ -256,7 +258,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v ```bash curl http://${host_ip}:6042/v1/chat/completions \ -X POST \ - -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ + -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh b/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh index 607225cf4..cc6779cf7 100755 --- a/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh +++ b/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh @@ -9,7 +9,7 @@ popd > /dev/null export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" -export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" # Set it as a non-null string, such as true, if you want to enable logging facility, # otherwise, keep it as "" to disable it. diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md index 02e418205..642170468 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md @@ -10,6 +10,8 @@ Quick Start: 2. Run Docker Compose. 3. Consume the ChatQnA Service. +Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). + ## Quick Start: 1.Setup Environment Variable To set up environment variables for deploying ChatQnA services, follow these steps: @@ -178,11 +180,11 @@ If Guardrails docker image is built, you will find one more image: By default, the embedding, reranking and LLM models are set to a default value as listed below: -| Service | Model | -| --------- | ------------------------- | -| Embedding | BAAI/bge-base-en-v1.5 | -| Reranking | BAAI/bge-reranker-base | -| LLM | Intel/neural-chat-7b-v3-3 | +| Service | Model | +| --------- | ----------------------------------- | +| Embedding | BAAI/bge-base-en-v1.5 | +| Reranking | BAAI/bge-reranker-base | +| LLM | meta-llama/Meta-Llama-3-8B-Instruct | Change the `xxx_MODEL_ID` below for your needs. 
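After the containers are up, one quick way to confirm that the serving backend actually loaded the configured model is to query its model endpoint. A sketch; replace the port with the LLM serving port from your compose file (e.g. 8007 in the Gaudi vLLM example below):

```bash
# vLLM exposes an OpenAI-compatible model listing; the reported id should match LLM_MODEL_ID.
curl http://${host_ip}:8007/v1/models
# For a TGI backend, the /info route reports the loaded model instead:
# curl http://${host_ip}:${TGI_PORT}/info
```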
@@ -193,7 +195,7 @@ For users in China who are unable to download models directly from Huggingface, ```bash export HF_TOKEN=${your_hf_token} export HF_ENDPOINT="https://hf-mirror.com" - model_name="Intel/neural-chat-7b-v3-3" + model_name="meta-llama/Meta-Llama-3-8B-Instruct" # Start vLLM LLM Service docker run -p 8007:80 -v ./data:/data --name vllm-gaudi-server -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e VLLM_TORCH_PROFILER_DIR="/mnt" --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 # Start TGI LLM Service @@ -202,7 +204,7 @@ For users in China who are unable to download models directly from Huggingface, 2. Offline - - Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/ai-modelscope/neural-chat-7b-v3-1/files) for model `neural-chat-7b-v3-1`. + - Search your model name in ModelScope. For example, check [this page](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/files) for model `Meta-Llama-3-8B-Instruct`. - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`. diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md index 7d4e40b6a..bb624bd68 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md @@ -231,7 +231,7 @@ and the log shows model warm up, please wait for a while and try it later. ``` 2024-06-05T05:45:27.707509646Z 2024-06-05T05:45:27.707361Z WARN text_generation_router: router/src/main.rs:357: `--revision` is not set 2024-06-05T05:45:27.707539740Z 2024-06-05T05:45:27.707379Z WARN text_generation_router: router/src/main.rs:358: We strongly advise to set it to a known supported commit. -2024-06-05T05:45:27.852525522Z 2024-06-05T05:45:27.852437Z INFO text_generation_router: router/src/main.rs:379: Serving revision bdd31cf498d13782cc7497cba5896996ce429f91 of model Intel/neural-chat-7b-v3-3 +2024-06-05T05:45:27.852525522Z 2024-06-05T05:45:27.852437Z INFO text_generation_router: router/src/main.rs:379: Serving revision bdd31cf498d13782cc7497cba5896996ce429f91 of model meta-llama/Meta-Llama-3-8B-Instruct 2024-06-05T05:45:27.867833811Z 2024-06-05T05:45:27.867759Z INFO text_generation_router: router/src/main.rs:221: Warming up model ``` @@ -239,7 +239,7 @@ and the log shows model warm up, please wait for a while and try it later. ``` curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{ - "model": "Intel/neural-chat-7b-v3-3", + "model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?" 
}' ``` diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh index fc4074e61..4e06c3b28 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh @@ -9,7 +9,7 @@ popd > /dev/null export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" -export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" # Set it as a non-null string, such as true, if you want to enable logging facility, # otherwise, keep it as "" to disable it. diff --git a/ChatQnA/docker_compose/nvidia/gpu/README.md b/ChatQnA/docker_compose/nvidia/gpu/README.md index b284ab47d..b1ab3e8ba 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/README.md +++ b/ChatQnA/docker_compose/nvidia/gpu/README.md @@ -9,6 +9,8 @@ Quick Start Deployment Steps: 3. Run Docker Compose. 4. Consume the ChatQnA Service. +Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). + ## Quick Start: 1.Setup Environment Variable To set up environment variables for deploying ChatQnA services, follow these steps: @@ -165,11 +167,11 @@ Then run the command `docker images`, you will have the following 5 Docker Image By default, the embedding, reranking and LLM models are set to a default value as listed below: -| Service | Model | -| --------- | ------------------------- | -| Embedding | BAAI/bge-base-en-v1.5 | -| Reranking | BAAI/bge-reranker-base | -| LLM | Intel/neural-chat-7b-v3-3 | +| Service | Model | +| --------- | ----------------------------------- | +| Embedding | BAAI/bge-base-en-v1.5 | +| Reranking | BAAI/bge-reranker-base | +| LLM | meta-llama/Meta-Llama-3-8B-Instruct | Change the `xxx_MODEL_ID` below for your needs. 
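Before bringing up the NVIDIA GPU deployment, it can save time to confirm that Docker can see the GPUs at all. A quick check, assuming the NVIDIA Container Toolkit is installed (the image tag is only an example):

```bash
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
# If this fails, (re)install or configure the NVIDIA Container Toolkit before running docker compose.
```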
@@ -287,7 +289,7 @@ docker compose up -d ```bash curl http://${host_ip}:8008/v1/chat/completions \ -X POST \ - -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ + -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` diff --git a/ChatQnA/docker_compose/nvidia/gpu/set_env.sh b/ChatQnA/docker_compose/nvidia/gpu/set_env.sh index dd7421e68..c59ae9c6b 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/set_env.sh +++ b/ChatQnA/docker_compose/nvidia/gpu/set_env.sh @@ -6,7 +6,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" -export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090" export INDEX_NAME="rag-redis" export MEGA_SERVICE_HOST_IP=${host_ip} diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh index 22de3e78a..08fc577ed 100644 --- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh @@ -34,7 +34,7 @@ function start_services() { cd $WORKPATH/docker_compose/intel/hpu/gaudi export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B" @@ -132,7 +132,7 @@ function validate_microservices() { "content" \ "vllm-llm" \ "vllm-gaudi-server" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' # guardrails microservice validate_service \ diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh index 099f032bc..4737a9603 100644 --- a/ChatQnA/tests/test_compose_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_on_gaudi.sh @@ -32,7 +32,7 @@ function start_services() { cd $WORKPATH/docker_compose/intel/hpu/gaudi export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export host_ip=${ip_address} @@ -115,7 +115,7 @@ function validate_microservices() { "content" \ "vllm-llm" \ "vllm-gaudi-server" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_on_rocm.sh b/ChatQnA/tests/test_compose_on_rocm.sh index e73a0aef5..9ea597977 100644 --- a/ChatQnA/tests/test_compose_on_rocm.sh +++ b/ChatQnA/tests/test_compose_on_rocm.sh @@ -18,7 +18,7 @@ export HOST_IP=${ip_address} export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" export 
CHATQNA_EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export CHATQNA_RERANK_MODEL_ID="BAAI/bge-reranker-base" -export CHATQNA_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export CHATQNA_LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export CHATQNA_TGI_SERVICE_PORT=9009 export CHATQNA_TEI_EMBEDDING_PORT=8090 export CHATQNA_TEI_EMBEDDING_ENDPOINT="http://${HOST_IP}:${CHATQNA_TEI_EMBEDDING_PORT}" diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh index 80c74f1a3..e4cc097d7 100644 --- a/ChatQnA/tests/test_compose_on_xeon.sh +++ b/ChatQnA/tests/test_compose_on_xeon.sh @@ -33,7 +33,7 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export host_ip=${ip_address} @@ -115,7 +115,7 @@ function validate_microservices() { "content" \ "vllm-llm" \ "vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_pinecone_on_xeon.sh b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh index 4f6dd1158..4e9845a77 100755 --- a/ChatQnA/tests/test_compose_pinecone_on_xeon.sh +++ b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh @@ -33,7 +33,7 @@ function start_services() { export no_proxy=${no_proxy},${ip_address} export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export PINECONE_API_KEY=${PINECONE_KEY_LANGCHAIN_TEST} export PINECONE_INDEX_NAME="langchain-test" export INDEX_NAME="langchain-test" @@ -153,7 +153,7 @@ function validate_microservices() { "content" \ "vllm-llm" \ "vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh index d0ad922fd..91f8a597a 100644 --- a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh +++ b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh @@ -31,7 +31,7 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-qdrant" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} @@ -137,7 +137,7 @@ function validate_microservices() { "content" \ "vllm-llm" \ "vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh 
b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh index 303df2b61..0b075d04a 100644 --- a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh @@ -45,7 +45,7 @@ function start_services() { cd $WORKPATH/docker_compose/intel/hpu/gaudi export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+') @@ -173,7 +173,7 @@ function validate_microservices() { "content" \ "tgi-llm" \ "tgi-gaudi-server" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_tgi_on_xeon.sh b/ChatQnA/tests/test_compose_tgi_on_xeon.sh index 0746756f3..9d96dcc52 100644 --- a/ChatQnA/tests/test_compose_tgi_on_xeon.sh +++ b/ChatQnA/tests/test_compose_tgi_on_xeon.sh @@ -45,7 +45,7 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} @@ -170,7 +170,7 @@ function validate_microservices() { "content" \ "tgi-llm" \ "tgi-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh index 58d3f71c7..e89e8d5cd 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh @@ -32,7 +32,7 @@ function build_docker_images() { function start_services() { cd $WORKPATH/docker_compose/intel/hpu/gaudi export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} @@ -149,7 +149,7 @@ function validate_microservices() { "content" \ "vllm-llm" \ "vllm-gaudi-server" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh index ebb76eb6c..b619b3e30 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh @@ -32,7 +32,7 @@ function start_services() { cd $WORKPATH/docker_compose/intel/cpu/xeon export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export 
LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} @@ -148,7 +148,7 @@ function validate_microservices() { "content" \ "vllm-llm" \ "vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { diff --git a/ChatQnA/ui/react/src/components/Conversation/Conversation.tsx b/ChatQnA/ui/react/src/components/Conversation/Conversation.tsx index 02736d8bd..d7c7fcfc4 100644 --- a/ChatQnA/ui/react/src/components/Conversation/Conversation.tsx +++ b/ChatQnA/ui/react/src/components/Conversation/Conversation.tsx @@ -58,7 +58,7 @@ const Conversation = ({ title }: ConversationProps) => { conversationId: selectedConversationId, userPrompt, messages, - model: "Intel/neural-chat-7b-v3-3", + model: "meta-llama/Meta-Llama-3-8B-Instruct", }) setPrompt("") } diff --git a/ChatQnA/ui/svelte/src/lib/network/chat/Network.ts b/ChatQnA/ui/svelte/src/lib/network/chat/Network.ts index cafc34675..060c5a5ff 100644 --- a/ChatQnA/ui/svelte/src/lib/network/chat/Network.ts +++ b/ChatQnA/ui/svelte/src/lib/network/chat/Network.ts @@ -21,7 +21,7 @@ const MODEL_ID = env.MODEL_ID; export async function fetchTextStream(query: string) { let payload = {}; let url = ""; - let modelId = "Intel/neural-chat-7b-v3-3"; + let modelId = "meta-llama/Meta-Llama-3-8B-Instruct"; if (MODEL_ID) { modelId = MODEL_ID;