Compare commits

71 Commits

| SHA1 |
|---|
| de96cd4dcf |
| 555e2405b9 |
| 7a92435269 |
| c9085c3c68 |
| 36aaed748b |
| 9180f1066d |
| 5aecea8e47 |
| 6723395e31 |
| 785ffb9a1e |
| 428ba481b2 |
| 769105b986 |
| 2dfcfa0436 |
| 8a5ad1fc72 |
| 24cacaaa48 |
| 6ead1b12db |
| 8dac9d1035 |
| c1b5ba281f |
| 641f60c76c |
| 8f8d3af7c3 |
| e4de76da78 |
| ce38a84372 |
| e8b07c28ec |
| 7b3a125bdf |
| fba0de45d2 |
| f2a5644d9c |
| 6cd7827365 |
| 3d8009aa91 |
| 78f8ae524d |
| 6abf7652e8 |
| 25c1aefc27 |
| d46df4331d |
| 23a77df302 |
| 852bc7027c |
| a7eced4161 |
| caec354324 |
| d482554a6b |
| 2ae6871fc5 |
| 2ac5be9921 |
| 799881a3fa |
| e5c6418c81 |
| 0c0edffc5b |
| 9f36e84c1c |
| 8c547c2ba5 |
| 80dd86f122 |
| 6d781f7b2b |
| abafd5de20 |
| 970b869838 |
| 87ff149f61 |
| c39a569ab2 |
| 81b02bb947 |
| 47069ac70c |
| 6ce7730863 |
| ad5523bac7 |
| 88a8235f21 |
| 63ad850052 |
| 9a0c547112 |
| 26a6da4123 |
| 45d5da2ddd |
| 1b3291a1c8 |
| 7ac8cf517a |
| 44a689b0bf |
| 388d3eb5c5 |
| ef9ad61440 |
| 4c41a5db83 |
| 9adf7a6af0 |
| a4d028e8ea |
| 32d4f714fd |
| fdbc27a9b5 |
| 5f4b1828a5 |
| 39abef8be8 |
| ed163087ba |
19
.github/workflows/_example-workflow.yml
vendored
@@ -43,7 +43,11 @@ on:
|
||||
inject_commit:
|
||||
default: false
|
||||
required: false
|
||||
type: string
|
||||
type: boolean
|
||||
use_model_cache:
|
||||
default: false
|
||||
required: false
|
||||
type: boolean
|
||||
|
||||
jobs:
|
||||
####################################################################################################
|
||||
@@ -74,12 +78,16 @@ jobs:
|
||||
cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
|
||||
docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
|
||||
if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm.git
|
||||
cd vllm && git rev-parse HEAD && cd ../
|
||||
git clone https://github.com/vllm-project/vllm.git && cd vllm
|
||||
# Get the latest tag
|
||||
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
|
||||
echo "Check out vLLM tag ${VLLM_VER}"
|
||||
git checkout ${VLLM_VER} &> /dev/null
|
||||
# make sure do not change the pwd
|
||||
git rev-parse HEAD && cd ../
|
||||
fi
|
||||
if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
|
||||
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
|
||||
sed -i 's/triton/triton==3.1.0/g' vllm-fork/requirements-hpu.txt
|
||||
fi
|
||||
git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
|
||||
cd GenAIComps && git rev-parse HEAD && cd ../
|
||||
@@ -106,6 +114,7 @@ jobs:
|
||||
tag: ${{ inputs.tag }}
|
||||
example: ${{ inputs.example }}
|
||||
hardware: ${{ inputs.node }}
|
||||
use_model_cache: ${{ inputs.use_model_cache }}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
@@ -127,7 +136,7 @@ jobs:
|
||||
####################################################################################################
|
||||
test-gmc-pipeline:
|
||||
needs: [build-images]
|
||||
if: ${{ fromJSON(inputs.test_gmc) }}
|
||||
if: false # ${{ fromJSON(inputs.test_gmc) }}
|
||||
uses: ./.github/workflows/_gmc-e2e.yml
|
||||
with:
|
||||
example: ${{ inputs.example }}
|
||||
|
||||
1
.github/workflows/_helm-e2e.yml
vendored
@@ -97,6 +97,7 @@ jobs:
|
||||
|
||||
helm-test:
|
||||
needs: [get-test-case]
|
||||
if: ${{ needs.get-test-case.outputs.value_files != '[]' }}
|
||||
strategy:
|
||||
matrix:
|
||||
value_file: ${{ fromJSON(needs.get-test-case.outputs.value_files) }}
|
||||
|
||||
19
.github/workflows/_run-docker-compose.yml
vendored
@@ -28,6 +28,10 @@ on:
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
use_model_cache:
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
jobs:
|
||||
get-test-case:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -85,12 +89,17 @@ jobs:
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -z "$run_test_cases" ] && [[ $(printf '%s\n' "${changed_files[@]}" | grep ${{ inputs.example }} | grep /tests/) ]]; then
|
||||
run_test_cases=$other_test_cases
|
||||
fi
|
||||
|
||||
test_cases=$(echo $run_test_cases | tr ' ' '\n' | sort -u | jq -R '.' | jq -sc '.')
|
||||
echo "test_cases=$test_cases"
|
||||
echo "test_cases=$test_cases" >> $GITHUB_OUTPUT
|
||||
|
||||
compose-test:
|
||||
needs: [get-test-case]
|
||||
if: ${{ needs.get-test-case.outputs.test_cases != '' }}
|
||||
strategy:
|
||||
matrix:
|
||||
test_case: ${{ fromJSON(needs.get-test-case.outputs.test_cases) }}
|
||||
@@ -126,6 +135,7 @@ jobs:
|
||||
shell: bash
|
||||
env:
|
||||
HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
|
||||
GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }}
|
||||
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
|
||||
PINECONE_KEY: ${{ secrets.PINECONE_KEY }}
|
||||
@@ -138,9 +148,18 @@ jobs:
|
||||
example: ${{ inputs.example }}
|
||||
hardware: ${{ inputs.hardware }}
|
||||
test_case: ${{ matrix.test_case }}
|
||||
use_model_cache: ${{ inputs.use_model_cache }}
|
||||
run: |
|
||||
cd ${{ github.workspace }}/$example/tests
|
||||
if [[ "$IMAGE_REPO" == "" ]]; then export IMAGE_REPO="${OPEA_IMAGE_REPO}opea"; fi
|
||||
if [[ "$use_model_cache" == "true" ]]; then
|
||||
if [ -d "/data2/hf_model" ]; then
|
||||
export model_cache="/data2/hf_model"
|
||||
else
|
||||
echo "Model cache directory /data2/hf_model does not exist"
|
||||
export model_cache="~/.cache/huggingface/hub"
|
||||
fi
|
||||
fi
|
||||
if [ -f ${test_case} ]; then timeout 30m bash ${test_case}; else echo "Test script {${test_case}} not found, skip test!"; fi
|
||||
|
||||
- name: Clean up container after test
|
||||
|
||||
2
.github/workflows/manual-docker-publish.yml
vendored
@@ -41,9 +41,11 @@ jobs:
|
||||
|
||||
publish:
|
||||
needs: [get-image-list]
|
||||
if: ${{ needs.get-image-list.outputs.matrix != '' }}
|
||||
strategy:
|
||||
matrix:
|
||||
image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }}
|
||||
fail-fast: false
|
||||
runs-on: "docker-build-${{ inputs.node }}"
|
||||
steps:
|
||||
- uses: docker/login-action@v3.2.0
|
||||
|
||||
1
.github/workflows/manual-docker-scan.yml
vendored
@@ -47,6 +47,7 @@ jobs:
|
||||
scan-docker:
|
||||
needs: get-image-list
|
||||
runs-on: "docker-build-${{ inputs.node }}"
|
||||
if: ${{ needs.get-image-list.outputs.matrix != '' }}
|
||||
strategy:
|
||||
matrix:
|
||||
image: ${{ fromJson(needs.get-image-list.outputs.matrix) }}
|
||||
|
||||
35
.github/workflows/manual-example-workflow.yml
vendored
@@ -20,11 +20,11 @@ on:
|
||||
description: "Tag to apply to images"
|
||||
required: true
|
||||
type: string
|
||||
deploy_gmc:
|
||||
default: false
|
||||
description: 'Whether to deploy gmc'
|
||||
required: true
|
||||
type: boolean
|
||||
# deploy_gmc:
|
||||
# default: false
|
||||
# description: 'Whether to deploy gmc'
|
||||
# required: true
|
||||
# type: boolean
|
||||
build:
|
||||
default: true
|
||||
description: 'Build test required images for Examples'
|
||||
@@ -40,11 +40,11 @@ on:
|
||||
description: 'Test examples with helm charts'
|
||||
required: false
|
||||
type: boolean
|
||||
test_gmc:
|
||||
default: false
|
||||
description: 'Test examples with gmc'
|
||||
required: false
|
||||
type: boolean
|
||||
# test_gmc:
|
||||
# default: false
|
||||
# description: 'Test examples with gmc'
|
||||
# required: false
|
||||
# type: boolean
|
||||
opea_branch:
|
||||
default: "main"
|
||||
description: 'OPEA branch for image build'
|
||||
@@ -54,7 +54,12 @@ on:
|
||||
default: false
|
||||
description: "inject commit to docker images true or false"
|
||||
required: false
|
||||
type: string
|
||||
type: boolean
|
||||
use_model_cache:
|
||||
default: false
|
||||
description: "use model cache true or false"
|
||||
required: false
|
||||
type: boolean
|
||||
|
||||
permissions: read-all
|
||||
jobs:
|
||||
@@ -76,7 +81,8 @@ jobs:
|
||||
|
||||
build-deploy-gmc:
|
||||
needs: [get-test-matrix]
|
||||
if: ${{ fromJSON(inputs.deploy_gmc) }}
|
||||
if: false
|
||||
#${{ fromJSON(inputs.deploy_gmc) }}
|
||||
strategy:
|
||||
matrix:
|
||||
node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
|
||||
@@ -89,7 +95,7 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
run-examples:
|
||||
needs: [get-test-matrix, build-deploy-gmc]
|
||||
needs: [get-test-matrix] #[get-test-matrix, build-deploy-gmc]
|
||||
if: always()
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -104,7 +110,8 @@ jobs:
|
||||
build: ${{ fromJSON(inputs.build) }}
|
||||
test_compose: ${{ fromJSON(inputs.test_compose) }}
|
||||
test_helmchart: ${{ fromJSON(inputs.test_helmchart) }}
|
||||
test_gmc: ${{ fromJSON(inputs.test_gmc) }}
|
||||
# test_gmc: ${{ fromJSON(inputs.test_gmc) }}
|
||||
opea_branch: ${{ inputs.opea_branch }}
|
||||
inject_commit: ${{ inputs.inject_commit }}
|
||||
use_model_cache: ${{ inputs.use_model_cache }}
|
||||
secrets: inherit
|
||||
|
||||
6
.github/workflows/manual-freeze-tag.yml
vendored
@@ -25,9 +25,9 @@ jobs:
|
||||
|
||||
- name: Set up Git
|
||||
run: |
|
||||
git config --global user.name "NeuralChatBot"
|
||||
git config --global user.email "grp_neural_chat_bot@intel.com"
|
||||
git remote set-url origin https://NeuralChatBot:"${{ secrets.ACTION_TOKEN }}"@github.com/opea-project/GenAIExamples.git
|
||||
git config --global user.name "CICD-at-OPEA"
|
||||
git config --global user.email "CICD@opea.dev"
|
||||
git remote set-url origin https://CICD-at-OPEA:"${{ secrets.ACTION_TOKEN }}"@github.com/opea-project/GenAIExamples.git
|
||||
|
||||
- name: Run script
|
||||
run: |
|
||||
|
||||
1
.github/workflows/manual-image-build.yml
vendored
@@ -51,6 +51,7 @@ jobs:
|
||||
|
||||
image-build:
|
||||
needs: get-test-matrix
|
||||
if: ${{ needs.get-test-matrix.outputs.nodes != '' }}
|
||||
strategy:
|
||||
matrix:
|
||||
node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
|
||||
|
||||
@@ -33,6 +33,7 @@ jobs:
|
||||
|
||||
clean-up:
|
||||
needs: get-build-matrix
|
||||
if: ${{ needs.get-image-list.outputs.matrix != '' }}
|
||||
strategy:
|
||||
matrix:
|
||||
node: ${{ fromJson(needs.get-build-matrix.outputs.nodes) }}
|
||||
@@ -47,6 +48,7 @@ jobs:
|
||||
|
||||
build:
|
||||
needs: [get-build-matrix, clean-up]
|
||||
if: ${{ needs.get-image-list.outputs.matrix != '' }}
|
||||
strategy:
|
||||
matrix:
|
||||
example: ${{ fromJson(needs.get-build-matrix.outputs.examples) }}
|
||||
|
||||
@@ -34,6 +34,7 @@ jobs:
|
||||
|
||||
build-and-test:
|
||||
needs: get-build-matrix
|
||||
if: ${{ needs.get-build-matrix.outputs.examples_json != '' }}
|
||||
strategy:
|
||||
matrix:
|
||||
example: ${{ fromJSON(needs.get-build-matrix.outputs.examples_json) }}
|
||||
@@ -53,9 +54,11 @@ jobs:
|
||||
|
||||
publish:
|
||||
needs: [get-build-matrix, get-image-list, build-and-test]
|
||||
if: ${{ needs.get-image-list.outputs.matrix != '' }}
|
||||
strategy:
|
||||
matrix:
|
||||
image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }}
|
||||
fail-fast: false
|
||||
runs-on: "docker-build-gaudi"
|
||||
steps:
|
||||
- uses: docker/login-action@v3.2.0
|
||||
|
||||
2
.github/workflows/pr-chart-e2e.yml
vendored
@@ -65,7 +65,7 @@ jobs:
|
||||
|
||||
helm-chart-test:
|
||||
needs: [job1]
|
||||
if: always() && ${{ needs.job1.outputs.run_matrix.example.length > 0 }}
|
||||
if: always() && ${{ fromJSON(needs.job1.outputs.run_matrix).length != 0 }}
|
||||
uses: ./.github/workflows/_helm-e2e.yml
|
||||
strategy:
|
||||
matrix: ${{ fromJSON(needs.job1.outputs.run_matrix) }}
|
||||
|
||||
3
.github/workflows/pr-docker-compose-e2e.yml
vendored
@@ -32,15 +32,16 @@ jobs:
|
||||
|
||||
example-test:
|
||||
needs: [get-test-matrix]
|
||||
if: ${{ needs.get-test-matrix.outputs.run_matrix != '' }}
|
||||
strategy:
|
||||
matrix: ${{ fromJSON(needs.get-test-matrix.outputs.run_matrix) }}
|
||||
fail-fast: false
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
uses: ./.github/workflows/_run-docker-compose.yml
|
||||
with:
|
||||
registry: "opea"
|
||||
tag: "ci"
|
||||
example: ${{ matrix.example }}
|
||||
hardware: ${{ matrix.hardware }}
|
||||
use_model_cache: true
|
||||
diff_excluded_files: '\.github|\.md|\.txt|kubernetes|gmc|assets|benchmark'
|
||||
secrets: inherit
|
||||
|
||||
1
.github/workflows/push-image-build.yml
vendored
@@ -24,6 +24,7 @@ jobs:
|
||||
|
||||
image-build:
|
||||
needs: job1
|
||||
if: ${{ needs.job1.outputs.run_matrix != '{"include":[]}' }}
|
||||
strategy:
|
||||
matrix: ${{ fromJSON(needs.job1.outputs.run_matrix) }}
|
||||
fail-fast: false
|
||||
|
||||
6
.github/workflows/scripts/get_test_matrix.sh
vendored
@@ -12,6 +12,7 @@ run_matrix="{\"include\":["
|
||||
|
||||
examples=$(printf '%s\n' "${changed_files[@]}" | grep '/' | cut -d'/' -f1 | sort -u)
|
||||
for example in ${examples}; do
|
||||
if [[ ! -d $WORKSPACE/$example ]]; then continue; fi
|
||||
cd $WORKSPACE/$example
|
||||
if [[ ! $(find . -type f | grep ${test_mode}) ]]; then continue; fi
|
||||
cd tests
|
||||
@@ -26,7 +27,10 @@ for example in ${examples}; do
|
||||
|
||||
run_hardware=""
|
||||
if [[ $(printf '%s\n' "${changed_files[@]}" | grep ${example} | cut -d'/' -f2 | grep -E '\.py|Dockerfile*|ui|docker_image_build' ) ]]; then
|
||||
# run test on all hardware if megaservice or ui code change
|
||||
echo "run test on all hardware if megaservice or ui code change..."
|
||||
run_hardware=$hardware_list
|
||||
elif [[ $(printf '%s\n' "${changed_files[@]}" | grep ${example} | grep 'tests'| cut -d'/' -f3 | grep -vE '^test_|^_test' ) ]]; then
|
||||
echo "run test on all hardware if common test scripts change..."
|
||||
run_hardware=$hardware_list
|
||||
else
|
||||
for hardware in ${hardware_list}; do
|
||||
|
||||
4
.github/workflows/weekly-update-images.yml
vendored
@@ -16,8 +16,8 @@ jobs:
|
||||
freeze-images:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
USER_NAME: "NeuralChatBot"
|
||||
USER_EMAIL: "grp_neural_chat_bot@intel.com"
|
||||
USER_NAME: "CICD-at-OPEA"
|
||||
USER_EMAIL: "CICD@opea.dev"
|
||||
BRANCH_NAME: "update_images_tag"
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
||||
@@ -84,7 +84,7 @@ flowchart LR
|
||||
3. Hierarchical multi-agents can improve performance.
|
||||
Expert worker agents, such as the RAG agent and the SQL agent, can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information to provide a comprehensive answer. If we only use one agent and give all the tools to that single agent, it may get overwhelmed and be unable to provide accurate answers.
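To make the division of labor concrete, the following minimal sketch (not part of the repository) queries the worker agents and the supervisor agent through their OpenAI-style `/v1/chat/completions` endpoints. It assumes the default ports (9095 RAG worker, 9096 SQL worker, 9090 supervisor) and the non-streaming `text` response field used by the validation examples later in this document; `host_ip` is a placeholder.

```python
# Sketch only: assumes the default ports and the non-streaming JSON response
# ("text" field) used by the validation examples in this README.
import requests

host_ip = "localhost"  # replace with your host IP


def ask_agent(port: int, message: str) -> str:
    url = f"http://{host_ip}:{port}/v1/chat/completions"
    resp = requests.post(url, json={"messages": message}, proxies={"http": ""})
    resp.raise_for_status()
    return resp.json()["text"]


# Worker agents each answer a narrow aspect of a question...
rag_answer = ask_agent(9095, "Tell me about Michael Jackson song Thriller")
sql_answer = ask_agent(9096, "How many employees are in the company")

# ...while the supervisor agent plans which workers to call and aggregates
# their outputs into one comprehensive answer.
print(ask_agent(9090, "How many albums does Iron Maiden have?"))
```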
|
||||
|
||||
## Deployment with docker
|
||||
## Deploy with docker
|
||||
|
||||
1. Build agent docker image [Optional]
|
||||
|
||||
@@ -217,13 +217,19 @@ docker build -t opea/agent:latest --build-arg https_proxy=$https_proxy --build-a
|
||||
:::
|
||||
::::
|
||||
|
||||
## Deploy AgentQnA UI
|
||||
|
||||
The AgentQnA UI can be deployed locally or using Docker.
|
||||
|
||||
For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
|
||||
|
||||
## Deploy using Helm Chart
|
||||
|
||||
Refer to the [AgentQnA helm chart](./kubernetes/helm/README.md) for instructions on deploying AgentQnA on Kubernetes.
|
||||
|
||||
## Validate services
|
||||
|
||||
First look at logs of the agent docker containers:
|
||||
1. First look at logs of the agent docker containers:
|
||||
|
||||
```
|
||||
# worker RAG agent
|
||||
@@ -240,35 +246,18 @@ docker logs react-agent-endpoint
|
||||
|
||||
You should see something like "HTTP server setup successful" if the docker containers are started successfully.
|
||||
|
||||
Second, validate worker RAG agent:
|
||||
2. You can use Python to validate the agent system:
|
||||
|
||||
```bash
|
||||
# RAG worker agent
|
||||
python tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
|
||||
|
||||
# SQL agent
|
||||
python tests/test.py --prompt "How many employees in company" --agent_role "worker" --ext_port 9096
|
||||
|
||||
# supervisor agent: this will test a two-turn conversation
|
||||
python tests/test.py --agent_role "supervisor" --ext_port 9090
|
||||
```
|
||||
curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
|
||||
"messages": "Michael Jackson song Thriller"
|
||||
}'
|
||||
```
|
||||
|
||||
Third, validate worker SQL agent:
|
||||
|
||||
```
|
||||
curl http://${host_ip}:9096/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
|
||||
"messages": "How many employees are in the company"
|
||||
}'
|
||||
```
|
||||
|
||||
Finally, validate supervisor agent:
|
||||
|
||||
```
|
||||
curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
|
||||
"messages": "How many albums does Iron Maiden have?"
|
||||
}'
|
||||
```
|
||||
|
||||
## Deploy AgentQnA UI
|
||||
|
||||
The AgentQnA UI can be deployed locally or using Docker.
|
||||
|
||||
For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
|
||||
|
||||
## How to register your own tools with agent
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ services:
|
||||
ports:
|
||||
- "${AGENTQNA_TGI_SERVICE_PORT-8085}:80"
|
||||
volumes:
|
||||
- /var/opea/agent-service/:/data
|
||||
- ${HF_CACHE_DIR:-/var/opea/agent-service/}:/data
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
|
||||
@@ -60,7 +60,7 @@ This example showcases a hierarchical multi-agent system for question-answering
|
||||
```
|
||||
6. Launch multi-agent system
|
||||
|
||||
The configurations of the supervisor agent and the worker agents are defined in the docker-compose yaml file. We currently use openAI GPT-4o-mini as LLM.
|
||||
The configurations of the supervisor agent and the worker agents are defined in the docker-compose yaml file. We currently use OpenAI GPT-4o-mini as LLM.
|
||||
|
||||
```
|
||||
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
|
||||
|
||||
@@ -13,6 +13,7 @@ services:
|
||||
environment:
|
||||
ip_address: ${ip_address}
|
||||
strategy: rag_agent
|
||||
with_memory: false
|
||||
recursion_limit: ${recursion_limit_worker}
|
||||
llm_engine: openai
|
||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
||||
@@ -35,17 +36,17 @@ services:
|
||||
image: opea/agent:latest
|
||||
container_name: sql-agent-endpoint
|
||||
volumes:
|
||||
- ${WORKDIR}/TAG-Bench/:/home/user/TAG-Bench # SQL database
|
||||
- ${WORKDIR}/GenAIExamples/AgentQnA/tests:/home/user/chinook-db # SQL database
|
||||
ports:
|
||||
- "9096:9096"
|
||||
ipc: host
|
||||
environment:
|
||||
ip_address: ${ip_address}
|
||||
strategy: sql_agent
|
||||
with_memory: false
|
||||
db_name: ${db_name}
|
||||
db_path: ${db_path}
|
||||
use_hints: false
|
||||
hints_file: /home/user/TAG-Bench/${db_name}_hints.csv
|
||||
recursion_limit: ${recursion_limit_worker}
|
||||
llm_engine: openai
|
||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
||||
@@ -64,6 +65,7 @@ services:
|
||||
container_name: react-agent-endpoint
|
||||
depends_on:
|
||||
- worker-rag-agent
|
||||
- worker-sql-agent
|
||||
volumes:
|
||||
- ${TOOLSET_PATH}:/home/user/tools/
|
||||
ports:
|
||||
@@ -71,14 +73,15 @@ services:
|
||||
ipc: host
|
||||
environment:
|
||||
ip_address: ${ip_address}
|
||||
strategy: react_langgraph
|
||||
strategy: react_llama
|
||||
with_memory: true
|
||||
recursion_limit: ${recursion_limit_supervisor}
|
||||
llm_engine: openai
|
||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
||||
model: ${model}
|
||||
temperature: ${temperature}
|
||||
max_new_tokens: ${max_new_tokens}
|
||||
stream: false
|
||||
stream: true
|
||||
tools: /home/user/tools/supervisor_agent_tools.yaml
|
||||
require_human_feedback: false
|
||||
no_proxy: ${no_proxy}
|
||||
|
||||
@@ -16,7 +16,7 @@ export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
|
||||
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
|
||||
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
|
||||
export CRAG_SERVER=http://${ip_address}:8080
|
||||
export db_name=california_schools
|
||||
export db_path="sqlite:////home/user/TAG-Bench/dev_folder/dev_databases/${db_name}/${db_name}.sqlite"
|
||||
export db_name=Chinook
|
||||
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
|
||||
|
||||
docker compose -f compose_openai.yaml up -d
|
||||
|
||||
@@ -13,6 +13,7 @@ services:
|
||||
environment:
|
||||
ip_address: ${ip_address}
|
||||
strategy: rag_agent_llama
|
||||
with_memory: false
|
||||
recursion_limit: ${recursion_limit_worker}
|
||||
llm_engine: vllm
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
@@ -43,6 +44,7 @@ services:
|
||||
environment:
|
||||
ip_address: ${ip_address}
|
||||
strategy: sql_agent_llama
|
||||
with_memory: false
|
||||
db_name: ${db_name}
|
||||
db_path: ${db_path}
|
||||
use_hints: false
|
||||
@@ -74,6 +76,7 @@ services:
|
||||
environment:
|
||||
ip_address: ${ip_address}
|
||||
strategy: react_llama
|
||||
with_memory: true
|
||||
recursion_limit: ${recursion_limit_supervisor}
|
||||
llm_engine: vllm
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
@@ -81,7 +84,7 @@ services:
|
||||
model: ${LLM_MODEL_ID}
|
||||
temperature: ${temperature}
|
||||
max_new_tokens: ${max_new_tokens}
|
||||
stream: false
|
||||
stream: true
|
||||
tools: /home/user/tools/supervisor_agent_tools.yaml
|
||||
require_human_feedback: false
|
||||
no_proxy: ${no_proxy}
|
||||
|
||||
@@ -14,7 +14,7 @@ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export HF_CACHE_DIR=${HF_CACHE_DIR}
|
||||
ls $HF_CACHE_DIR
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct"
|
||||
export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
|
||||
export NUM_SHARDS=4
|
||||
export LLM_ENDPOINT_URL="http://${ip_address}:8086"
|
||||
export temperature=0
|
||||
|
||||
@@ -43,7 +43,6 @@ function build_vllm_docker_image() {
|
||||
fi
|
||||
cd ./vllm-fork
|
||||
git checkout v0.6.4.post2+Gaudi-1.19.0
|
||||
sed -i 's/triton/triton==3.1.0/g' requirements-hpu.txt
|
||||
docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "opea/vllm-gaudi:ci failed"
|
||||
|
||||
@@ -9,7 +9,7 @@ echo "WORKDIR=${WORKDIR}"
|
||||
export ip_address=$(hostname -I | awk '{print $1}')
|
||||
export host_ip=${ip_address}
|
||||
|
||||
export HF_CACHE_DIR=$WORKDIR/hf_cache
|
||||
export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
|
||||
if [ ! -d "$HF_CACHE_DIR" ]; then
|
||||
echo "Creating HF_CACHE directory"
|
||||
mkdir -p "$HF_CACHE_DIR"
|
||||
|
||||
@@ -11,9 +11,9 @@ export ip_address=$(hostname -I | awk '{print $1}')
|
||||
export TOOLSET_PATH=$WORKPATH/tools/
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
|
||||
model="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
|
||||
|
||||
export HF_CACHE_DIR=/data2/huggingface
|
||||
export HF_CACHE_DIR=${model_cache:-"/data2/huggingface"}
|
||||
if [ ! -d "$HF_CACHE_DIR" ]; then
|
||||
HF_CACHE_DIR=$WORKDIR/hf_cache
|
||||
mkdir -p "$HF_CACHE_DIR"
|
||||
@@ -60,23 +60,6 @@ function start_vllm_service_70B() {
|
||||
echo "Service started successfully"
|
||||
}
|
||||
|
||||
|
||||
function prepare_data() {
|
||||
cd $WORKDIR
|
||||
|
||||
echo "Downloading data..."
|
||||
git clone https://github.com/TAG-Research/TAG-Bench.git
|
||||
cd TAG-Bench/setup
|
||||
chmod +x get_dbs.sh
|
||||
./get_dbs.sh
|
||||
|
||||
echo "Split data..."
|
||||
cd $WORKPATH/tests/sql_agent_test
|
||||
bash run_data_split.sh
|
||||
|
||||
echo "Data preparation done!"
|
||||
}
|
||||
|
||||
function download_chinook_data(){
|
||||
echo "Downloading chinook data..."
|
||||
cd $WORKDIR
|
||||
@@ -113,7 +96,7 @@ function validate_agent_service() {
|
||||
echo "======================Testing worker rag agent======================"
|
||||
export agent_port="9095"
|
||||
prompt="Tell me about Michael Jackson song Thriller"
|
||||
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
|
||||
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
|
||||
# echo $CONTENT
|
||||
local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint")
|
||||
echo $EXIT_CODE
|
||||
@@ -127,7 +110,7 @@ function validate_agent_service() {
|
||||
echo "======================Testing worker sql agent======================"
|
||||
export agent_port="9096"
|
||||
prompt="How many employees are there in the company?"
|
||||
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
|
||||
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
|
||||
local EXIT_CODE=$(validate "$CONTENT" "8" "sql-agent-endpoint")
|
||||
echo $CONTENT
|
||||
# echo $EXIT_CODE
|
||||
@@ -140,9 +123,8 @@ function validate_agent_service() {
|
||||
# test supervisor react agent
|
||||
echo "======================Testing supervisor react agent======================"
|
||||
export agent_port="9090"
|
||||
prompt="How many albums does Iron Maiden have?"
|
||||
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
|
||||
local EXIT_CODE=$(validate "$CONTENT" "21" "react-agent-endpoint")
|
||||
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port $agent_port --stream)
|
||||
local EXIT_CODE=$(validate "$CONTENT" "Iron" "react-agent-endpoint")
|
||||
# echo $CONTENT
|
||||
echo $EXIT_CODE
|
||||
local EXIT_CODE="${EXIT_CODE:0-1}"
|
||||
@@ -153,15 +135,6 @@ function validate_agent_service() {
|
||||
|
||||
}
|
||||
|
||||
function remove_data() {
|
||||
echo "Removing data..."
|
||||
cd $WORKDIR
|
||||
if [ -d "TAG-Bench" ]; then
|
||||
rm -rf TAG-Bench
|
||||
fi
|
||||
echo "Data removed!"
|
||||
}
|
||||
|
||||
function remove_chinook_data(){
|
||||
echo "Removing chinook data..."
|
||||
cd $WORKDIR
|
||||
@@ -189,8 +162,9 @@ function main() {
|
||||
echo "==================== Agent service validated ===================="
|
||||
}
|
||||
|
||||
remove_data
|
||||
|
||||
remove_chinook_data
|
||||
|
||||
main
|
||||
remove_data
|
||||
|
||||
remove_chinook_data
|
||||
@@ -11,7 +11,7 @@ export ip_address=$(hostname -I | awk '{print $1}')
|
||||
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
|
||||
export HF_CACHE_DIR=$WORKDIR/hf_cache
|
||||
export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
|
||||
if [ ! -d "$HF_CACHE_DIR" ]; then
|
||||
mkdir -p "$HF_CACHE_DIR"
|
||||
fi
|
||||
|
||||
@@ -1,34 +1,20 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import json
|
||||
import uuid
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def generate_answer_agent_api(url, prompt):
|
||||
proxies = {"http": ""}
|
||||
payload = {
|
||||
"messages": prompt,
|
||||
}
|
||||
response = requests.post(url, json=payload, proxies=proxies)
|
||||
answer = response.json()["text"]
|
||||
return answer
|
||||
|
||||
|
||||
def process_request(url, query, is_stream=False):
|
||||
proxies = {"http": ""}
|
||||
|
||||
payload = {
|
||||
"messages": query,
|
||||
}
|
||||
|
||||
content = json.dumps(query) if query is not None else None
|
||||
try:
|
||||
resp = requests.post(url=url, json=payload, proxies=proxies, stream=is_stream)
|
||||
resp = requests.post(url=url, data=content, proxies=proxies, stream=is_stream)
|
||||
if not is_stream:
|
||||
ret = resp.json()["text"]
|
||||
print(ret)
|
||||
else:
|
||||
for line in resp.iter_lines(decode_unicode=True):
|
||||
print(line)
|
||||
@@ -38,19 +24,54 @@ def process_request(url, query, is_stream=False):
|
||||
return ret
|
||||
except requests.exceptions.RequestException as e:
|
||||
ret = f"An error occurred:{e}"
|
||||
print(ret)
|
||||
return False
|
||||
return None
|
||||
|
||||
|
||||
def test_worker_agent(args):
|
||||
url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
|
||||
query = {"role": "user", "messages": args.prompt, "stream": "false"}
|
||||
ret = process_request(url, query)
|
||||
print("Response: ", ret)
|
||||
|
||||
|
||||
def add_message_and_run(url, user_message, thread_id, stream=False):
|
||||
print("User message: ", user_message)
|
||||
query = {"role": "user", "messages": user_message, "thread_id": thread_id, "stream": stream}
|
||||
ret = process_request(url, query, is_stream=stream)
|
||||
print("Response: ", ret)
|
||||
|
||||
|
||||
def test_chat_completion_multi_turn(args):
|
||||
url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
|
||||
thread_id = f"{uuid.uuid4()}"
|
||||
|
||||
# first turn
|
||||
print("===============First turn==================")
|
||||
user_message = "Which artist has the most albums in the database?"
|
||||
add_message_and_run(url, user_message, thread_id, stream=args.stream)
|
||||
print("===============End of first turn==================")
|
||||
|
||||
# second turn
|
||||
print("===============Second turn==================")
|
||||
user_message = "Give me a few examples of the artist's albums?"
|
||||
add_message_and_run(url, user_message, thread_id, stream=args.stream)
|
||||
print("===============End of second turn==================")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--prompt", type=str)
|
||||
parser.add_argument("--stream", action="store_true")
|
||||
args = parser.parse_args()
|
||||
parser.add_argument("--ip_addr", type=str, default="127.0.0.1", help="endpoint ip address")
|
||||
parser.add_argument("--ext_port", type=str, default="9090", help="endpoint port")
|
||||
parser.add_argument("--stream", action="store_true", help="streaming mode")
|
||||
parser.add_argument("--prompt", type=str, help="prompt message")
|
||||
parser.add_argument("--agent_role", type=str, default="supervisor", help="supervisor or worker")
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
ip_address = os.getenv("ip_address", "localhost")
|
||||
agent_port = os.getenv("agent_port", "9090")
|
||||
url = f"http://{ip_address}:{agent_port}/v1/chat/completions"
|
||||
prompt = args.prompt
|
||||
print(args)
|
||||
|
||||
process_request(url, prompt, args.stream)
|
||||
if args.agent_role == "supervisor":
|
||||
test_chat_completion_multi_turn(args)
|
||||
elif args.agent_role == "worker":
|
||||
test_worker_agent(args)
|
||||
else:
|
||||
raise ValueError("Invalid agent role")
|
||||
|
||||
@@ -78,7 +78,7 @@ bash step3_ingest_data_and_validate_retrieval.sh
|
||||
echo "=================== #3 Data ingestion and validation completed===================="
|
||||
|
||||
echo "=================== #4 Start agent and API server===================="
|
||||
bash step4_launch_and_validate_agent_tgi.sh
|
||||
bash step4_launch_and_validate_agent_gaudi.sh
|
||||
echo "=================== #4 Agent test passed ===================="
|
||||
|
||||
echo "=================== #5 Stop agent and API server===================="
|
||||
|
||||
@@ -18,7 +18,7 @@ Here're some of the project's features:
|
||||
2. cd command to the current folder.
|
||||
|
||||
```
|
||||
cd AgentQnA/ui
|
||||
cd AgentQnA/ui/svelte
|
||||
```
|
||||
|
||||
3. Modify the required .env variables.
|
||||
@@ -41,7 +41,7 @@ Here're some of the project's features:
|
||||
npm run dev
|
||||
```
|
||||
|
||||
- The application will be available at `http://localhost:3000`.
|
||||
- The application will be available at `http://localhost:5173`.
|
||||
|
||||
5. **For Docker Setup:**
|
||||
|
||||
@@ -54,7 +54,7 @@ Here're some of the project's features:
|
||||
- Run the Docker container:
|
||||
|
||||
```
|
||||
docker run -d -p 3000:3000 --name agent-ui opea:agent-ui
|
||||
docker run -d -p 5173:5173 --name agent-ui opea:agent-ui
|
||||
```
|
||||
|
||||
- The application will be available at `http://localhost:3000`.
|
||||
- The application will be available at `http://localhost:5173`.
|
||||
|
||||
@@ -108,7 +108,7 @@
|
||||
<!-- svelte-ignore a11y-click-events-have-key-events -->
|
||||
<div
|
||||
class="relative rounded-xl bg-white p-2 py-8 pl-16"
|
||||
on:click={() => handleCreate(feature)}
|
||||
on:click={() => handleCreate(feature.description)}
|
||||
>
|
||||
<dt class="text-base font-semibold text-gray-900">
|
||||
<div
|
||||
|
||||
@@ -1,48 +1,8 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Stage 1: base setup used by other stages
|
||||
FROM python:3.11-slim AS base
|
||||
|
||||
# get security updates
|
||||
RUN apt-get update && apt-get upgrade -y && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV HOME=/home/user
|
||||
|
||||
RUN useradd -m -s /bin/bash user && \
|
||||
mkdir -p $HOME && \
|
||||
chown -R user $HOME
|
||||
|
||||
WORKDIR $HOME
|
||||
|
||||
|
||||
# Stage 2: latest GenAIComps sources
|
||||
FROM base AS git
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends git
|
||||
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
|
||||
|
||||
|
||||
# Stage 3: common layer shared by services using GenAIComps
|
||||
FROM base AS comps-base
|
||||
|
||||
# copy just relevant parts
|
||||
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
|
||||
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
|
||||
|
||||
WORKDIR $HOME/GenAIComps
|
||||
RUN pip install --no-cache-dir --upgrade pip setuptools && \
|
||||
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
|
||||
WORKDIR $HOME
|
||||
|
||||
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
|
||||
|
||||
USER user
|
||||
|
||||
|
||||
# Stage 4: unique part
|
||||
FROM comps-base
|
||||
ARG BASE_TAG=latest
|
||||
FROM opea/comps-base:$BASE_TAG
|
||||
|
||||
COPY ./audioqna.py $HOME/audioqna.py
|
||||
|
||||
|
||||
@@ -1,48 +1,8 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Stage 1: base setup used by other stages
|
||||
FROM python:3.11-slim AS base
|
||||
|
||||
# get security updates
|
||||
RUN apt-get update && apt-get upgrade -y && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV HOME=/home/user
|
||||
|
||||
RUN useradd -m -s /bin/bash user && \
|
||||
mkdir -p $HOME && \
|
||||
chown -R user $HOME
|
||||
|
||||
WORKDIR $HOME
|
||||
|
||||
|
||||
# Stage 2: latest GenAIComps sources
|
||||
FROM base AS git
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends git
|
||||
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
|
||||
|
||||
|
||||
# Stage 3: common layer shared by services using GenAIComps
|
||||
FROM base AS comps-base
|
||||
|
||||
# copy just relevant parts
|
||||
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
|
||||
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
|
||||
|
||||
WORKDIR $HOME/GenAIComps
|
||||
RUN pip install --no-cache-dir --upgrade pip setuptools && \
|
||||
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
|
||||
WORKDIR $HOME
|
||||
|
||||
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
|
||||
|
||||
USER user
|
||||
|
||||
|
||||
# Stage 4: unique part
|
||||
FROM comps-base
|
||||
ARG BASE_TAG=latest
|
||||
FROM opea/comps-base:$BASE_TAG
|
||||
|
||||
COPY ./audioqna_multilang.py $HOME/audioqna_multilang.py
|
||||
|
||||
|
||||
@@ -16,13 +16,14 @@ SPEECHT5_SERVER_HOST_IP = os.getenv("SPEECHT5_SERVER_HOST_IP", "0.0.0.0")
|
||||
SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
|
||||
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
|
||||
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
|
||||
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
|
||||
|
||||
|
||||
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
|
||||
if self.services[cur_node].service_type == ServiceType.LLM:
|
||||
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
|
||||
next_inputs = {}
|
||||
next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
|
||||
next_inputs["model"] = LLM_MODEL_ID
|
||||
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
|
||||
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
|
||||
next_inputs["top_p"] = llm_parameters_dict["top_p"]
|
||||
|
||||
@@ -17,6 +17,7 @@ GPT_SOVITS_SERVER_HOST_IP = os.getenv("GPT_SOVITS_SERVER_HOST_IP", "0.0.0.0")
|
||||
GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
|
||||
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
|
||||
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
|
||||
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
|
||||
|
||||
|
||||
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
|
||||
@@ -24,7 +25,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
|
||||
if self.services[cur_node].service_type == ServiceType.LLM:
|
||||
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
|
||||
next_inputs = {}
|
||||
next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
|
||||
next_inputs["model"] = LLM_MODEL_ID
|
||||
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
|
||||
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
|
||||
next_inputs["top_p"] = llm_parameters_dict["top_p"]
|
||||
|
||||
@@ -69,6 +69,7 @@ services:
|
||||
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
|
||||
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
|
||||
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
|
||||
- LLM_MODEL_ID=${LLM_MODEL_ID}
|
||||
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
|
||||
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
|
||||
ipc: host
|
||||
|
||||
@@ -25,6 +25,9 @@ Intel Xeon optimized image hosted in huggingface repo will be used for TGI servi
|
||||
|
||||
```bash
|
||||
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
|
||||
|
||||
# multilang tts (optional)
|
||||
docker build -t opea/gpt-sovits:latest --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -f comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile .
|
||||
```
|
||||
|
||||
### 5. Build MegaService Docker Image
|
||||
@@ -42,6 +45,7 @@ Then run the command `docker images`, you will have following images ready:
|
||||
1. `opea/whisper:latest`
|
||||
2. `opea/speecht5:latest`
|
||||
3. `opea/audioqna:latest`
|
||||
4. `opea/gpt-sovits:latest` (optional)
|
||||
|
||||
## 🚀 Set the environment variables
|
||||
|
||||
@@ -57,9 +61,11 @@ export MEGA_SERVICE_HOST_IP=${host_ip}
|
||||
export WHISPER_SERVER_HOST_IP=${host_ip}
|
||||
export SPEECHT5_SERVER_HOST_IP=${host_ip}
|
||||
export LLM_SERVER_HOST_IP=${host_ip}
|
||||
export GPT_SOVITS_SERVER_HOST_IP=${host_ip}
|
||||
|
||||
export WHISPER_SERVER_PORT=7066
|
||||
export SPEECHT5_SERVER_PORT=7055
|
||||
export GPT_SOVITS_SERVER_PORT=9880
|
||||
export LLM_SERVER_PORT=3006
|
||||
|
||||
export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
|
||||
@@ -74,16 +80,20 @@ Note: Please replace with host_ip with your external IP address, do not use loca
|
||||
```bash
|
||||
cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
|
||||
docker compose up -d
|
||||
|
||||
# multilang tts (optional)
|
||||
docker compose -f compose_multilang.yaml up -d
|
||||
```
|
||||
|
||||
## 🚀 Test MicroServices
|
||||
|
||||
```bash
|
||||
# whisper service
|
||||
curl http://${host_ip}:7066/v1/asr \
|
||||
-X POST \
|
||||
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
|
||||
-H 'Content-Type: application/json'
|
||||
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
|
||||
curl http://${host_ip}:7066/v1/audio/transcriptions \
|
||||
-H "Content-Type: multipart/form-data" \
|
||||
-F file="@./sample.wav" \
|
||||
-F model="openai/whisper-small"
|
||||
|
||||
# tgi service
|
||||
curl http://${host_ip}:3006/generate \
|
||||
@@ -92,11 +102,10 @@ curl http://${host_ip}:3006/generate \
|
||||
-H 'Content-Type: application/json'
|
||||
|
||||
# speecht5 service
|
||||
curl http://${host_ip}:7055/v1/tts \
|
||||
-X POST \
|
||||
-d '{"text": "Who are you?"}' \
|
||||
-H 'Content-Type: application/json'
|
||||
curl http://${host_ip}:7055/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
|
||||
|
||||
# gpt-sovits service (optional)
|
||||
curl http://${host_ip}:9880/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
|
||||
```
|
||||
|
||||
## 🚀 Test MegaService
|
||||
@@ -106,7 +115,8 @@ base64 string to the megaservice endpoint. The megaservice will return a spoken
|
||||
to the response, decode the base64 string and save it as a .wav file.
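As a convenience, the same request can also be sent from Python and the reply decoded in one step. The sketch below is not part of the repository; it assumes the endpoint, port, and payload of the curl example that follows, and it treats the response body as the base64-encoded audio (adjust if your deployment wraps the audio in a JSON field).

```python
# Sketch only: send the AudioQnA request and decode the base64 reply into a .wav file.
import base64
import requests

host_ip = "localhost"  # replace with your external IP
url = f"http://{host_ip}:3008/v1/audioqna"
payload = {
    "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA",
    "max_tokens": 64,
    "voice": "default",  # "default" or "male" when using speecht5 as the TTS service
}

resp = requests.post(url, json=payload)
resp.raise_for_status()

# Assumption: the body is the base64 string itself (possibly quoted).
audio_b64 = resp.text.strip().strip('"')
with open("answer.wav", "wb") as f:
    f.write(base64.b64decode(audio_b64))
```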
|
||||
|
||||
```bash
|
||||
# voice can be "default" or "male"
|
||||
# if you are using speecht5 as the tts service, voice can be "default" or "male"
|
||||
# if you are using gpt-sovits for the tts service, you can set the reference audio following https://github.com/opea-project/GenAIComps/blob/main/comps/tts/src/integrations/dependency/gpt-sovits/README.md
|
||||
curl http://${host_ip}:3008/v1/audioqna \
|
||||
-X POST \
|
||||
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \
|
||||
|
||||
@@ -30,7 +30,7 @@ services:
|
||||
ports:
|
||||
- "3006:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -61,6 +61,7 @@ services:
|
||||
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
|
||||
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
|
||||
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
|
||||
- LLM_MODEL_ID=${LLM_MODEL_ID}
|
||||
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
|
||||
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
|
||||
ipc: host
|
||||
|
||||
@@ -31,7 +31,7 @@ services:
|
||||
ports:
|
||||
- "3006:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -58,7 +58,20 @@ services:
|
||||
- GPT_SOVITS_SERVER_PORT=${GPT_SOVITS_SERVER_PORT}
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
audioqna-xeon-ui-server:
|
||||
image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
|
||||
container_name: audioqna-xeon-ui-server
|
||||
depends_on:
|
||||
- audioqna-xeon-backend-server
|
||||
ports:
|
||||
- "5175:5173"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
|
||||
ipc: host
|
||||
restart: always
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
|
||||
@@ -40,7 +40,7 @@ services:
|
||||
ports:
|
||||
- "3006:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
@@ -82,6 +82,7 @@ services:
|
||||
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
|
||||
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
|
||||
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
|
||||
- LLM_MODEL_ID=${LLM_MODEL_ID}
|
||||
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
|
||||
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
|
||||
ipc: host
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
|
||||
@@ -32,7 +32,7 @@ COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
|
||||
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
|
||||
|
||||
WORKDIR $HOME/GenAIComps
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
RUN pip install --no-cache-dir --upgrade pip setuptools && \
|
||||
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
|
||||
WORKDIR $HOME
|
||||
|
||||
|
||||
209
AvatarChatbot/docker_compose/amd/gpu/rocm/README.md
Normal file
@@ -0,0 +1,209 @@
|
||||
# Build Mega Service of AvatarChatbot on AMD GPU
|
||||
|
||||
This document outlines the deployment process for an AvatarChatbot application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an AMD GPU (ROCm) server.
|
||||
|
||||
## 🚀 Build Docker images
|
||||
|
||||
### 1. Source Code install GenAIComps
|
||||
|
||||
```bash
|
||||
git clone https://github.com/opea-project/GenAIComps.git
|
||||
cd GenAIComps
|
||||
```
|
||||
|
||||
### 2. Build ASR Image
|
||||
|
||||
```bash
|
||||
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
|
||||
|
||||
|
||||
docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/Dockerfile .
|
||||
```
|
||||
|
||||
### 3. Build LLM Image
|
||||
|
||||
```bash
|
||||
docker build --no-cache -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
|
||||
```
|
||||
|
||||
### 4. Build TTS Image
|
||||
|
||||
```bash
|
||||
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
|
||||
|
||||
docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/Dockerfile .
|
||||
```
|
||||
|
||||
### 5. Build Animation Image
|
||||
|
||||
```bash
|
||||
docker build -t opea/wav2lip:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/wav2lip/src/Dockerfile .
|
||||
|
||||
docker build -t opea/animation:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/animation/src/Dockerfile .
|
||||
```
|
||||
|
||||
### 6. Build MegaService Docker Image
|
||||
|
||||
To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/opea-project/GenAIExamples.git
|
||||
cd GenAIExamples/AvatarChatbot/
|
||||
docker build --no-cache -t opea/avatarchatbot:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
|
||||
```
|
||||
|
||||
Then run the command `docker images`; you should have the following images ready:
|
||||
|
||||
1. `opea/whisper:latest`
|
||||
2. `opea/asr:latest`
|
||||
3. `opea/llm-tgi:latest`
|
||||
4. `opea/speecht5:latest`
|
||||
5. `opea/tts:latest`
|
||||
6. `opea/wav2lip:latest`
|
||||
7. `opea/animation:latest`
|
||||
8. `opea/avatarchatbot:latest`
|
||||
|
||||
## 🚀 Set the environment variables
|
||||
|
||||
Before starting the services with `docker compose`, make sure the following environment variables are set correctly.
|
||||
|
||||
```bash
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export host_ip=$(hostname -I | awk '{print $1}')
|
||||
|
||||
export TGI_SERVICE_PORT=3006
|
||||
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
|
||||
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
|
||||
|
||||
export ASR_ENDPOINT=http://${host_ip}:7066
|
||||
export TTS_ENDPOINT=http://${host_ip}:7055
|
||||
export WAV2LIP_ENDPOINT=http://${host_ip}:7860
|
||||
|
||||
export MEGA_SERVICE_HOST_IP=${host_ip}
|
||||
export ASR_SERVICE_HOST_IP=${host_ip}
|
||||
export TTS_SERVICE_HOST_IP=${host_ip}
|
||||
export LLM_SERVICE_HOST_IP=${host_ip}
|
||||
export ANIMATION_SERVICE_HOST_IP=${host_ip}
|
||||
|
||||
export MEGA_SERVICE_PORT=8888
|
||||
export ASR_SERVICE_PORT=3001
|
||||
export TTS_SERVICE_PORT=3002
|
||||
export LLM_SERVICE_PORT=3007
|
||||
export ANIMATION_SERVICE_PORT=3008
|
||||
|
||||
export DEVICE="cpu"
|
||||
export WAV2LIP_PORT=7860
|
||||
export INFERENCE_MODE='wav2lip+gfpgan'
|
||||
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
|
||||
export FACE="assets/img/avatar5.png"
|
||||
# export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional, will use base64str in the post request as input if is 'None'
|
||||
export AUDIO='None'
|
||||
export FACESIZE=96
|
||||
export OUTFILE="/outputs/result.mp4"
|
||||
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
|
||||
export UPSCALE_FACTOR=1
|
||||
export FPS=10
|
||||
```
|
||||
|
||||
Warning: In this solution, the Wav2Lip service runs only on the CPU. To use AMD GPUs and achieve full performance, the Wav2Lip image needs to be modified to adapt to AMD hardware and the ROCm framework.
|
||||
|
||||
## 🚀 Start the MegaService
|
||||
|
||||
```bash
|
||||
cd GenAIExamples/AvatarChatbot/docker_compose/intel/cpu/xeon/
|
||||
docker compose -f compose.yaml up -d
|
||||
```
|
||||
|
||||
## 🚀 Test MicroServices
|
||||
|
||||
```bash
|
||||
# whisper service
|
||||
curl http://${host_ip}:7066/v1/asr \
|
||||
-X POST \
|
||||
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
|
||||
-H 'Content-Type: application/json'
|
||||
|
||||
# asr microservice
|
||||
curl http://${host_ip}:3001/v1/audio/transcriptions \
|
||||
-X POST \
|
||||
-d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
|
||||
-H 'Content-Type: application/json'
|
||||
|
||||
# tgi service
|
||||
curl http://${host_ip}:3006/generate \
|
||||
-X POST \
|
||||
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
|
||||
-H 'Content-Type: application/json'
|
||||
|
||||
# llm microservice
|
||||
curl http://${host_ip}:3007/v1/chat/completions\
|
||||
-X POST \
|
||||
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
|
||||
-H 'Content-Type: application/json'
|
||||
|
||||
# speecht5 service
|
||||
curl http://${host_ip}:7055/v1/tts \
|
||||
-X POST \
|
||||
-d '{"text": "Who are you?"}' \
|
||||
-H 'Content-Type: application/json'
|
||||
|
||||
# tts microservice
|
||||
curl http://${host_ip}:3002/v1/audio/speech \
|
||||
-X POST \
|
||||
-d '{"text": "Who are you?"}' \
|
||||
-H 'Content-Type: application/json'
|
||||
|
||||
# wav2lip service
|
||||
cd ../../../..
|
||||
curl http://${host_ip}:7860/v1/wav2lip \
|
||||
-X POST \
|
||||
-d @assets/audio/sample_minecraft.json \
|
||||
-H 'Content-Type: application/json'
|
||||
|
||||
# animation microservice
|
||||
curl http://${host_ip}:3008/v1/animation \
|
||||
-X POST \
|
||||
-d @assets/audio/sample_question.json \
|
||||
-H "Content-Type: application/json"
|
||||
|
||||
```
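The base64 strings in the commands above are only a tiny placeholder clip. To test with your own audio, a short Python sketch (not part of the repository; `sample.wav` is an example file name) can build the same payloads from a local .wav file:

```python
# Sketch only: build the base64 payloads used by the whisper and ASR requests above.
import base64
import requests

host_ip = "localhost"  # replace with your host IP

with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

# whisper service expects {"audio": <base64>}
whisper = requests.post(f"http://{host_ip}:7066/v1/asr", json={"audio": audio_b64})
print(whisper.json())

# asr microservice expects {"byte_str": <base64>}
asr = requests.post(
    f"http://{host_ip}:3001/v1/audio/transcriptions", json={"byte_str": audio_b64}
)
print(asr.json())
```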
|
||||
|
||||
## 🚀 Test MegaService
|
||||
|
||||
```bash
|
||||
curl http://${host_ip}:3009/v1/avatarchatbot \
|
||||
-X POST \
|
||||
-d @assets/audio/sample_whoareyou.json \
|
||||
-H 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
If the megaservice is running properly, you should see the following output:
|
||||
|
||||
```bash
|
||||
"/outputs/result.mp4"
|
||||
```
|
||||
|
||||
The output file will be saved in the current working directory, as `${PWD}` is mapped to `/outputs` inside the wav2lip-service Docker container.
|
||||
|
||||
## Gradio UI
|
||||
|
||||
```bash
|
||||
cd $WORKPATH/GenAIExamples/AvatarChatbot
|
||||
python3 ui/gradio/app_gradio_demo_avatarchatbot.py
|
||||
```
|
||||
|
||||
The UI can be viewed at http://${host_ip}:7861
|
||||
<img src="../../../../assets/img/UI.png" alt="UI Example" width="60%">
|
||||
In the current version (v1.0), you need to set the avatar figure image/video and the DL model choice in environment variables before starting the AvatarChatbot backend service and running the UI; in the UI itself, only the audio question can be customized.
|
||||
\*\* Changing the avatar figure between runs will be enabled in v2.0
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
```bash
|
||||
cd GenAIExamples/AvatarChatbot/tests
|
||||
export IMAGE_REPO="opea"
|
||||
export IMAGE_TAG="latest"
|
||||
export HUGGINGFACEHUB_API_TOKEN=<your_hf_token>
|
||||
|
||||
test_avatarchatbot_on_xeon.sh
|
||||
```
|
||||
158
AvatarChatbot/docker_compose/amd/gpu/rocm/compose.yaml
Normal file
@@ -0,0 +1,158 @@
|
||||
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
whisper-service:
|
||||
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
|
||||
container_name: whisper-service
|
||||
ports:
|
||||
- "7066:7066"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
restart: unless-stopped
|
||||
asr:
|
||||
image: ${REGISTRY:-opea}/asr:${TAG:-latest}
|
||||
container_name: asr-service
|
||||
ports:
|
||||
- "3001:9099"
|
||||
ipc: host
|
||||
environment:
|
||||
ASR_ENDPOINT: ${ASR_ENDPOINT}
|
||||
speecht5-service:
|
||||
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
|
||||
container_name: speecht5-service
|
||||
ports:
|
||||
- "7055:7055"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
restart: unless-stopped
|
||||
tts:
|
||||
image: ${REGISTRY:-opea}/tts:${TAG:-latest}
|
||||
container_name: tts-service
|
||||
ports:
|
||||
- "3002:9088"
|
||||
ipc: host
|
||||
environment:
|
||||
TTS_ENDPOINT: ${TTS_ENDPOINT}
|
||||
tgi-service:
|
||||
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
|
||||
container_name: tgi-service
|
||||
ports:
|
||||
- "${TGI_SERVICE_PORT:-3006}:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
shm_size: 1g
|
||||
devices:
|
||||
- /dev/kfd:/dev/kfd
|
||||
- /dev/dri/:/dev/dri/
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
group_add:
|
||||
- video
|
||||
security_opt:
|
||||
- seccomp:unconfined
|
||||
ipc: host
|
||||
command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192
|
||||
llm:
|
||||
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
|
||||
container_name: llm-tgi-server
|
||||
depends_on:
|
||||
- tgi-service
|
||||
ports:
|
||||
- "3007:9000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
|
||||
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
||||
restart: unless-stopped
|
||||
wav2lip-service:
|
||||
image: ${REGISTRY:-opea}/wav2lip:${TAG:-latest}
|
||||
container_name: wav2lip-service
|
||||
ports:
|
||||
- "7860:7860"
|
||||
ipc: host
|
||||
volumes:
|
||||
- ${PWD}:/outputs
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
DEVICE: ${DEVICE}
|
||||
INFERENCE_MODE: ${INFERENCE_MODE}
|
||||
CHECKPOINT_PATH: ${CHECKPOINT_PATH}
|
||||
FACE: ${FACE}
|
||||
AUDIO: ${AUDIO}
|
||||
FACESIZE: ${FACESIZE}
|
||||
OUTFILE: ${OUTFILE}
|
||||
GFPGAN_MODEL_VERSION: ${GFPGAN_MODEL_VERSION}
|
||||
UPSCALE_FACTOR: ${UPSCALE_FACTOR}
|
||||
FPS: ${FPS}
|
||||
WAV2LIP_PORT: ${WAV2LIP_PORT}
|
||||
restart: unless-stopped
|
||||
animation:
|
||||
image: ${REGISTRY:-opea}/animation:${TAG:-latest}
|
||||
container_name: animation-server
|
||||
ports:
|
||||
- "3008:9066"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
WAV2LIP_ENDPOINT: ${WAV2LIP_ENDPOINT}
|
||||
restart: unless-stopped
|
||||
avatarchatbot-backend-server:
|
||||
image: ${REGISTRY:-opea}/avatarchatbot:${TAG:-latest}
|
||||
container_name: avatarchatbot-backend-server
|
||||
depends_on:
|
||||
- asr
|
||||
- llm
|
||||
- tts
|
||||
- animation
|
||||
ports:
|
||||
- "3009:8888"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP}
|
||||
MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT}
|
||||
ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
|
||||
ASR_SERVICE_PORT: ${ASR_SERVICE_PORT}
|
||||
LLM_SERVICE_HOST_IP: ${LLM_SERVICE_HOST_IP}
|
||||
LLM_SERVICE_PORT: ${LLM_SERVICE_PORT}
|
||||
LLM_SERVER_HOST_IP: ${LLM_SERVICE_HOST_IP}
|
||||
LLM_SERVER_PORT: ${LLM_SERVICE_PORT}
|
||||
TTS_SERVICE_HOST_IP: ${TTS_SERVICE_HOST_IP}
|
||||
TTS_SERVICE_PORT: ${TTS_SERVICE_PORT}
|
||||
ANIMATION_SERVICE_HOST_IP: ${ANIMATION_SERVICE_HOST_IP}
|
||||
ANIMATION_SERVICE_PORT: ${ANIMATION_SERVICE_PORT}
|
||||
WHISPER_SERVER_HOST_IP: ${WHISPER_SERVER_HOST_IP}
|
||||
WHISPER_SERVER_PORT: ${WHISPER_SERVER_PORT}
|
||||
SPEECHT5_SERVER_HOST_IP: ${SPEECHT5_SERVER_HOST_IP}
|
||||
SPEECHT5_SERVER_PORT: ${SPEECHT5_SERVER_PORT}
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
47
AvatarChatbot/docker_compose/amd/gpu/rocm/set_env.sh
Normal file
@@ -0,0 +1,47 @@
#!/usr/bin/env bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export OPENAI_API_KEY=${OPENAI_API_KEY}
export host_ip=$(hostname -I | awk '{print $1}')

export TGI_SERVICE_PORT=3006
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"

export ASR_ENDPOINT=http://${host_ip}:7066
export TTS_ENDPOINT=http://${host_ip}:7055
export WAV2LIP_ENDPOINT=http://${host_ip}:7860

export WHISPER_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066

export SPEECHT5_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_PORT=7055

export MEGA_SERVICE_HOST_IP=${host_ip}
export ASR_SERVICE_HOST_IP=${host_ip}
export TTS_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export ANIMATION_SERVICE_HOST_IP=${host_ip}

export MEGA_SERVICE_PORT=8888
export ASR_SERVICE_PORT=3001
export TTS_SERVICE_PORT=3002
export LLM_SERVICE_PORT=3007
export ANIMATION_SERVICE_PORT=3008

export DEVICE="cpu"
export WAV2LIP_PORT=7860
export INFERENCE_MODE='wav2lip+gfpgan'
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
export FACE="/home/user/comps/animation/src/assets/img/avatar5.png"
# export AUDIO='assets/audio/eg3_ref.wav' # the audio file path is optional; when AUDIO is 'None', the base64 string in the POST request is used as input
export AUDIO='None'
export FACESIZE=96
export OUTFILE="/outputs/result.mp4"
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
export UPSCALE_FACTOR=1
export FPS=10
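For readers skimming the diff, a minimal usage sketch of this new environment file (assuming the repository is already cloned on the ROCm host and the compose file added above is used):

```bash
cd GenAIExamples/AvatarChatbot/docker_compose/amd/gpu/rocm
source ./set_env.sh
docker compose up -d
```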
170
AvatarChatbot/tests/test_compose_on_rocm.sh
Normal file
@@ -0,0 +1,170 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -e
|
||||
IMAGE_REPO=${IMAGE_REPO:-"opea"}
|
||||
IMAGE_TAG=${IMAGE_TAG:-"latest"}
|
||||
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
if ls $LOG_PATH/*.log 1> /dev/null 2>&1; then
|
||||
rm $LOG_PATH/*.log
|
||||
echo "Log files removed."
|
||||
else
|
||||
echo "No log files to remove."
|
||||
fi
|
||||
ip_address=$(hostname -I | awk '{print $1}')
|
||||
|
||||
|
||||
function build_docker_images() {
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="avatarchatbot whisper asr llm-textgen speecht5 tts wav2lip animation"
|
||||
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
|
||||
|
||||
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
|
||||
|
||||
docker images && sleep 3s
|
||||
}
|
||||
|
||||
|
||||
function start_services() {
|
||||
cd $WORKPATH/docker_compose/amd/gpu/rocm
|
||||
|
||||
export HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN
|
||||
export OPENAI_API_KEY=$OPENAI_API_KEY
|
||||
export host_ip=${ip_address}
|
||||
|
||||
export TGI_SERVICE_PORT=3006
|
||||
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
|
||||
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
|
||||
|
||||
export ASR_ENDPOINT=http://${host_ip}:7066
|
||||
export TTS_ENDPOINT=http://${host_ip}:7055
|
||||
export WAV2LIP_ENDPOINT=http://${host_ip}:7860
|
||||
|
||||
export MEGA_SERVICE_HOST_IP=${host_ip}
|
||||
export ASR_SERVICE_HOST_IP=${host_ip}
|
||||
export TTS_SERVICE_HOST_IP=${host_ip}
|
||||
export LLM_SERVICE_HOST_IP=${host_ip}
|
||||
export ANIMATION_SERVICE_HOST_IP=${host_ip}
|
||||
export WHISPER_SERVER_HOST_IP=${host_ip}
|
||||
export WHISPER_SERVER_PORT=7066
|
||||
|
||||
export SPEECHT5_SERVER_HOST_IP=${host_ip}
|
||||
export SPEECHT5_SERVER_PORT=7055
|
||||
|
||||
export MEGA_SERVICE_PORT=8888
|
||||
export ASR_SERVICE_PORT=3001
|
||||
export TTS_SERVICE_PORT=3002
|
||||
export LLM_SERVICE_PORT=3007
|
||||
export ANIMATION_SERVICE_PORT=3008
|
||||
|
||||
export DEVICE="cpu"
|
||||
export WAV2LIP_PORT=7860
|
||||
export INFERENCE_MODE='wav2lip+gfpgan'
|
||||
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
|
||||
export FACE="/home/user/comps/animation/src/assets/img/avatar5.png"
|
||||
# export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional, will use base64str in the post request as input if is 'None'
|
||||
export AUDIO='None'
|
||||
export FACESIZE=96
|
||||
export OUTFILE="./outputs/result.mp4"
|
||||
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
|
||||
export UPSCALE_FACTOR=1
|
||||
export FPS=5
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose up -d --force-recreate
|
||||
|
||||
echo "Check tgi-service status"
|
||||
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]]; do
|
||||
docker logs tgi-service > $LOG_PATH/tgi_service_start.log
|
||||
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
|
||||
break
|
||||
fi
|
||||
sleep 5s
|
||||
n=$((n+1))
|
||||
done
|
||||
echo "tgi-service are up and running"
|
||||
sleep 5s
|
||||
|
||||
echo "Check wav2lip-service status"
|
||||
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]]; do
|
||||
docker logs wav2lip-service >& $LOG_PATH/wav2lip-service_start.log
|
||||
if grep -q "Application startup complete" $LOG_PATH/wav2lip-service_start.log; then
|
||||
break
|
||||
fi
|
||||
sleep 5s
|
||||
n=$((n+1))
|
||||
done
|
||||
echo "wav2lip-service are up and running"
|
||||
sleep 5s
|
||||
}
|
||||
|
||||
|
||||
function validate_megaservice() {
|
||||
cd $WORKPATH
|
||||
ls
|
||||
result=$(http_proxy="" curl http://${ip_address}:3009/v1/avatarchatbot -X POST -d @assets/audio/sample_whoareyou.json -H 'Content-Type: application/json')
|
||||
echo "result is === $result"
|
||||
if [[ $result == *"mp4"* ]]; then
|
||||
echo "Result correct."
|
||||
else
|
||||
docker logs whisper-service > $LOG_PATH/whisper-service.log
|
||||
docker logs asr-service > $LOG_PATH/asr-service.log
|
||||
docker logs speecht5-service > $LOG_PATH/speecht5-service.log
|
||||
docker logs tts-service > $LOG_PATH/tts-service.log
|
||||
docker logs tgi-service > $LOG_PATH/tgi-service.log
|
||||
docker logs llm-tgi-server > $LOG_PATH/llm-tgi-server.log
|
||||
docker logs wav2lip-service > $LOG_PATH/wav2lip-service.log
|
||||
docker logs animation-server > $LOG_PATH/animation-server.log
|
||||
|
||||
echo "Result wrong."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
|
||||
#function validate_frontend() {
|
||||
|
||||
#}
|
||||
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/amd/gpu/rocm
|
||||
docker compose down && docker compose rm -f
|
||||
}
|
||||
|
||||
|
||||
function main() {
|
||||
|
||||
echo $OPENAI_API_KEY
|
||||
echo $OPENAI_KEY
|
||||
|
||||
stop_docker
|
||||
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
|
||||
start_services
|
||||
# validate_microservices
|
||||
sleep 30
|
||||
validate_megaservice
|
||||
# validate_frontend
|
||||
stop_docker
|
||||
|
||||
echo y | docker system prune
|
||||
|
||||
}
|
||||
|
||||
|
||||
main
|
||||
@@ -68,13 +68,16 @@ To set up environment variables for deploying ChatQnA services, follow these steps:

```bash
# on Gaudi
source ./docker_compose/intel/hpu/gaudi/set_env.sh
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
# on Xeon
source ./docker_compose/intel/cpu/xeon/set_env.sh
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
# on Nvidia GPU
source ./docker_compose/nvidia/gpu/set_env.sh
cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-ui-server,chatqna-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service
```
@@ -91,6 +94,14 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose up -d
```

To enable Open Telemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
CPU example with the Open Telemetry feature:

```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```

It will automatically download the docker image from `docker hub`:

```bash
@@ -232,6 +243,13 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
docker compose up -d
```

To enable Open Telemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.

```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```

Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) to build docker images from source.

### Deploy ChatQnA on Xeon
@@ -243,6 +261,13 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose up -d
```

To enable Open Telemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.

```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```

Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for more instructions on building docker images from source.

### Deploy ChatQnA on NVIDIA GPU
@@ -346,7 +371,7 @@ OPEA microservice deployment can easily be monitored through Grafana dashboards

## Tracing Services with OpenTelemetry Tracing and Jaeger

> NOTE: limited support. Only LLM inference serving with TGI on Gaudi is enabled for this feature.
> NOTE: This feature is disabled by default. Please check the Deploy ChatQnA sections for how to enable this feature with the compose.telemetry.yaml file.

OPEA microservice and TGI/TEI serving can easily be traced through Jaeger dashboards in conjunction with the OpenTelemetry Tracing feature. Follow the [README](https://github.com/opea-project/GenAIComps/tree/main/comps/cores/telemetry#tracing) to trace additional functions if needed.

@@ -357,8 +382,17 @@ Users could also get the external IP via below command.
ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+'
```

Access the Jaeger dashboard UI at http://{EXTERNAL_IP}:16686

For TGI serving on Gaudi, users can see the different services, such as opea, TEI, and TGI.


Here is a screenshot of one trace of a TGI serving request.


There are also OPEA-related tracings. Users can understand the time breakdown of each service request by looking into each opea:schedule operation.


There may be async functions such as `llm/MicroService_asyn_generate`, and users need to check the trace of the async function in another operation, such as
opea:llm_generate_stream.

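As a quick check that spans are actually being received (a sketch, assuming the Jaeger all-in-one container from compose.telemetry.yaml is running with its default query port exposed):

```bash
# List the services that have reported traces to Jaeger
curl http://{EXTERNAL_IP}:16686/api/services
```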
112
ChatQnA/benchmark_chatqna.yaml
Normal file
@@ -0,0 +1,112 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
deploy:
|
||||
device: gaudi
|
||||
version: 1.2.0
|
||||
modelUseHostPath: /mnt/models
|
||||
HUGGINGFACEHUB_API_TOKEN: "" # mandatory
|
||||
node: [1, 2, 4, 8]
|
||||
namespace: ""
|
||||
timeout: 1000 # timeout in seconds for services to be ready, default 30 minutes
|
||||
interval: 5 # interval in seconds between service ready checks, default 5 seconds
|
||||
|
||||
services:
|
||||
backend:
|
||||
resources:
|
||||
enabled: False
|
||||
cores_per_instance: "16"
|
||||
memory_capacity: "8000Mi"
|
||||
replicaCount: [1, 2, 4, 8]
|
||||
|
||||
teirerank:
|
||||
enabled: True
|
||||
model_id: ""
|
||||
resources:
|
||||
enabled: False
|
||||
cards_per_instance: 1
|
||||
replicaCount: [1, 1, 1, 1]
|
||||
|
||||
tei:
|
||||
model_id: ""
|
||||
resources:
|
||||
enabled: False
|
||||
cores_per_instance: "80"
|
||||
memory_capacity: "20000Mi"
|
||||
replicaCount: [1, 2, 4, 8]
|
||||
|
||||
llm:
|
||||
engine: vllm # or tgi
|
||||
model_id: "meta-llama/Meta-Llama-3-8B-Instruct" # mandatory
|
||||
replicaCount:
|
||||
with_teirerank: [7, 15, 31, 63] # When teirerank.enabled is True
|
||||
without_teirerank: [8, 16, 32, 64] # When teirerank.enabled is False
|
||||
resources:
|
||||
enabled: False
|
||||
cards_per_instance: 1
|
||||
model_params:
|
||||
vllm: # VLLM specific parameters
|
||||
batch_params:
|
||||
enabled: True
|
||||
max_num_seqs: [1, 2, 4, 8] # Each value triggers an LLM service upgrade
|
||||
token_params:
|
||||
enabled: False
|
||||
max_input_length: ""
|
||||
max_total_tokens: ""
|
||||
max_batch_total_tokens: ""
|
||||
max_batch_prefill_tokens: ""
|
||||
tgi: # TGI specific parameters
|
||||
batch_params:
|
||||
enabled: True
|
||||
max_batch_size: [1, 2, 4, 8] # Each value triggers an LLM service upgrade
|
||||
token_params:
|
||||
enabled: False
|
||||
max_input_length: "1280"
|
||||
max_total_tokens: "2048"
|
||||
max_batch_total_tokens: "65536"
|
||||
max_batch_prefill_tokens: "4096"
|
||||
|
||||
data-prep:
|
||||
resources:
|
||||
enabled: False
|
||||
cores_per_instance: ""
|
||||
memory_capacity: ""
|
||||
replicaCount: [1, 1, 1, 1]
|
||||
|
||||
retriever-usvc:
|
||||
resources:
|
||||
enabled: False
|
||||
cores_per_instance: "8"
|
||||
memory_capacity: "8000Mi"
|
||||
replicaCount: [1, 2, 4, 8]
|
||||
|
||||
redis-vector-db:
|
||||
resources:
|
||||
enabled: False
|
||||
cores_per_instance: ""
|
||||
memory_capacity: ""
|
||||
replicaCount: [1, 1, 1, 1]
|
||||
|
||||
chatqna-ui:
|
||||
replicaCount: [1, 1, 1, 1]
|
||||
|
||||
nginx:
|
||||
replicaCount: [1, 1, 1, 1]
|
||||
|
||||
benchmark:
|
||||
# http request behavior related fields
|
||||
user_queries: [640]
|
||||
concurrency: [128]
|
||||
load_shape_type: "constant" # "constant" or "poisson"
|
||||
poisson_arrival_rate: 1.0 # only used when load_shape_type is "poisson"
|
||||
warmup_iterations: 10
|
||||
seed: 1024
|
||||
|
||||
# workload, all of the test cases will run for benchmark
|
||||
bench_target: [chatqnafixed, chatqna_qlist_pubmed] # specify the bench_target for benchmark
|
||||
dataset: ["/home/sdp/upload_file.txt", "/home/sdp/pubmed_10000.txt"] # specify the absolute path to the dataset file
|
||||
prompt: [10, 1000] # set the prompt length for the chatqna_qlist_pubmed workload, set to 10 for chatqnafixed workload
|
||||
|
||||
llm:
|
||||
# specify the llm output token size
|
||||
max_token_size: [128, 256]
|
||||
@@ -167,7 +167,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di


    def align_generator(self, gen, **kwargs):
        # openai reaponse format
        # OpenAI response format
        # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
        for line in gen:
            line = line.decode("utf-8")

@@ -1,4 +1,4 @@
# Build and deploy CodeGen Application on AMD GPU (ROCm)
# Build and deploy ChatQnA Application on AMD GPU (ROCm)

## Build MegaService of ChatQnA on AMD ROCm GPU


@@ -30,7 +30,7 @@ services:
    ports:
      - "${CHATQNA_TEI_EMBEDDING_PORT}:80"
    volumes:
      - "/var/opea/chatqna-service/data:/data"
      - "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
    shm_size: 1g
    ipc: host
    environment:
@@ -72,7 +72,7 @@ services:
    ports:
      - "${CHATQNA_TEI_RERANKING_PORT}:80"
    volumes:
      - "/var/opea/chatqna-service/data:/data"
      - "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
@@ -104,7 +104,7 @@ services:
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
    volumes:
      - "/var/opea/chatqna-service/data:/data"
      - "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
    shm_size: 1g
    devices:
      - /dev/kfd:/dev/kfd
@@ -2,6 +2,84 @@

This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on AIPC. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.

## Quick Start:

1. Set up the environment variables.
2. Run Docker Compose.
3. Consume the ChatQnA Service.

### Quick Start: 1. Set up Environment Variables

To set up environment variables for deploying ChatQnA services, follow these steps:

```bash
mkdir -p ~/OPEA
cd ~/OPEA
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/aipc
```

1. Set the required environment variables:

```bash
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```

2. If you are in a proxy environment, also set the proxy-related environment variables:

```bash
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy=$no_proxy,chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service,ollama-service
```

3. Set up the other environment variables:

By default, llama3.2 is used for LLM serving; the default model can be changed to other LLM models. Please pick a [validated llm model](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
To change the default model defined in set_env.sh, overwrite it by exporting OLLAMA_MODEL to the new model or by modifying set_env.sh.
For example, run the following

```bash
export OLLAMA_MODEL="deepseek-r1:8b"
```

to use the [DeepSeek-R1-Distill-Llama-8B model](https://ollama.com/library/deepseek-r1:8b), then source the environment file:

```bash
source ./set_env.sh
```

### Quick Start: 2. Run Docker Compose

```bash
docker compose up -d
```

It will take several minutes to automatically download the docker images.
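While the images are being pulled, the following standard Docker commands (not specific to this example) are a convenient way to follow progress and confirm that all containers come up:

```bash
docker compose logs -f   # follow image pull and service startup logs
docker ps                # verify the ChatQnA containers are all running
```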
NB: You should build the docker images from source yourself if:

- You are developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
- You can't download the docker image.
- You want to use a specific version of the Docker image.

Please refer to ['Build Docker Images'](#🚀-build-docker-images) below.

### Quick Start: 3. Consume the ChatQnA Service

Once the services are up, open the following URL from your browser: http://{host_ip}:80.
Enter a prompt such as "What is deep learning?".

Or, if you prefer to try it only on the localhost machine, run:

```bash
curl http://${host_ip}:8888/v1/chatqna \
    -H "Content-Type: application/json" \
    -d '{
        "messages": "What is deep learning?"
    }'
```

## 🚀 Build Docker Images

First of all, you need to build the Docker images locally and install the associated Python package.
@@ -82,18 +160,18 @@ export host_ip="External_Public_IP"

For Linux users, please run `hostname -I | awk '{print $1}'`. For Windows users, please run `ipconfig | findstr /i "IPv4"` to get the external public IP.

**Export the value of your Huggingface API token to the `your_hf_api_token` environment variable**
**Export the value of your Huggingface API token to the `HUGGINGFACEHUB_API_TOKEN` environment variable**

> Replace Your_Huggingface_API_Token below with your actual Huggingface API token value

```
export your_hf_api_token="Your_Huggingface_API_Token"
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```

**Append the value of the public IP address to the no_proxy list if you are in a proxy environment**

```
export your_no_proxy=${your_no_proxy},"External_Public_IP",chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service
export your_no_proxy=${your_no_proxy},"External_Public_IP",chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service,ollama-service
```

- Linux PC
@@ -105,7 +183,7 @@ export https_proxy=${your_http_proxy}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export OLLAMA_HOST=${host_ip}
export OLLAMA_MODEL="llama3.2"
```
@@ -116,7 +194,7 @@ export OLLAMA_MODEL="llama3.2"
set EMBEDDING_MODEL_ID=BAAI/bge-base-en-v1.5
set RERANK_MODEL_ID=BAAI/bge-reranker-base
set INDEX_NAME=rag-redis
set HUGGINGFACEHUB_API_TOKEN=%your_hf_api_token%
set HUGGINGFACEHUB_API_TOKEN=%HUGGINGFACEHUB_API_TOKEN%
set OLLAMA_HOST=host.docker.internal
set OLLAMA_MODEL="llama3.2"
```

@@ -109,7 +109,7 @@ services:
      - RETRIEVER_SERVICE_HOST_IP=retriever
      - RERANK_SERVER_HOST_IP=tei-reranking-service
      - RERANK_SERVER_PORT=80
      - LLM_SERVER_HOST_IP=${OLLAMA_HOST}
      - LLM_SERVER_HOST_IP=ollama-service
      - LLM_SERVER_PORT=11434
      - LLM_MODEL=${OLLAMA_MODEL}
      - LOGFLAG=${LOGFLAG}

@@ -7,15 +7,17 @@ pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null

if [ -z "${your_hf_api_token}" ]; then
    echo "Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set your_hf_api_token."
export host_ip=$(hostname -I | awk '{print $1}')

if [ -z "${HUGGINGFACEHUB_API_TOKEN}" ]; then
    echo "Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set HUGGINGFACEHUB_API_TOKEN."
fi

if [ -z "${host_ip}" ]; then
    echo "Error: host_ip is not set. Please set host_ip first."
fi

export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export INDEX_NAME="rag-redis"

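A small behavioral sketch of the updated check (assuming the script is sourced from its own directory):

```bash
# Sourcing without HUGGINGFACEHUB_API_TOKEN exported prints the error message from the check above
unset HUGGINGFACEHUB_API_TOKEN
source ./set_env.sh   # prints: Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set HUGGINGFACEHUB_API_TOKEN.
```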
@@ -34,16 +34,36 @@ To set up environment variables for deploying ChatQnA services, follow these steps:
```

3. Set up other environment variables:

```bash
source ./set_env.sh
```

4. Change the model used for LLM serving

By default, Meta-Llama-3-8B-Instruct is used for LLM serving; the default model can be changed to other validated LLM models.
Please pick a [validated llm model](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
To change the default model defined in set_env.sh, overwrite it by exporting LLM_MODEL_ID to the new model or by modifying set_env.sh, and then repeat step 3.
For example, change to Llama-2-7b-chat-hf using the following command:

```bash
export LLM_MODEL_ID="meta-llama/Llama-2-7b-chat-hf"
```

## Quick Start: 2. Run Docker Compose

```bash
docker compose up -d
```

To enable Open Telemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
CPU example with the Open Telemetry feature:

```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```

It will automatically download the docker image from `docker hub`:

```bash
@@ -263,12 +283,16 @@ If use vLLM as the LLM serving backend.
docker compose -f compose.yaml up -d
# Start ChatQnA without Rerank Pipeline
docker compose -f compose_without_rerank.yaml up -d
# Start ChatQnA with Rerank Pipeline and Open Telemetry Tracing
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```

If using TGI as the LLM serving backend:

```bash
docker compose -f compose_tgi.yaml up -d
# Start ChatQnA with Open Telemetry Tracing
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d
```

### Validate Microservices

27
ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml
Normal file
@@ -0,0 +1,27 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  tei-embedding-service:
    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
  tei-reranking-service:
    command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: jaeger
    ports:
      - "16686:16686"
      - "4317:4317"
      - "4318:4318"
      - "9411:9411"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      COLLECTOR_ZIPKIN_HOST_PORT: 9411
    restart: unless-stopped
  chatqna-xeon-backend-server:
    environment:
      - ENABLE_OPEA_TELEMETRY=true
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
@@ -31,7 +31,7 @@ services:
    ports:
      - "6006:80"
    volumes:
      - "./data:/data"
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
    ports:
      - "8808:80"
    volumes:
      - "./data:/data"
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
    ports:
      - "9009:80"
    volumes:
      - "./data:/data"
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 128g
    environment:
      no_proxy: ${no_proxy}
227
ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
Normal file
@@ -0,0 +1,227 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
services:
|
||||
etcd:
|
||||
container_name: milvus-etcd
|
||||
image: quay.io/coreos/etcd:v3.5.5
|
||||
environment:
|
||||
- ETCD_AUTO_COMPACTION_MODE=revision
|
||||
- ETCD_AUTO_COMPACTION_RETENTION=1000
|
||||
- ETCD_QUOTA_BACKEND_BYTES=4294967296
|
||||
- ETCD_SNAPSHOT_COUNT=50000
|
||||
volumes:
|
||||
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
|
||||
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
|
||||
healthcheck:
|
||||
test: ["CMD", "etcdctl", "endpoint", "health"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
|
||||
minio:
|
||||
container_name: milvus-minio
|
||||
image: minio/minio:RELEASE.2023-03-20T20-16-18Z
|
||||
environment:
|
||||
MINIO_ACCESS_KEY: minioadmin
|
||||
MINIO_SECRET_KEY: minioadmin
|
||||
ports:
|
||||
- "${MINIO_PORT1:-5044}:9001"
|
||||
- "${MINIO_PORT2:-5043}:9000"
|
||||
volumes:
|
||||
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
|
||||
command: minio server /minio_data --console-address ":9001"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
|
||||
milvus-standalone:
|
||||
container_name: milvus-standalone
|
||||
image: milvusdb/milvus:v2.4.6
|
||||
command: ["milvus", "run", "standalone"]
|
||||
security_opt:
|
||||
- seccomp:unconfined
|
||||
environment:
|
||||
ETCD_ENDPOINTS: etcd:2379
|
||||
MINIO_ADDRESS: minio:9000
|
||||
volumes:
|
||||
- ${DOCKER_VOLUME_DIRECTORY:-.}/milvus.yaml:/milvus/configs/milvus.yaml
|
||||
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
|
||||
interval: 30s
|
||||
start_period: 90s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
ports:
|
||||
- "19530:19530"
|
||||
- "${MILVUS_STANDALONE_PORT:-9091}:9091"
|
||||
depends_on:
|
||||
- "etcd"
|
||||
- "minio"
|
||||
|
||||
dataprep-milvus-service:
|
||||
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
|
||||
container_name: dataprep-milvus-server
|
||||
ports:
|
||||
- "${DATAPREP_PORT:-11101}:5000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MILVUS"
|
||||
MILVUS_HOST: ${host_ip}
|
||||
MILVUS_PORT: 19530
|
||||
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
|
||||
LOGFLAG: ${LOGFLAG}
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
milvus-standalone:
|
||||
condition: service_healthy
|
||||
etcd:
|
||||
condition: service_healthy
|
||||
minio:
|
||||
condition: service_healthy
|
||||
|
||||
retriever:
|
||||
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
|
||||
container_name: retriever-milvus-server
|
||||
depends_on:
|
||||
- milvus-standalone
|
||||
ports:
|
||||
- "7000:7000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
MILVUS_HOST: ${host_ip}
|
||||
MILVUS_PORT: 19530
|
||||
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
LOGFLAG: ${LOGFLAG}
|
||||
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_MILVUS"
|
||||
restart: unless-stopped
|
||||
|
||||
tei-embedding-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: tei-embedding-server
|
||||
ports:
|
||||
- "6006:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
|
||||
|
||||
tei-reranking-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: tei-reranking-server
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
|
||||
|
||||
vllm-service:
|
||||
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
|
||||
container_name: vllm-service
|
||||
ports:
|
||||
- "9009:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
shm_size: 128g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://$host_ip:9009/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 10s
|
||||
retries: 100
|
||||
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
|
||||
|
||||
chatqna-xeon-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-xeon-backend-server
|
||||
depends_on:
|
||||
- milvus-standalone
|
||||
- tei-embedding-service
|
||||
- dataprep-milvus-service
|
||||
- retriever
|
||||
- tei-reranking-service
|
||||
- vllm-service
|
||||
ports:
|
||||
- "8888:8888"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- MEGA_SERVICE_HOST_IP=chatqna-xeon-backend-server
|
||||
- EMBEDDING_SERVER_HOST_IP=tei-embedding-service
|
||||
- RETRIEVER_SERVICE_HOST_IP=retriever
|
||||
- EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80}
|
||||
- RERANK_SERVER_HOST_IP=tei-reranking-service
|
||||
- RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
|
||||
- LLM_SERVER_HOST_IP=vllm-service
|
||||
- LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
|
||||
- LLM_MODEL=${LLM_MODEL_ID}
|
||||
- LOGFLAG=${LOGFLAG}
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
chatqna-xeon-ui-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
|
||||
container_name: chatqna-xeon-ui-server
|
||||
depends_on:
|
||||
- chatqna-xeon-backend-server
|
||||
ports:
|
||||
- "5173:5173"
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
chatqna-xeon-nginx-server:
|
||||
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
|
||||
container_name: chatqna-xeon-nginx-server
|
||||
depends_on:
|
||||
- chatqna-xeon-backend-server
|
||||
- chatqna-xeon-ui-server
|
||||
ports:
|
||||
- "${NGINX_PORT:-80}:80"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- FRONTEND_SERVICE_IP=chatqna-xeon-ui-server
|
||||
- FRONTEND_SERVICE_PORT=5173
|
||||
- BACKEND_SERVICE_NAME=chatqna
|
||||
- BACKEND_SERVICE_IP=chatqna-xeon-backend-server
|
||||
- BACKEND_SERVICE_PORT=8888
|
||||
- DATAPREP_SERVICE_IP=dataprep-milvus-service
|
||||
- DATAPREP_SERVICE_PORT=5000
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
@@ -28,7 +28,7 @@ services:
|
||||
ports:
|
||||
- "6006:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -59,7 +59,7 @@ services:
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -75,7 +75,7 @@ services:
|
||||
ports:
|
||||
- "9009:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 128g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
|
||||
@@ -32,7 +32,7 @@ services:
|
||||
ports:
|
||||
- "6040:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -64,7 +64,7 @@ services:
|
||||
ports:
|
||||
- "6041:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -80,7 +80,7 @@ services:
|
||||
ports:
|
||||
- "6042:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 128g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
tei-embedding-service:
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
tei-reranking-service:
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
tgi-service:
|
||||
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
container_name: jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
- "4317:4317"
|
||||
- "4318:4318"
|
||||
- "9411:9411"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
COLLECTOR_ZIPKIN_HOST_PORT: 9411
|
||||
restart: unless-stopped
|
||||
chatqna-xeon-backend-server:
|
||||
environment:
|
||||
- ENABLE_OPEA_TELEMETRY=true
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
@@ -31,7 +31,7 @@ services:
|
||||
ports:
|
||||
- "6006:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -64,7 +64,7 @@ services:
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -80,7 +80,7 @@ services:
|
||||
ports:
|
||||
- "9009:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
|
||||
@@ -31,7 +31,7 @@ services:
|
||||
ports:
|
||||
- "6006:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -64,7 +64,7 @@ services:
|
||||
ports:
|
||||
- "9009:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 128g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
|
||||
811
ChatQnA/docker_compose/intel/cpu/xeon/milvus.yaml
Normal file
@@ -0,0 +1,811 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Licensed to the LF AI & Data foundation under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Related configuration of etcd, used to store Milvus metadata & service discovery.
|
||||
etcd:
|
||||
endpoints: localhost:2379
|
||||
rootPath: by-dev # The root path where data is stored in etcd
|
||||
metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
|
||||
kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
|
||||
log:
|
||||
level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
|
||||
# path is one of:
|
||||
# - "default" as os.Stderr,
|
||||
# - "stderr" as os.Stderr,
|
||||
# - "stdout" as os.Stdout,
|
||||
# - file path to append server logs to.
|
||||
# please adjust in embedded Milvus: /tmp/milvus/logs/etcd.log
|
||||
path: stdout
|
||||
ssl:
|
||||
enabled: false # Whether to support ETCD secure connection mode
|
||||
tlsCert: /path/to/etcd-client.pem # path to your cert file
|
||||
tlsKey: /path/to/etcd-client-key.pem # path to your key file
|
||||
tlsCACert: /path/to/ca.pem # path to your CACert file
|
||||
# TLS min version
|
||||
# Optional values: 1.0, 1.1, 1.2, 1.3。
|
||||
# We recommend using version 1.2 and above.
|
||||
tlsMinVersion: 1.3
|
||||
requestTimeout: 10000 # Etcd operation timeout in milliseconds
|
||||
use:
|
||||
embed: false # Whether to enable embedded Etcd (an in-process EtcdServer).
|
||||
data:
|
||||
dir: default.etcd # Embedded Etcd only. please adjust in embedded Milvus: /tmp/milvus/etcdData/
|
||||
auth:
|
||||
enabled: false # Whether to enable authentication
|
||||
userName: # username for etcd authentication
|
||||
password: # password for etcd authentication
|
||||
|
||||
metastore:
|
||||
type: etcd # Default value: etcd, Valid values: [etcd, tikv]
|
||||
|
||||
# Related configuration of tikv, used to store Milvus metadata.
|
||||
# Notice that when TiKV is enabled for metastore, you still need to have etcd for service discovery.
|
||||
# TiKV is a good option when the metadata size requires better horizontal scalability.
|
||||
tikv:
|
||||
endpoints: 127.0.0.1:2389 # Note that the default pd port of tikv is 2379, which conflicts with etcd.
|
||||
rootPath: by-dev # The root path where data is stored in tikv
|
||||
metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
|
||||
kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
|
||||
requestTimeout: 10000 # ms, tikv request timeout
|
||||
snapshotScanSize: 256 # batch size of tikv snapshot scan
|
||||
ssl:
|
||||
enabled: false # Whether to support TiKV secure connection mode
|
||||
tlsCert: # path to your cert file
|
||||
tlsKey: # path to your key file
|
||||
tlsCACert: # path to your CACert file
|
||||
|
||||
localStorage:
|
||||
path: /var/lib/milvus/data/ # please adjust in embedded Milvus: /tmp/milvus/data/
|
||||
|
||||
# Related configuration of MinIO/S3/GCS or any other service supports S3 API, which is responsible for data persistence for Milvus.
|
||||
# We refer to the storage service as MinIO/S3 in the following description for simplicity.
|
||||
minio:
|
||||
address: localhost # Address of MinIO/S3
|
||||
port: 9000 # Port of MinIO/S3
|
||||
accessKeyID: minioadmin # accessKeyID of MinIO/S3
|
||||
secretAccessKey: minioadmin # MinIO/S3 encryption string
|
||||
useSSL: false # Access to MinIO/S3 with SSL
|
||||
ssl:
|
||||
tlsCACert: /path/to/public.crt # path to your CACert file
|
||||
bucketName: a-bucket # Bucket name in MinIO/S3
|
||||
rootPath: files # The root path where the message is stored in MinIO/S3
|
||||
# Whether to useIAM role to access S3/GCS instead of access/secret keys
|
||||
# For more information, refer to
|
||||
# aws: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html
|
||||
# gcp: https://cloud.google.com/storage/docs/access-control/iam
|
||||
# aliyun (ack): https://www.alibabacloud.com/help/en/container-service-for-kubernetes/latest/use-rrsa-to-enforce-access-control
|
||||
# aliyun (ecs): https://www.alibabacloud.com/help/en/elastic-compute-service/latest/attach-an-instance-ram-role
|
||||
useIAM: false
|
||||
# Cloud Provider of S3. Supports: "aws", "gcp", "aliyun".
|
||||
# You can use "aws" for other cloud provider supports S3 API with signature v4, e.g.: minio
|
||||
# You can use "gcp" for other cloud provider supports S3 API with signature v2
|
||||
# You can use "aliyun" for other cloud provider uses virtual host style bucket
|
||||
# When useIAM enabled, only "aws", "gcp", "aliyun" is supported for now
|
||||
cloudProvider: aws
|
||||
# Custom endpoint for fetch IAM role credentials. when useIAM is true & cloudProvider is "aws".
|
||||
# Leave it empty if you want to use AWS default endpoint
|
||||
iamEndpoint:
|
||||
logLevel: fatal # Log level for aws sdk log. Supported level: off, fatal, error, warn, info, debug, trace
|
||||
region: # Specify minio storage system location region
|
||||
useVirtualHost: false # Whether use virtual host mode for bucket
|
||||
requestTimeoutMs: 10000 # minio timeout for request time in milliseconds
|
||||
# The maximum number of objects requested per batch in minio ListObjects rpc,
|
||||
# 0 means using oss client by default, decrease these configuration if ListObjects timeout
|
||||
listObjectsMaxKeys: 0
|
||||
|
||||
# Milvus supports four MQ: rocksmq(based on RockDB), natsmq(embedded nats-server), Pulsar and Kafka.
|
||||
# You can change your mq by setting mq.type field.
|
||||
# If you don't set mq.type field as default, there is a note about enabling priority if we config multiple mq in this file.
|
||||
# 1. standalone(local) mode: rocksmq(default) > natsmq > Pulsar > Kafka
|
||||
# 2. cluster mode: Pulsar(default) > Kafka (rocksmq and natsmq is unsupported in cluster mode)
|
||||
mq:
|
||||
# Default value: "default"
|
||||
# Valid values: [default, pulsar, kafka, rocksmq, natsmq]
|
||||
type: default
|
||||
enablePursuitMode: true # Default value: "true"
|
||||
pursuitLag: 10 # time tick lag threshold to enter pursuit mode, in seconds
|
||||
pursuitBufferSize: 8388608 # pursuit mode buffer size in bytes
|
||||
mqBufSize: 16 # MQ client consumer buffer length
|
||||
dispatcher:
|
||||
mergeCheckInterval: 1 # the interval time(in seconds) for dispatcher to check whether to merge
|
||||
targetBufSize: 16 # the length of channel buffer for targe
|
||||
maxTolerantLag: 3 # Default value: "3", the timeout(in seconds) that target sends msgPack
|
||||
|
||||
# Related configuration of pulsar, used to manage Milvus logs of recent mutation operations, output streaming log, and provide log publish-subscribe services.
|
||||
pulsar:
|
||||
address: localhost # Address of pulsar
|
||||
port: 6650 # Port of Pulsar
|
||||
webport: 80 # Web port of pulsar, if you connect directly without proxy, should use 8080
|
||||
maxMessageSize: 5242880 # 5 * 1024 * 1024 Bytes, Maximum size of each message in pulsar.
|
||||
tenant: public
|
||||
namespace: default
|
||||
requestTimeout: 60 # pulsar client global request timeout in seconds
|
||||
enableClientMetrics: false # Whether to register pulsar client metrics into milvus metrics path.
|
||||
|
||||
# If you want to enable kafka, needs to comment the pulsar configs
|
||||
# kafka:
|
||||
# brokerList:
|
||||
# saslUsername:
|
||||
# saslPassword:
|
||||
# saslMechanisms:
|
||||
# securityProtocol:
|
||||
# ssl:
|
||||
# enabled: false # whether to enable ssl mode
|
||||
# tlsCert: # path to client's public key (PEM) used for authentication
|
||||
# tlsKey: # path to client's private key (PEM) used for authentication
|
||||
# tlsCaCert: # file or directory path to CA certificate(s) for verifying the broker's key
|
||||
# tlsKeyPassword: # private key passphrase for use with ssl.key.location and set_ssl_cert(), if any
|
||||
# readTimeout: 10
|
||||
|
||||
rocksmq:
|
||||
# The path where the message is stored in rocksmq
|
||||
# please adjust in embedded Milvus: /tmp/milvus/rdb_data
|
||||
path: /var/lib/milvus/rdb_data
|
||||
lrucacheratio: 0.06 # rocksdb cache memory ratio
|
||||
rocksmqPageSize: 67108864 # 64 MB, 64 * 1024 * 1024 bytes, The size of each page of messages in rocksmq
|
||||
retentionTimeInMinutes: 4320 # 3 days, 3 * 24 * 60 minutes, The retention time of the message in rocksmq.
|
||||
retentionSizeInMB: 8192 # 8 GB, 8 * 1024 MB, The retention size of the message in rocksmq.
|
||||
compactionInterval: 86400 # 1 day, trigger rocksdb compaction every day to remove deleted data
|
||||
compressionTypes: 0,0,7,7,7 # compaction compression type, only support use 0,7. 0 means not compress, 7 will use zstd. Length of types means num of rocksdb level.
|
||||
|
||||
# natsmq configuration.
|
||||
# more detail: https://docs.nats.io/running-a-nats-service/configuration
|
||||
natsmq:
|
||||
server:
|
||||
port: 4222 # Port for nats server listening
|
||||
storeDir: /var/lib/milvus/nats # Directory to use for JetStream storage of nats
|
||||
maxFileStore: 17179869184 # Maximum size of the 'file' storage
|
||||
maxPayload: 8388608 # Maximum number of bytes in a message payload
|
||||
maxPending: 67108864 # Maximum number of bytes buffered for a connection Applies to client connections
|
||||
initializeTimeout: 4000 # waiting for initialization of natsmq finished
|
||||
monitor:
|
||||
trace: false # If true enable protocol trace log messages
|
||||
debug: false # If true enable debug log messages
|
||||
logTime: true # If set to false, log without timestamps.
|
||||
logFile: /tmp/milvus/logs/nats.log # Log file path relative to .. of milvus binary if use relative path
|
||||
logSizeLimit: 536870912 # Size in bytes after the log file rolls over to a new one
|
||||
retention:
|
||||
maxAge: 4320 # Maximum age of any message in the P-channel
|
||||
maxBytes: # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size
|
||||
maxMsgs: # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit
|
||||
|
||||
# Related configuration of rootCoord, used to handle data definition language (DDL) and data control language (DCL) requests
|
||||
rootCoord:
|
||||
dmlChannelNum: 16 # The number of dml channels created at system startup
|
||||
maxPartitionNum: 1024 # Maximum number of partitions in a collection
|
||||
minSegmentSizeToEnableIndex: 1024 # It's a threshold. When the segment size is less than this value, the segment will not be indexed
|
||||
enableActiveStandby: false
|
||||
maxDatabaseNum: 64 # Maximum number of database
|
||||
maxGeneralCapacity: 65536 # upper limit for the sum of of product of partitionNumber and shardNumber
|
||||
gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
|
||||
ip: # if not specified, use the first unicastable address
|
||||
port: 53100
|
||||
grpc:
|
||||
serverMaxSendSize: 536870912
|
||||
serverMaxRecvSize: 268435456
|
||||
clientMaxSendSize: 268435456
|
||||
clientMaxRecvSize: 536870912
|
||||
|
||||
# Related configuration of proxy, used to validate client requests and reduce the returned results.
|
||||
proxy:
|
||||
timeTickInterval: 200 # ms, the interval that proxy synchronize the time tick
|
||||
healthCheckTimeout: 3000 # ms, the interval that to do component healthy check
|
||||
msgStream:
|
||||
timeTick:
|
||||
bufSize: 512
|
||||
maxNameLength: 255 # Maximum length of name for a collection or alias
|
||||
# Maximum number of fields in a collection.
|
||||
# As of today (2.2.0 and after) it is strongly DISCOURAGED to set maxFieldNum >= 64.
|
||||
# So adjust at your risk!
|
||||
maxFieldNum: 64
|
||||
maxVectorFieldNum: 4 # Maximum number of vector fields in a collection.
|
||||
maxShardNum: 16 # Maximum number of shards in a collection
|
||||
maxDimension: 32768 # Maximum dimension of a vector
|
||||
# Whether to produce gin logs.\n
|
||||
# please adjust in embedded Milvus: false
|
||||
ginLogging: true
|
||||
ginLogSkipPaths: / # skip url path for gin log
|
||||
maxTaskNum: 1024 # max task number of proxy task queue
|
||||
mustUsePartitionKey: false # switch for whether proxy must use partition key for the collection
|
||||
accessLog:
|
||||
enable: false # if use access log
|
||||
minioEnable: false # if upload sealed access log file to minio
|
||||
localPath: /tmp/milvus_access
|
||||
filename: # Log filename, leave empty to use stdout.
|
||||
maxSize: 64 # Max size for a single file, in MB.
|
||||
cacheSize: 10240 # Size of log of memory cache, in B
|
||||
rotatedTime: 0 # Max time for single access log file in seconds
|
||||
remotePath: access_log/ # File path in minIO
|
||||
remoteMaxTime: 0 # Max time for log file in minIO, in hours
|
||||
formatters:
|
||||
base:
|
||||
format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost]"
|
||||
query:
|
||||
format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost] [database: $database_name] [collection: $collection_name] [partitions: $partition_name] [expr: $method_expr]"
|
||||
methods: "Query,Search,Delete"
|
||||
connectionCheckIntervalSeconds: 120 # the interval time(in seconds) for connection manager to scan inactive client info
|
||||
connectionClientInfoTTLSeconds: 86400 # inactive client info TTL duration, in seconds
|
||||
maxConnectionNum: 10000 # the max client info numbers that proxy should manage, avoid too many client infos
|
||||
gracefulStopTimeout: 30 # seconds. force stop node without graceful stop
|
||||
slowQuerySpanInSeconds: 5 # a query whose execution time exceeds `slowQuerySpanInSeconds` is considered slow, in seconds.
|
||||
http:
|
||||
enabled: true # Whether to enable the http server
|
||||
debug_mode: false # Whether to enable http server debug mode
|
||||
port: # high-level restful api
|
||||
acceptTypeAllowInt64: true # high-level restful api, whether http client can deal with int64
|
||||
enablePprof: true # Whether to enable pprof middleware on the metrics port
|
||||
ip: # if not specified, use the first unicastable address
|
||||
port: 19530
|
||||
internalPort: 19529
|
||||
grpc:
|
||||
serverMaxSendSize: 268435456
|
||||
serverMaxRecvSize: 67108864
|
||||
clientMaxSendSize: 268435456
|
||||
clientMaxRecvSize: 67108864
|
||||
|
||||
# Related configuration of queryCoord, used to manage topology and load balancing for the query nodes, and handoff from growing segments to sealed segments.
|
||||
queryCoord:
|
||||
taskMergeCap: 1
|
||||
taskExecutionCap: 256
|
||||
autoHandoff: true # Enable auto handoff
|
||||
autoBalance: true # Enable auto balance
|
||||
autoBalanceChannel: true # Enable auto balance channel
|
||||
balancer: ScoreBasedBalancer # auto balancer used for segments on queryNodes
|
||||
globalRowCountFactor: 0.1 # the weight used when balancing segments among queryNodes
|
||||
scoreUnbalanceTolerationFactor: 0.05 # the least value for unbalanced extent between from and to nodes when doing balance
|
||||
reverseUnBalanceTolerationFactor: 1.3 # the largest value for unbalanced extent between from and to nodes after doing balance
|
||||
overloadedMemoryThresholdPercentage: 90 # The memory usage percentage above which a query node is considered overloaded
|
||||
balanceIntervalSeconds: 60
|
||||
memoryUsageMaxDifferencePercentage: 30
|
||||
rowCountFactor: 0.4 # the row count weight used when balancing segments among queryNodes
|
||||
segmentCountFactor: 0.4 # the segment count weight used when balancing segments among queryNodes
|
||||
globalSegmentCountFactor: 0.1 # the segment count weight used when balancing segments among queryNodes
|
||||
segmentCountMaxSteps: 50 # segment count based plan generator max steps
|
||||
rowCountMaxSteps: 50 # row count based plan generator max steps
|
||||
randomMaxSteps: 10 # random based plan generator max steps
|
||||
growingRowCountWeight: 4 # the memory weight of growing segment row count
|
||||
balanceCostThreshold: 0.001 # the threshold of balance cost, if the difference of cluster's cost after executing the balance plan is less than this value, the plan will not be executed
|
||||
checkSegmentInterval: 1000
|
||||
checkChannelInterval: 1000
|
||||
checkBalanceInterval: 10000
|
||||
checkIndexInterval: 10000
|
||||
channelTaskTimeout: 60000 # 1 minute
|
||||
segmentTaskTimeout: 120000 # 2 minute
|
||||
distPullInterval: 500
|
||||
collectionObserverInterval: 200
|
||||
checkExecutedFlagInterval: 100
|
||||
heartbeatAvailableInterval: 10000 # 10s, Only QueryNodes which fetched heartbeats within the duration are available
|
||||
loadTimeoutSeconds: 600
|
||||
distRequestTimeout: 5000 # the request timeout for querycoord fetching data distribution from querynodes, in milliseconds
|
||||
heatbeatWarningLag: 5000 # the lag threshold at which querycoord reports a warning when the last heartbeat is too old, in milliseconds
|
||||
checkHandoffInterval: 5000
|
||||
enableActiveStandby: false
|
||||
checkInterval: 1000
|
||||
checkHealthInterval: 3000 # 3s, the interval at which query coord checks the health of query nodes
|
||||
checkHealthRPCTimeout: 2000 # 2000ms, the timeout of the check-health RPC to query nodes
|
||||
brokerTimeout: 5000 # 5000ms, querycoord broker rpc timeout
|
||||
collectionRecoverTimes: 3 # if collection recover times reach the limit during loading state, release it
|
||||
observerTaskParallel: 16 # the parallel observer dispatcher task number
|
||||
checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
|
||||
checkNodeSessionInterval: 60 # the interval(in seconds) of check querynode cluster session
|
||||
gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
|
||||
enableStoppingBalance: true # whether to enable stopping balance
|
||||
channelExclusiveNodeFactor: 4 # the minimum number of nodes required to enable the channel's exclusive mode
|
||||
cleanExcludeSegmentInterval: 60 # the interval (in seconds) for cleaning the pipeline's excluded segments, which are used to filter out invalid data
|
||||
ip: # if not specified, use the first unicastable address
|
||||
port: 19531
|
||||
grpc:
|
||||
serverMaxSendSize: 536870912
|
||||
serverMaxRecvSize: 268435456
|
||||
clientMaxSendSize: 268435456
|
||||
clientMaxRecvSize: 536870912
|
||||
|
||||
# Related configuration of queryNode, used to run hybrid search between vector and scalar data.
|
||||
queryNode:
|
||||
stats:
|
||||
publishInterval: 1000 # Interval for querynode to report node information (milliseconds)
|
||||
segcore:
|
||||
knowhereThreadPoolNumRatio: 4 # The number of threads in knowhere's thread pool. If disk is enabled, the pool size will multiply with knowhereThreadPoolNumRatio([1, 32]).
|
||||
chunkRows: 128 # The number of vectors in a chunk.
|
||||
interimIndex:
|
||||
enableIndex: true # Enable segment build with index to accelerate vector search when segment is in growing or binlog.
|
||||
nlist: 128 # temp index nlist, recommended to be set to sqrt(chunkRows); must be smaller than chunkRows/8
|
||||
nprobe: 16 # nprobe for searching the small index, based on your accuracy requirement; must be smaller than nlist
|
||||
memExpansionRate: 1.15 # extra memory needed by building interim index
|
||||
buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num
|
||||
knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic
|
||||
loadMemoryUsageFactor: 1 # The multiplication factor for calculating memory usage while loading segments
|
||||
enableDisk: false # enable querynode to load disk indexes and search on disk indexes
|
||||
maxDiskUsagePercentage: 95
|
||||
cache:
|
||||
enabled: true
|
||||
memoryLimit: 2147483648 # 2 GB, 2 * 1024 *1024 *1024
|
||||
readAheadPolicy: willneed # The read ahead policy of chunk cache, options: `normal, random, sequential, willneed, dontneed`
|
||||
# options: async, sync, disable.
|
||||
# Specifies the necessity for warming up the chunk cache.
|
||||
# 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the
|
||||
# chunk cache during the load process. This approach has the potential to substantially reduce query/search latency
|
||||
# for a specific duration post-load, albeit accompanied by a concurrent increase in disk usage;
|
||||
# 2. If set to "disable" original vector data will only be loaded into the chunk cache during search/query.
|
||||
warmup: disable
|
||||
mmap:
|
||||
mmapEnabled: false # Enable mmap for loading data
|
||||
lazyload:
|
||||
enabled: false # Enable lazyload for loading data
|
||||
waitTimeout: 30000 # max wait timeout duration in milliseconds before start to do lazyload search and retrieve
|
||||
requestResourceTimeout: 5000 # max timeout in milliseconds for waiting request resource for lazy load, 5s by default
|
||||
requestResourceRetryInterval: 2000 # retry interval in milliseconds for waiting request resource for lazy load, 2s by default
|
||||
maxRetryTimes: 1 # max retry times for lazy load, 1 by default
|
||||
maxEvictPerRetry: 1 # max evict count for lazy load, 1 by default
|
||||
grouping:
|
||||
enabled: true
|
||||
maxNQ: 1000
|
||||
topKMergeRatio: 20
|
||||
scheduler:
|
||||
receiveChanSize: 10240
|
||||
unsolvedQueueSize: 10240
|
||||
# maxReadConcurrentRatio is the concurrency ratio of read task (search task and query task).
|
||||
# Max read concurrency would be the value of hardware.GetCPUNum * maxReadConcurrentRatio.
|
||||
# It defaults to 2.0, which means max read concurrency would be the value of hardware.GetCPUNum * 2.
|
||||
# Max read concurrency must greater than or equal to 1, and less than or equal to hardware.GetCPUNum * 100.
|
||||
# (0, 100]
|
||||
maxReadConcurrentRatio: 1
|
||||
cpuRatio: 10 # ratio used to estimate read task cpu usage.
|
||||
maxTimestampLag: 86400
|
||||
scheduleReadPolicy:
|
||||
# fifo: A FIFO queue support the schedule.
|
||||
# user-task-polling:
|
||||
# The user's tasks will be polled one by one and scheduled.
|
||||
# Scheduling is fair on task granularity.
|
||||
# The policy is based on the username for authentication.
|
||||
# And an empty username is considered the same user.
|
||||
# When there are no multiple users, the policy decays into FIFO.
|
||||
name: fifo
|
||||
taskQueueExpire: 60 # Controls how long (in seconds) a queue is retained after it becomes empty
|
||||
enableCrossUserGrouping: false # Enable cross-user grouping when using the user-task-polling policy (disable it if users' tasks cannot be merged with each other)
|
||||
maxPendingTaskPerUser: 1024 # Max pending task per user in scheduler
|
||||
dataSync:
|
||||
flowGraph:
|
||||
maxQueueLength: 16 # Maximum length of task queue in flowgraph
|
||||
maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
|
||||
enableSegmentPrune: false # use partition prune function on shard delegator
|
||||
ip: # if not specified, use the first unicastable address
|
||||
port: 21123
|
||||
grpc:
|
||||
serverMaxSendSize: 536870912
|
||||
serverMaxRecvSize: 268435456
|
||||
clientMaxSendSize: 268435456
|
||||
clientMaxRecvSize: 536870912
|
||||
|
||||
indexCoord:
|
||||
bindIndexNodeMode:
|
||||
enable: false
|
||||
address: localhost:22930
|
||||
withCred: false
|
||||
nodeID: 0
|
||||
segment:
|
||||
minSegmentNumRowsToEnableIndex: 1024 # It's a threshold. When the segment num rows is less than this value, the segment will not be indexed
|
||||
|
||||
indexNode:
|
||||
scheduler:
|
||||
buildParallel: 1
|
||||
enableDisk: true # enable index node build disk vector index
|
||||
maxDiskUsagePercentage: 95
|
||||
ip: # if not specified, use the first unicastable address
|
||||
port: 21121
|
||||
grpc:
|
||||
serverMaxSendSize: 536870912
|
||||
serverMaxRecvSize: 268435456
|
||||
clientMaxSendSize: 268435456
|
||||
clientMaxRecvSize: 536870912
|
||||
|
||||
dataCoord:
|
||||
channel:
|
||||
watchTimeoutInterval: 300 # Timeout on watching channels (in seconds). Datanode tickler updates of watch progress reset the timeout timer.
|
||||
balanceWithRpc: true # Whether to enable balancing with RPC; defaults to using etcd watch
|
||||
legacyVersionWithoutRPCWatch: 2.4.1 # Datanodes <= this version are considered legacy nodes, which don't have RPC-based watch(). This is only used during rolling upgrade, where legacy nodes won't get new channels
|
||||
balanceSilentDuration: 300 # The duration after which the channel manager starts background channel balancing
|
||||
balanceInterval: 360 # The interval at which the channel manager checks dml channel balance status
|
||||
checkInterval: 1 # The interval in seconds with which the channel manager advances channel states
|
||||
notifyChannelOperationTimeout: 5 # Timeout for notifying channel operations (in seconds).
|
||||
segment:
|
||||
maxSize: 1024 # Maximum size of a segment in MB
|
||||
diskSegmentMaxSize: 2048 # Maximum size of a segment in MB for collection which has Disk index
|
||||
sealProportion: 0.12
|
||||
assignmentExpiration: 2000 # The time of the assignment expiration in ms
|
||||
allocLatestExpireAttempt: 200 # The number of attempts to allocate the latest lastExpire from rootCoord after restart
|
||||
maxLife: 86400 # The max lifetime of segment in seconds, 24*60*60
|
||||
# If a segment didn't accept dml records in maxIdleTime and the size of segment is greater than
|
||||
# minSizeFromIdleToSealed, Milvus will automatically seal it.
|
||||
# The max idle time of segment in seconds, 10*60.
|
||||
maxIdleTime: 600
|
||||
minSizeFromIdleToSealed: 16 # The minimum size in MB of a segment that can be sealed from the idle state.
|
||||
# The max number of binlog file for one segment, the segment will be sealed if
|
||||
# the number of binlog file reaches to max value.
|
||||
maxBinlogFileNumber: 32
|
||||
smallProportion: 0.5 # The segment is considered as "small segment" when its # of rows is smaller than
|
||||
# (smallProportion * segment max # of rows).
|
||||
# A compaction will happen on small segments if the segment after compaction will have
|
||||
compactableProportion: 0.85
|
||||
# over (compactableProportion * segment max # of rows) rows.
|
||||
# MUST BE GREATER THAN OR EQUAL TO <smallProportion>!!!
|
||||
# During compaction, the size of segment # of rows is able to exceed segment max # of rows by (expansionRate-1) * 100%.
|
||||
expansionRate: 1.25
|
||||
autoUpgradeSegmentIndex: false # whether auto upgrade segment index to index engine's version
|
||||
enableCompaction: true # Enable data segment compaction
|
||||
compaction:
|
||||
enableAutoCompaction: true
|
||||
indexBasedCompaction: true
|
||||
rpcTimeout: 10
|
||||
maxParallelTaskNum: 10
|
||||
workerMaxParallelTaskNum: 2
|
||||
levelzero:
|
||||
forceTrigger:
|
||||
minSize: 8388608 # The minimum size in bytes to force trigger a LevelZero Compaction, default as 8MB
|
||||
maxSize: 67108864 # The maximum size in bytes to force trigger a LevelZero Compaction, default as 64MB
|
||||
deltalogMinNum: 10 # The minimum number of deltalog files to force trigger a LevelZero Compaction
|
||||
deltalogMaxNum: 30 # The maximum number of deltalog files to force trigger a LevelZero Compaction, default as 30
|
||||
enableGarbageCollection: true
|
||||
gc:
|
||||
interval: 3600 # gc interval in seconds
|
||||
missingTolerance: 86400 # file meta missing tolerance duration in seconds, default to 24hr(1d)
|
||||
dropTolerance: 10800 # tolerance duration in seconds for files that belong to dropped entities
|
||||
removeConcurrent: 32 # number of concurrent goroutines to remove dropped s3 objects
|
||||
scanInterval: 168 # garbage collection scan residue interval in hours
|
||||
enableActiveStandby: false
|
||||
brokerTimeout: 5000 # 5000ms, dataCoord broker rpc timeout
|
||||
autoBalance: true # Enable auto balance
|
||||
checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
|
||||
import:
|
||||
filesPerPreImportTask: 2 # The maximum number of files allowed per pre-import task.
|
||||
taskRetention: 10800 # The retention period in seconds for tasks in the Completed or Failed state.
|
||||
maxSizeInMBPerImportTask: 6144 # To prevent generating small segments, we re-group imported files. This parameter represents the sum of file sizes in each group (each ImportTask).
|
||||
scheduleInterval: 2 # The interval for scheduling import, measured in seconds.
|
||||
checkIntervalHigh: 2 # The interval for checking import, measured in seconds, is set to a high frequency for the import checker.
|
||||
checkIntervalLow: 120 # The interval for checking import, measured in seconds, is set to a low frequency for the import checker.
|
||||
maxImportFileNumPerReq: 1024 # The maximum number of files allowed per single import request.
|
||||
waitForIndex: true # Indicates whether the import operation waits for the completion of index building.
|
||||
gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
|
||||
ip: # if not specified, use the first unicastable address
|
||||
port: 13333
|
||||
grpc:
|
||||
serverMaxSendSize: 536870912
|
||||
serverMaxRecvSize: 268435456
|
||||
clientMaxSendSize: 268435456
|
||||
clientMaxRecvSize: 536870912
|
||||
|
||||
dataNode:
|
||||
dataSync:
|
||||
flowGraph:
|
||||
maxQueueLength: 16 # Maximum length of task queue in flowgraph
|
||||
maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
|
||||
maxParallelSyncMgrTasks: 256 # The max concurrent sync task number of datanode sync mgr globally
|
||||
skipMode:
|
||||
enable: true # Support skipping some timetick messages to reduce CPU usage
|
||||
skipNum: 4 # Consume one for every n records skipped
|
||||
coldTime: 60 # Turn on skip mode after only timetick messages have been received for this many seconds
|
||||
segment:
|
||||
insertBufSize: 16777216 # Max buffer size to flush for a single segment.
|
||||
deleteBufBytes: 16777216 # Max buffer size in bytes to flush del for a single channel, default as 16MB
|
||||
syncPeriod: 600 # The period to sync segments if buffer is not empty.
|
||||
memory:
|
||||
forceSyncEnable: true # Set true to force sync if memory usage is too high
|
||||
forceSyncSegmentNum: 1 # number of segments to sync; the segments with the largest buffers will be synced
|
||||
checkInterval: 3000 # the interval to check datanode memory usage, in milliseconds
|
||||
forceSyncWatermark: 0.5 # memory watermark for standalone, upon reaching this watermark, segments will be synced.
|
||||
timetick:
|
||||
byRPC: true
|
||||
interval: 500
|
||||
channel:
|
||||
# specify the size of global work pool of all channels
|
||||
# if this parameter <= 0, it will be set to the maximum number of CPUs that can be executing
|
||||
# it is suggested to set it larger when there are many collections, to avoid blocking
|
||||
workPoolSize: -1
|
||||
# specify the size of global work pool for channel checkpoint updating
|
||||
# if this parameter <= 0, it will be set to 10
|
||||
updateChannelCheckpointMaxParallel: 10
|
||||
updateChannelCheckpointInterval: 60 # the interval duration(in seconds) for datanode to update channel checkpoint of each channel
|
||||
updateChannelCheckpointRPCTimeout: 20 # timeout in seconds for UpdateChannelCheckpoint RPC call
|
||||
maxChannelCheckpointsPerPRC: 128 # The maximum number of channel checkpoints per UpdateChannelCheckpoint RPC.
|
||||
channelCheckpointUpdateTickInSeconds: 10 # The frequency, in seconds, at which the channel checkpoint updater executes updates.
|
||||
import:
|
||||
maxConcurrentTaskNum: 16 # The maximum number of import/pre-import tasks allowed to run concurrently on a datanode.
|
||||
maxImportFileSizeInGB: 16 # The maximum file size (in GB) for an import file, where an import file refers to either a Row-Based file or a set of Column-Based files.
|
||||
readBufferSizeInMB: 16 # The data block size (in MB) read from chunk manager by the datanode during import.
|
||||
compaction:
|
||||
levelZeroBatchMemoryRatio: 0.05 # The minimal memory ratio of free memory for level zero compaction executing in batch mode
|
||||
gracefulStopTimeout: 1800 # seconds. force stop node without graceful stop
|
||||
ip: # if not specified, use the first unicastable address
|
||||
port: 21124
|
||||
grpc:
|
||||
serverMaxSendSize: 536870912
|
||||
serverMaxRecvSize: 268435456
|
||||
clientMaxSendSize: 268435456
|
||||
clientMaxRecvSize: 536870912
|
||||
|
||||
# Configures the system log output.
|
||||
log:
|
||||
level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
|
||||
file:
|
||||
rootPath: # root dir path for logs; default "" means no log file will be written. please adjust in embedded Milvus: /tmp/milvus/logs
|
||||
maxSize: 300 # MB
|
||||
maxAge: 10 # Maximum time for log retention, in days.
|
||||
maxBackups: 20
|
||||
format: text # text or json
|
||||
stdout: true # Stdout enable or not
|
||||
|
||||
grpc:
|
||||
log:
|
||||
level: WARNING
|
||||
gracefulStopTimeout: 10 # seconds, time to wait for the graceful stop to finish
|
||||
client:
|
||||
compressionEnabled: false
|
||||
dialTimeout: 200
|
||||
keepAliveTime: 10000
|
||||
keepAliveTimeout: 20000
|
||||
maxMaxAttempts: 10
|
||||
initialBackoff: 0.2
|
||||
maxBackoff: 10
|
||||
minResetInterval: 1000
|
||||
maxCancelError: 32
|
||||
minSessionCheckInterval: 200
|
||||
|
||||
# Configure the proxy tls enable.
|
||||
tls:
|
||||
serverPemPath: configs/cert/server.pem
|
||||
serverKeyPath: configs/cert/server.key
|
||||
caPemPath: configs/cert/ca.pem
|
||||
|
||||
common:
|
||||
defaultPartitionName: _default # default partition name for a collection
|
||||
defaultIndexName: _default_idx # default index name
|
||||
entityExpiration: -1 # Entity expiration in seconds, CAUTION -1 means never expire
|
||||
indexSliceSize: 16 # MB
|
||||
threadCoreCoefficient:
|
||||
highPriority: 10 # This parameter specifies the ratio of threads to cores in the high priority pool
|
||||
middlePriority: 5 # This parameter specifies the ratio of threads to cores in the middle priority pool
|
||||
lowPriority: 1 # This parameter specifies the ratio of threads to cores in the low priority pool
|
||||
buildIndexThreadPoolRatio: 0.75
|
||||
DiskIndex:
|
||||
MaxDegree: 56
|
||||
SearchListSize: 100
|
||||
PQCodeBudgetGBRatio: 0.125
|
||||
BuildNumThreadsRatio: 1
|
||||
SearchCacheBudgetGBRatio: 0.1
|
||||
LoadNumThreadRatio: 8
|
||||
BeamWidthRatio: 4
|
||||
gracefulTime: 5000 # milliseconds. it represents the interval (in ms) by which the request arrival time needs to be subtracted in the case of Bounded Consistency.
|
||||
gracefulStopTimeout: 1800 # seconds. it will force quit the server if the graceful stop process is not completed during this time.
|
||||
storageType: remote # please adjust in embedded Milvus: local, available values are [local, remote, opendal], value minio is deprecated, use remote instead
|
||||
# Default value: auto
|
||||
# Valid values: [auto, avx512, avx2, avx, sse4_2]
|
||||
# This configuration is only used by querynode and indexnode, it selects CPU instruction set for Searching and Index-building.
|
||||
simdType: auto
|
||||
security:
|
||||
authorizationEnabled: false
|
||||
# The superusers will ignore some system check processes,
|
||||
# like the old password verification when updating the credential
|
||||
superUsers:
|
||||
tlsMode: 0
|
||||
session:
|
||||
ttl: 30 # TTL value when the session grants a lease to register the service
|
||||
retryTimes: 30 # number of retries when the session sends etcd requests
|
||||
locks:
|
||||
metrics:
|
||||
enable: false # whether to gather statistics for metric locks
|
||||
threshold:
|
||||
info: 500 # minimum milliseconds for printing durations in info level
|
||||
warn: 1000 # minimum milliseconds for printing durations in warn level
|
||||
storage:
|
||||
scheme: s3
|
||||
enablev2: false
|
||||
ttMsgEnabled: true # Whether the instance enables sending ts (time tick) messages
|
||||
traceLogMode: 0 # trace request info
|
||||
bloomFilterSize: 100000 # bloom filter initial size
|
||||
maxBloomFalsePositive: 0.001 # max false positive rate for bloom filter
|
||||
|
||||
# QuotaConfig, configurations of Milvus quota and limits.
|
||||
# By default, we enable:
|
||||
# 1. TT protection;
|
||||
# 2. Memory protection.
|
||||
# 3. Disk quota protection.
|
||||
# You can enable:
|
||||
# 1. DML throughput limitation;
|
||||
# 2. DDL, DQL qps/rps limitation;
|
||||
# 3. DQL Queue length/latency protection;
|
||||
# 4. DQL result rate protection;
|
||||
# If necessary, you can also manually force denial of RW requests.
|
||||
quotaAndLimits:
|
||||
enabled: true # `true` to enable quota and limits, `false` to disable.
|
||||
# quotaCenterCollectInterval is the time interval that quotaCenter
|
||||
# collects metrics from Proxies, Query cluster and Data cluster.
|
||||
# seconds, (0 ~ 65536)
|
||||
quotaCenterCollectInterval: 3
|
||||
ddl:
|
||||
enabled: false
|
||||
collectionRate: -1 # qps, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
|
||||
partitionRate: -1 # qps, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
|
||||
db:
|
||||
collectionRate: -1 # qps of db level, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
|
||||
partitionRate: -1 # qps of db level, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
|
||||
indexRate:
|
||||
enabled: false
|
||||
max: -1 # qps, default no limit, rate for CreateIndex, DropIndex
|
||||
db:
|
||||
max: -1 # qps of db level, default no limit, rate for CreateIndex, DropIndex
|
||||
flushRate:
|
||||
enabled: true
|
||||
max: -1 # qps, default no limit, rate for flush
|
||||
collection:
|
||||
max: 0.1 # qps, rate limit for flush at collection level.
|
||||
db:
|
||||
max: -1 # qps of db level, default no limit, rate for flush
|
||||
compactionRate:
|
||||
enabled: false
|
||||
max: -1 # qps, default no limit, rate for manualCompaction
|
||||
db:
|
||||
max: -1 # qps of db level, default no limit, rate for manualCompaction
|
||||
dml:
|
||||
# dml limit rates, default no limit.
|
||||
# The maximum rate will not be greater than max.
|
||||
enabled: false
|
||||
insertRate:
|
||||
max: -1 # MB/s, default no limit
|
||||
db:
|
||||
max: -1 # MB/s, default no limit
|
||||
collection:
|
||||
max: -1 # MB/s, default no limit
|
||||
partition:
|
||||
max: -1 # MB/s, default no limit
|
||||
upsertRate:
|
||||
max: -1 # MB/s, default no limit
|
||||
db:
|
||||
max: -1 # MB/s, default no limit
|
||||
collection:
|
||||
max: -1 # MB/s, default no limit
|
||||
partition:
|
||||
max: -1 # MB/s, default no limit
|
||||
deleteRate:
|
||||
max: -1 # MB/s, default no limit
|
||||
db:
|
||||
max: -1 # MB/s, default no limit
|
||||
collection:
|
||||
max: -1 # MB/s, default no limit
|
||||
partition:
|
||||
max: -1 # MB/s, default no limit
|
||||
bulkLoadRate:
|
||||
max: -1 # MB/s, default no limit, not supported yet. TODO: limit bulkLoad rate
|
||||
db:
|
||||
max: -1 # MB/s, default no limit, not supported yet. TODO: limit db bulkLoad rate
|
||||
collection:
|
||||
max: -1 # MB/s, default no limit, not supported yet. TODO: limit collection bulkLoad rate
|
||||
partition:
|
||||
max: -1 # MB/s, default no limit, not supported yet. TODO: limit partition bulkLoad rate
|
||||
dql:
|
||||
# dql limit rates, default no limit.
|
||||
# The maximum rate will not be greater than max.
|
||||
enabled: false
|
||||
searchRate:
|
||||
max: -1 # vps (vectors per second), default no limit
|
||||
db:
|
||||
max: -1 # vps (vectors per second), default no limit
|
||||
collection:
|
||||
max: -1 # vps (vectors per second), default no limit
|
||||
partition:
|
||||
max: -1 # vps (vectors per second), default no limit
|
||||
queryRate:
|
||||
max: -1 # qps, default no limit
|
||||
db:
|
||||
max: -1 # qps, default no limit
|
||||
collection:
|
||||
max: -1 # qps, default no limit
|
||||
partition:
|
||||
max: -1 # qps, default no limit
|
||||
limits:
|
||||
maxCollectionNum: 65536
|
||||
maxCollectionNumPerDB: 65536
|
||||
maxInsertSize: -1 # maximum size of a single insert request, in bytes, -1 means no limit
|
||||
maxResourceGroupNumOfQueryNode: 1024 # maximum number of resource groups of query nodes
|
||||
limitWriting:
|
||||
# forceDeny false means dml requests are allowed (except for some
|
||||
# specific conditions, such as node memory reaching the water mark), true means always reject all dml requests.
|
||||
forceDeny: false
|
||||
ttProtection:
|
||||
enabled: false
|
||||
# maxTimeTickDelay indicates the backpressure for DML Operations.
|
||||
# DML rates would be reduced according to the ratio of time tick delay to maxTimeTickDelay,
|
||||
# if time tick delay is greater than maxTimeTickDelay, all DML requests would be rejected.
|
||||
# seconds
|
||||
maxTimeTickDelay: 300
|
||||
memProtection:
|
||||
# When memory usage > memoryHighWaterLevel, all dml requests would be rejected;
|
||||
# When memoryLowWaterLevel < memory usage < memoryHighWaterLevel, reduce the dml rate;
|
||||
# When memory usage < memoryLowWaterLevel, no action.
|
||||
enabled: true
|
||||
dataNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in DataNodes
|
||||
dataNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in DataNodes
|
||||
queryNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in QueryNodes
|
||||
queryNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in QueryNodes
|
||||
growingSegmentsSizeProtection:
|
||||
# No action will be taken if the growing segments size is less than the low watermark.
|
||||
# When the growing segments size exceeds the low watermark, the dml rate will be reduced,
|
||||
# but the rate will not be lower than minRateRatio * dmlRate.
|
||||
enabled: false
|
||||
minRateRatio: 0.5
|
||||
lowWaterLevel: 0.2
|
||||
highWaterLevel: 0.4
|
||||
diskProtection:
|
||||
enabled: true # When the total file size of object storage is greater than `diskQuota`, all dml requests would be rejected;
|
||||
diskQuota: -1 # MB, (0, +inf), default no limit
|
||||
diskQuotaPerDB: -1 # MB, (0, +inf), default no limit
|
||||
diskQuotaPerCollection: -1 # MB, (0, +inf), default no limit
|
||||
diskQuotaPerPartition: -1 # MB, (0, +inf), default no limit
|
||||
limitReading:
|
||||
# forceDeny false means dql requests are allowed (except for some
|
||||
# specific conditions, such as collection has been dropped), true means always reject all dql requests.
|
||||
forceDeny: false
|
||||
queueProtection:
|
||||
enabled: false
|
||||
# nqInQueueThreshold indicates that the system is under backpressure for the Search/Query path.
|
||||
# If NQ in any QueryNode's queue is greater than nqInQueueThreshold, search&query rates would gradually cool off
|
||||
# until the NQ in the queue no longer exceeds nqInQueueThreshold. We treat the NQ of a query request as 1.
|
||||
# int, default no limit
|
||||
nqInQueueThreshold: -1
|
||||
# queueLatencyThreshold indicates that the system is under backpressure for the Search/Query path.
|
||||
# If dql latency of queuing is greater than queueLatencyThreshold, search&query rates would gradually cool off
|
||||
# until the latency of queuing no longer exceeds queueLatencyThreshold.
|
||||
# The latency here refers to the averaged latency over a period of time.
|
||||
# milliseconds, default no limit
|
||||
queueLatencyThreshold: -1
|
||||
resultProtection:
|
||||
enabled: false
|
||||
# maxReadResultRate indicates that the system is under backpressure for the Search/Query path.
|
||||
# If dql result rate is greater than maxReadResultRate, search&query rates would gradually cool off
|
||||
# until the read result rate no longer exceeds maxReadResultRate.
|
||||
# MB/s, default no limit
|
||||
maxReadResultRate: -1
|
||||
maxReadResultRatePerDB: -1
|
||||
maxReadResultRatePerCollection: -1
|
||||
# coolOffSpeed is the speed at which search & query rates cool off.
|
||||
# (0, 1]
|
||||
coolOffSpeed: 0.9
|
||||
|
||||
trace:
|
||||
# trace exporter type, default is stdout,
|
||||
# optional values: ['noop','stdout', 'jaeger', 'otlp']
|
||||
exporter: noop
|
||||
# fraction of traceID based sampler,
|
||||
# optional values: [0, 1]
|
||||
# Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
|
||||
sampleFraction: 0
|
||||
jaeger:
|
||||
url: # when exporter is jaeger should set the jaeger's URL
|
||||
otlp:
|
||||
endpoint: # example: "127.0.0.1:4318"
|
||||
secure: true
|
||||
|
||||
#when using GPU indexing, Milvus will utilize a memory pool to avoid frequent memory allocation and deallocation.
|
||||
#here, you can set the size of the memory occupied by the memory pool, with the unit being MB.
|
||||
#note that there is a possibility of Milvus crashing when the actual memory demand exceeds the value set by maxMemSize.
|
||||
#if initMemSize and maxMemSize are both set to zero,
|
||||
#milvus will automatically initialize half of the available GPU memory,
|
||||
#and maxMemSize will be set to the whole available GPU memory.
|
||||
gpu:
|
||||
initMemSize: # Gpu Memory Pool init size
|
||||
maxMemSize: # Gpu Memory Pool Max size
|
||||
@@ -14,3 +14,7 @@ export INDEX_NAME="rag-redis"
|
||||
# Set it as a non-null string, such as true, if you want to enable logging facility,
|
||||
# otherwise, keep it as "" to disable it.
|
||||
export LOGFLAG=""
|
||||
# Set OpenTelemetry Tracing Endpoint
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
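# Illustrative check (not part of the original set_env.sh): echo the derived endpoints
# so a typo in JAEGER_IP is caught before the stack is started.
echo "OTLP gRPC endpoint:   $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
echo "HTTP traces endpoint: $TELEMETRY_ENDPOINT"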
|
||||
|
||||
@@ -10,7 +10,7 @@ Quick Start:
|
||||
2. Run Docker Compose.
|
||||
3. Consume the ChatQnA Service.
|
||||
|
||||
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
|
||||
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). We now support running the latest DeepSeek models, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on Gaudi accelerators. To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script, as sketched below. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 4 in the [set_env.sh](./set_env.sh) script.
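
For instance, a minimal sketch of the overrides for the 70B distill model (the model ID and card count come from the note above; alternatively, edit [set_env.sh](./set_env.sh) directly to the same effect):

```bash
# Hypothetical shell overrides; editing set_env.sh achieves the same result
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
export NUM_CARDS=8
```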
|
||||
|
||||
## Quick Start: 1. Setup Environment Variables
|
||||
|
||||
@@ -39,12 +39,37 @@ To set up environment variables for deploying ChatQnA services, follow these ste
|
||||
source ./set_env.sh
|
||||
```
|
||||
|
||||
4. Change Model for LLM serving
|
||||
|
||||
By default, Meta-Llama-3-8B-Instruct is used for LLM serving; the default model can be changed to other validated LLM models.
|
||||
Please pick a [validated LLM model](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
|
||||
To change the default model defined in set_env.sh, overwrite it by exporting LLM_MODEL_ID to the new model or by modifying set_env.sh, and then repeat step 3.
|
||||
For example, change to DeepSeek-R1-Distill-Qwen-32B using the following command.
|
||||
|
||||
```bash
|
||||
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
|
||||
```
|
||||
|
||||
Please also check the [required Gaudi cards for different models](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#system-requirements-for-llm-models) for new models.
|
||||
It might be necessary to increase the number of Gaudi cards for the model by exporting NUM_CARDS with the new value or by modifying set_env.sh, and then repeating step 3. For example, increase the number of Gaudi cards for DeepSeek-R1-Distill-Qwen-32B using the following command:
|
||||
|
||||
```bash
|
||||
export NUM_CARDS=4
|
||||
```
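
If you changed set_env.sh itself rather than exporting the variables in your shell, re-run step 3 so the new values are picked up (a sketch of step 3, as shown earlier):

```bash
source ./set_env.sh
```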
|
||||
|
||||
## Quick Start: 2. Run Docker Compose
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
To enable OpenTelemetry tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
|
||||
|
||||
```bash
|
||||
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
|
||||
```
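
Once the containers are up, a quick way to confirm tracing is wired end to end (a minimal sketch; it assumes Jaeger's default UI port 16686 on the deployment host, as exposed by the jaeger service later in this document):

```bash
# Expect a successful response from the Jaeger UI; traces from the ChatQnA services should appear there
curl -sf "http://${host_ip:-localhost}:16686" > /dev/null && echo "Jaeger UI reachable"
```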
|
||||
|
||||
It will automatically pull the required Docker images from Docker Hub:
|
||||
|
||||
```bash
|
||||
@@ -259,12 +284,16 @@ If use vLLM as the LLM serving backend.
|
||||
docker compose -f compose.yaml up -d
|
||||
# Start ChatQnA without Rerank Pipeline
|
||||
docker compose -f compose_without_rerank.yaml up -d
|
||||
# Start ChatQnA with Rerank Pipeline and Open Telemetry Tracing
|
||||
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
|
||||
```
|
||||
|
||||
If using TGI as the LLM serving backend.
|
||||
|
||||
```bash
|
||||
docker compose -f compose_tgi.yaml up -d
|
||||
# Start ChatQnA with Open Telemetry Tracing
|
||||
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d
|
||||
```
|
||||
|
||||
If you want to enable the guardrails microservice in the pipeline, please use the command below instead:
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
tei-embedding-service:
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
tei-reranking-service:
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
container_name: jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
- "4317:4317"
|
||||
- "4318:4318"
|
||||
- "9411:9411"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
COLLECTOR_ZIPKIN_HOST_PORT: 9411
|
||||
restart: unless-stopped
|
||||
chatqna-gaudi-backend-server:
|
||||
environment:
|
||||
- ENABLE_OPEA_TELEMETRY=true
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
@@ -31,7 +31,7 @@ services:
|
||||
ports:
|
||||
- "8090:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -62,7 +62,7 @@ services:
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
@@ -83,7 +83,7 @@ services:
|
||||
ports:
|
||||
- "8007:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
@@ -92,6 +92,7 @@ services:
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
NUM_CARDS: ${NUM_CARDS}
|
||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
|
||||
@@ -102,7 +103,7 @@ services:
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
chatqna-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-backend-server
|
||||
|
||||
@@ -31,7 +31,7 @@ services:
|
||||
ports:
|
||||
- "8088:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
@@ -70,7 +70,7 @@ services:
|
||||
ports:
|
||||
- "8090:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -103,7 +103,7 @@ services:
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
@@ -124,7 +124,7 @@ services:
|
||||
ports:
|
||||
- "8008:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
@@ -133,12 +133,13 @@ services:
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
NUM_CARDS: ${NUM_CARDS}
|
||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
chatqna-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-guardrails-server
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
tei-embedding-service:
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
tei-reranking-service:
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
tgi-service:
|
||||
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
container_name: jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
- "4317:4317"
|
||||
- "4318:4318"
|
||||
- "9411:9411"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
COLLECTOR_ZIPKIN_HOST_PORT: 9411
|
||||
restart: unless-stopped
|
||||
chatqna-gaudi-backend-server:
|
||||
environment:
|
||||
- ENABLE_OPEA_TELEMETRY=true
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
@@ -25,20 +25,19 @@ services:
|
||||
INDEX_NAME: ${INDEX_NAME}
|
||||
TEI_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT}
|
||||
tei-embedding-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: tei-embedding-gaudi-server
|
||||
ports:
|
||||
- "8090:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
|
||||
retriever:
|
||||
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
|
||||
container_name: retriever-redis-server
|
||||
@@ -56,7 +55,6 @@ services:
|
||||
INDEX_NAME: ${INDEX_NAME}
|
||||
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT}
|
||||
LOGFLAG: ${LOGFLAG}
|
||||
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
|
||||
restart: unless-stopped
|
||||
@@ -66,7 +64,7 @@ services:
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
@@ -80,14 +78,14 @@ services:
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
MAX_WARMUP_SEQUENCE_LENGTH: 512
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
|
||||
tgi-service:
|
||||
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
|
||||
container_name: tgi-gaudi-server
|
||||
ports:
|
||||
- "8005:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
@@ -101,26 +99,12 @@ services:
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
NUM_CARDS: ${NUM_CARDS}
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
container_name: jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
- "4317:4317"
|
||||
- "4318:4318"
|
||||
- "9411:9411"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
COLLECTOR_ZIPKIN_HOST_PORT: 9411
|
||||
restart: unless-stopped
|
||||
command: --model-id ${LLM_MODEL_ID} --num-shard ${NUM_CARDS} --max-input-length 2048 --max-total-tokens 4096
|
||||
chatqna-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-backend-server
|
||||
@@ -146,7 +130,6 @@ services:
|
||||
- LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
|
||||
- LLM_MODEL=${LLM_MODEL_ID}
|
||||
- LOGFLAG=${LOGFLAG}
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-gaudi-ui-server:
|
||||
|
||||
@@ -31,7 +31,7 @@ services:
|
||||
ports:
|
||||
- "8090:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
@@ -64,7 +64,7 @@ services:
|
||||
ports:
|
||||
- "8007:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
@@ -73,12 +73,13 @@ services:
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
NUM_CARDS: ${NUM_CARDS}
|
||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
chatqna-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-backend-server
|
||||
|
||||
@@ -11,6 +11,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export INDEX_NAME="rag-redis"
|
||||
export NUM_CARDS=1
|
||||
# Set it as a non-null string, such as true, if you want to enable logging facility,
|
||||
# otherwise, keep it as "" to disable it.
|
||||
export LOGFLAG=""
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,7 +30,6 @@ function build_docker_images() {
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
|
||||
sed -i 's/triton/triton==3.1.0/g' vllm-fork/requirements-hpu.txt
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna-guardrails chatqna-ui dataprep retriever vllm-gaudi guardrails nginx"
|
||||
@@ -47,6 +47,7 @@ function start_services() {
|
||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export NUM_CARDS=1
|
||||
export INDEX_NAME="rag-redis"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"
|
||||
|
||||
249
ChatQnA/tests/test_compose_milvus_on_xeon.sh
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -e
|
||||
IMAGE_REPO=${IMAGE_REPO:-"opea"}
|
||||
IMAGE_TAG=${IMAGE_TAG:-"latest"}
|
||||
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
ip_address=$(hostname -I | awk '{print $1}')
|
||||
export host_ip=$(hostname -I | awk '{print $1}')
|
||||
|
||||
function build_docker_images() {
|
||||
opea_branch=${opea_branch:-"main"}
|
||||
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
|
||||
if [[ "${opea_branch}" != "main" ]]; then
|
||||
cd $WORKPATH
|
||||
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
|
||||
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
|
||||
find . -type f -name "Dockerfile*" | while read -r file; do
|
||||
echo "Processing file: $file"
|
||||
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
|
||||
done
|
||||
fi
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone https://github.com/vllm-project/vllm.git && cd vllm
|
||||
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
|
||||
echo "Check out vLLM tag ${VLLM_VER}"
|
||||
git checkout ${VLLM_VER} &> /dev/null
|
||||
# make sure NOT to change the pwd
|
||||
cd ../
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
|
||||
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
|
||||
|
||||
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
|
||||
docker images && sleep 1s
|
||||
}
|
||||
function start_services() {
|
||||
cd $WORKPATH/docker_compose/intel/cpu/xeon/
|
||||
export no_proxy=${no_proxy},${ip_address}
|
||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export LOGFLAG=true
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose -f compose_milvus.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]]; do
|
||||
docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
|
||||
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
|
||||
break
|
||||
fi
|
||||
sleep 5s
|
||||
n=$((n+1))
|
||||
done
|
||||
}
|
||||
|
||||
function validate_service() {
|
||||
local URL="$1"
|
||||
local EXPECTED_RESULT="$2"
|
||||
local SERVICE_NAME="$3"
|
||||
local DOCKER_NAME="$4"
|
||||
local INPUT_DATA="$5"
|
||||
|
||||
if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
|
||||
cd $LOG_PATH
|
||||
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
|
||||
elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
|
||||
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
|
||||
else
|
||||
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
|
||||
fi
|
||||
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
|
||||
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
|
||||
|
||||
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
|
||||
|
||||
|
||||
# check response status
|
||||
if [ "$HTTP_STATUS" -ne "200" ]; then
|
||||
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
|
||||
exit 1
|
||||
else
|
||||
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
|
||||
fi
|
||||
echo "Response"
|
||||
echo $RESPONSE_BODY
|
||||
echo "Expected Result"
|
||||
echo $EXPECTED_RESULT
|
||||
# check response body
|
||||
if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
|
||||
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
|
||||
exit 1
|
||||
else
|
||||
echo "[ $SERVICE_NAME ] Content is as expected."
|
||||
fi
|
||||
|
||||
sleep 1s
|
||||
}
|
||||
|
||||
function validate_microservices() {
|
||||
# Check if the microservices are running correctly.
|
||||
|
||||
# tei for embedding service
|
||||
validate_service \
|
||||
"${ip_address}:6006/embed" \
|
||||
"[[" \
|
||||
"tei-embedding" \
|
||||
"tei-embedding-server" \
|
||||
'{"inputs":"What is Deep Learning?"}'
|
||||
|
||||
sleep 1m # retrieval can't curl as expected, try to wait for more time
|
||||
|
||||
# test /v1/dataprep/ingest upload file
|
||||
echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
|
||||
validate_service \
|
||||
"http://${ip_address}:11101/v1/dataprep/ingest" \
|
||||
"Data preparation succeeded" \
|
||||
"dataprep_upload_file" \
|
||||
"dataprep-milvus-server"
|
||||
|
||||
# test /v1/dataprep/delete
|
||||
validate_service \
|
||||
"http://${ip_address}:11101/v1/dataprep/delete" \
|
||||
'{"status":true}' \
|
||||
"dataprep_del" \
|
||||
"dataprep-milvus-server"
|
||||
|
||||
# test /v1/dataprep/delete
|
||||
validate_service \
|
||||
"http://${ip_address}:11101/v1/dataprep/delete" \
|
||||
'{"status":true}' \
|
||||
"dataprep_del" \
|
||||
"dataprep-milvus-server"
|
||||
|
||||
|
||||
# retrieval microservice
|
||||
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
|
||||
validate_service \
|
||||
"${ip_address}:7000/v1/retrieval" \
|
||||
" " \
|
||||
"retrieval" \
|
||||
"retriever-milvus-server" \
|
||||
"{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"
|
||||
|
||||
# tei for rerank microservice
|
||||
echo "Validating reranking service"
|
||||
validate_service \
|
||||
"${ip_address}:8808/rerank" \
|
||||
'{"index":1,"score":' \
|
||||
"tei-rerank" \
|
||||
"tei-reranking-server" \
|
||||
'{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}'
|
||||
|
||||
|
||||
# vllm for llm service
|
||||
echo "Validating llm service"
|
||||
validate_service \
|
||||
"${ip_address}:9009/v1/chat/completions" \
|
||||
"content" \
|
||||
"vllm-llm" \
|
||||
"vllm-service" \
|
||||
'{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
|
||||
}
|
||||
|
||||
function validate_megaservice() {
|
||||
# Curl the Mega Service
|
||||
validate_service \
|
||||
"${ip_address}:8888/v1/chatqna" \
|
||||
"data: " \
|
||||
"chatqna-megaservice" \
|
||||
"chatqna-xeon-backend-server" \
|
||||
'{"messages": "What is the revenue of Nike in 2023?"}'
|
||||
|
||||
}
|
||||
|
||||
function validate_frontend() {
|
||||
echo "[ TEST INFO ]: --------- frontend test started ---------"
|
||||
cd $WORKPATH/ui/svelte
|
||||
local conda_env_name="OPEA_e2e"
|
||||
export PATH=${HOME}/miniforge3/bin/:$PATH
|
||||
if conda info --envs | grep -q "$conda_env_name"; then
|
||||
echo "$conda_env_name exist!"
|
||||
else
|
||||
conda create -n ${conda_env_name} python=3.12 -y
|
||||
fi
|
||||
source activate ${conda_env_name}
|
||||
echo "[ TEST INFO ]: --------- conda env activated ---------"
|
||||
|
||||
sed -i "s/localhost/$ip_address/g" playwright.config.ts
|
||||
|
||||
conda install -c conda-forge nodejs=22.6.0 -y
|
||||
npm install && npm ci && npx playwright install --with-deps
|
||||
node -v && npm -v && pip list
|
||||
|
||||
exit_status=0
|
||||
npx playwright test || exit_status=$?
|
||||
|
||||
if [ $exit_status -ne 0 ]; then
|
||||
echo "[TEST INFO]: ---------frontend test failed---------"
|
||||
exit $exit_status
|
||||
else
|
||||
echo "[TEST INFO]: ---------frontend test passed---------"
|
||||
fi
|
||||
}
|
||||
|
||||
function stop_docker() {
|
||||
echo "In stop docker"
|
||||
echo $WORKPATH
|
||||
cd $WORKPATH/docker_compose/intel/cpu/xeon/
|
||||
docker compose -f compose_milvus.yaml down
|
||||
}
|
||||
|
||||
function main() {
|
||||
|
||||
stop_docker
|
||||
|
||||
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
|
||||
|
||||
start_time=$(date +%s)
|
||||
start_services
|
||||
end_time=$(date +%s)
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s" && sleep 1s
|
||||
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
}
|
||||
|
||||
main
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,7 +30,6 @@ function build_docker_images() {
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
|
||||
sed -i 's/triton/triton==3.1.0/g' vllm-fork/requirements-hpu.txt
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna chatqna-ui dataprep retriever vllm-gaudi nginx"
|
||||
@@ -45,12 +45,16 @@ function start_services() {
|
||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export NUM_CARDS=1
|
||||
export INDEX_NAME="rag-redis"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export host_ip=${ip_address}
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
docker compose -f compose.yaml -f compose.telemetry.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
n=0
|
||||
until [[ "$n" -ge 160 ]]; do
|
||||
echo "n=$n"
|
||||
@@ -134,7 +138,7 @@ function validate_megaservice() {
|
||||
# Curl the Mega Service
|
||||
validate_service \
|
||||
"${ip_address}:8888/v1/chatqna" \
|
||||
"data:" \
|
||||
"Nike" \
|
||||
"mega-chatqna" \
|
||||
"chatqna-gaudi-backend-server" \
|
||||
'{"messages": "What is the revenue of Nike in 2023?"}'
|
||||
@@ -171,7 +175,7 @@ function validate_frontend() {
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
||||
docker compose -f compose.yaml down
|
||||
docker compose -f compose.yaml -f compose.telemetry.yaml down
|
||||
}
|
||||
|
||||
function main() {
|
||||
@@ -186,7 +190,7 @@ function main() {
|
||||
|
||||
validate_microservices
|
||||
validate_megaservice
|
||||
# validate_frontend
|
||||
validate_frontend
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"/var/opea/chatqna-service/data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -45,7 +46,7 @@ export CHATQNA_RERANK_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_LLM_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_NGINX_PORT=80
|
||||
export CHATQNA_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export PATH="/home/huggingface/miniconda3/bin:$PATH"
|
||||
export PATH="~/miniconda3/bin:$PATH"
|
||||
|
||||
function build_docker_images() {
|
||||
opea_branch=${opea_branch:-"main"}
|
||||
@@ -207,7 +208,7 @@ function validate_megaservice() {
|
||||
# Curl the Mega Service
|
||||
validate_service \
|
||||
"${ip_address}:8888/v1/chatqna" \
|
||||
"data: " \
|
||||
"Nike" \
|
||||
"chatqna-megaservice" \
|
||||
"chatqna-backend-server" \
|
||||
'{"messages": "What is the revenue of Nike in 2023?"}'
|
||||
@@ -259,17 +260,12 @@ function main() {
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s" && sleep 1s
|
||||
|
||||
|
||||
if [ "${mode}" == "perf" ]; then
|
||||
python3 "$WORKPATH"/tests/chatqna_benchmark.py
|
||||
elif [ "${mode}" == "" ]; then
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
validate_frontend
|
||||
echo "==== frontend validated ===="
|
||||
fi
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
validate_frontend
|
||||
echo "==== frontend validated ===="
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,7 +30,12 @@ function build_docker_images() {
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm.git
|
||||
git clone https://github.com/vllm-project/vllm.git && cd vllm
|
||||
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
|
||||
echo "Check out vLLM tag ${VLLM_VER}"
|
||||
git checkout ${VLLM_VER} &> /dev/null
|
||||
# make sure not to change the pwd
|
||||
cd ../
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
|
||||
@@ -49,9 +55,12 @@ function start_services() {
|
||||
export INDEX_NAME="rag-redis"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export host_ip=${ip_address}
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
docker compose -f compose.yaml -f compose.telemetry.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]]; do
|
||||
docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
|
||||
@@ -93,6 +102,7 @@ function validate_service() {
|
||||
|
||||
function validate_microservices() {
|
||||
# Check if the microservices are running correctly.
|
||||
sleep 3m
|
||||
|
||||
# tei for embedding service
|
||||
validate_service \
|
||||
@@ -102,8 +112,6 @@ function validate_microservices() {
|
||||
"tei-embedding-server" \
|
||||
'{"inputs":"What is Deep Learning?"}'
|
||||
|
||||
sleep 1m # retrieval can't curl as expected, try to wait for more time
|
||||
|
||||
# retrieval microservice
|
||||
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
|
||||
validate_service \
|
||||
@@ -134,7 +142,7 @@ function validate_megaservice() {
|
||||
# Curl the Mega Service
|
||||
validate_service \
|
||||
"${ip_address}:8888/v1/chatqna" \
|
||||
"data" \
|
||||
"Nike" \
|
||||
"mega-chatqna" \
|
||||
"chatqna-xeon-backend-server" \
|
||||
'{"messages": "What is the revenue of Nike in 2023?"}'
|
||||
@@ -172,7 +180,7 @@ function validate_frontend() {
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/cpu/xeon
|
||||
docker compose -f compose.yaml down
|
||||
docker compose -f compose.yaml -f compose.telemetry.yaml down
|
||||
}
|
||||
|
||||
function main() {
|
||||
@@ -185,13 +193,9 @@ function main() {
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s" && sleep 1s
|
||||
|
||||
if [ "${mode}" == "perf" ]; then
|
||||
python3 $WORKPATH/tests/chatqna_benchmark.py
|
||||
elif [ "${mode}" == "" ]; then
|
||||
validate_microservices
|
||||
validate_megaservice
|
||||
# validate_frontend
|
||||
fi
|
||||
validate_microservices
|
||||
validate_megaservice
|
||||
validate_frontend
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,7 +30,12 @@ function build_docker_images() {
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm.git
|
||||
git clone https://github.com/vllm-project/vllm.git && cd vllm
|
||||
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
|
||||
echo "Check out vLLM tag ${VLLM_VER}"
|
||||
git checkout ${VLLM_VER} &> /dev/null
|
||||
# do not change the pwd
|
||||
cd ../
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
|
||||
@@ -228,14 +234,10 @@ function main() {
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s" && sleep 1s
|
||||
|
||||
if [ "${mode}" == "perf" ]; then
|
||||
python3 $WORKPATH/tests/chatqna_benchmark.py
|
||||
elif [ "${mode}" == "" ]; then
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
fi
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,7 +30,12 @@ function build_docker_images() {
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm.git
|
||||
git clone https://github.com/vllm-project/vllm.git && cd vllm
|
||||
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
|
||||
echo "Check out vLLM tag ${VLLM_VER}"
|
||||
git checkout ${VLLM_VER} &> /dev/null
|
||||
# do not change the pwd
|
||||
cd ../
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -46,6 +47,7 @@ function start_services() {
|
||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export NUM_CARDS=1
|
||||
export INDEX_NAME="rag-redis"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
@@ -53,7 +55,7 @@ function start_services() {
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
|
||||
n=0
|
||||
until [[ "$n" -ge 500 ]]; do
|
||||
@@ -217,7 +219,7 @@ function validate_frontend() {
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
||||
docker compose -f compose_tgi.yaml down
|
||||
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml down
|
||||
}
|
||||
|
||||
function main() {
|
||||
@@ -230,13 +232,9 @@ function main() {
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s"
|
||||
|
||||
if [ "${mode}" == "perf" ]; then
|
||||
python3 $WORKPATH/tests/chatqna_benchmark.py
|
||||
elif [ "${mode}" == "" ]; then
|
||||
validate_microservices
|
||||
validate_megaservice
|
||||
validate_frontend
|
||||
fi
|
||||
validate_microservices
|
||||
validate_megaservice
|
||||
validate_frontend
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -48,9 +49,12 @@ function start_services() {
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export INDEX_NAME="rag-redis"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]]; do
|
||||
@@ -216,7 +220,7 @@ function validate_frontend() {
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/cpu/xeon
|
||||
docker compose -f compose_tgi.yaml down
|
||||
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml down
|
||||
}
|
||||
|
||||
function main() {
|
||||
@@ -229,16 +233,12 @@ function main() {
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s" && sleep 1s
|
||||
|
||||
if [ "${mode}" == "perf" ]; then
|
||||
python3 $WORKPATH/tests/chatqna_benchmark.py
|
||||
elif [ "${mode}" == "" ]; then
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
validate_frontend
|
||||
echo "==== frontend validated ===="
|
||||
fi
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
validate_frontend
|
||||
echo "==== frontend validated ===="
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,7 +30,6 @@ function build_docker_images() {
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
|
||||
sed -i 's/triton/triton==3.1.0/g' vllm-fork/requirements-hpu.txt
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna-without-rerank chatqna-ui dataprep retriever vllm-gaudi nginx"
|
||||
@@ -45,6 +45,7 @@ function start_services() {
|
||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export NUM_CARDS=1
|
||||
export INDEX_NAME="rag-redis"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
|
||||
@@ -218,13 +219,9 @@ function main() {
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s"
|
||||
|
||||
if [ "${mode}" == "perf" ]; then
|
||||
python3 $WORKPATH/tests/chatqna_benchmark.py
|
||||
elif [ "${mode}" == "" ]; then
|
||||
validate_microservices
|
||||
validate_megaservice
|
||||
validate_frontend
|
||||
fi
|
||||
validate_microservices
|
||||
validate_megaservice
|
||||
validate_frontend
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,7 +30,13 @@ function build_docker_images() {
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm.git
|
||||
git clone https://github.com/vllm-project/vllm.git && cd vllm
|
||||
# Get the latest tag
|
||||
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
|
||||
echo "Check out vLLM tag ${VLLM_VER}"
|
||||
git checkout ${VLLM_VER} &> /dev/null
|
||||
# do not change the pwd
|
||||
cd ../
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna-without-rerank chatqna-ui dataprep retriever vllm nginx"
|
||||
@@ -219,16 +226,12 @@ function main() {
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s" && sleep 1s
|
||||
|
||||
if [ "${mode}" == "perf" ]; then
|
||||
python3 $WORKPATH/tests/chatqna_benchmark.py
|
||||
elif [ "${mode}" == "" ]; then
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
validate_frontend
|
||||
echo "==== frontend validated ===="
|
||||
fi
|
||||
validate_microservices
|
||||
echo "==== microservices validated ===="
|
||||
validate_megaservice
|
||||
echo "==== megaservice validated ===="
|
||||
validate_frontend
|
||||
echo "==== frontend validated ===="
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
@@ -8,7 +8,7 @@ services:
|
||||
ports:
|
||||
- "8028:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
|
||||
@@ -8,7 +8,7 @@ services:
|
||||
ports:
|
||||
- "8028:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
|
||||
@@ -2,6 +2,8 @@

This document outlines the deployment process for a CodeTrans application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution using the `llm` microservice. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.

The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page.

## 🚀 Create an AWS Xeon Instance

To run the example on an AWS Xeon instance, start by creating an AWS account if you don't have one already. Then, get started with the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home). AWS EC2 M7i, C7i, C7i-flex and M7i-flex instances are based on Intel Xeon Scalable processors (code named Sapphire Rapids) and are suitable for the task.
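If you prefer the command line over the EC2 Console, a minimal AWS CLI sketch is shown below. This is an illustration only: the AMI ID, key pair, and security group are placeholders you must replace with your own values.

```bash
# Hypothetical example: launch an M7i (Sapphire Rapids) Xeon instance with the AWS CLI.
# ami-xxxxxxxxxxxxxxxxx, my-key-pair, and sg-xxxxxxxx are placeholders.
aws ec2 run-instances \
  --image-id ami-xxxxxxxxxxxxxxxxx \
  --instance-type m7i.4xlarge \
  --key-name my-key-pair \
  --security-group-ids sg-xxxxxxxx \
  --count 1
```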
@@ -63,6 +65,37 @@ By default, the LLM model is set to a default value as listed below:

Change the `LLM_MODEL_ID` below for your needs.

Users in China who are unable to download models directly from Huggingface can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. vLLM/TGI can load the models either online or offline, as described below:

1. Online

```bash
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="mistralai/Mistral-7B-Instruct-v0.3"
# Start vLLM LLM Service
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```

2. Offline

- Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for model `mistralai/Mistral-7B-Instruct-v0.3`.

- Click the `Download this model` button, and choose one way to download the model to your local path `/path/to/model`.

- Run the following command to start the LLM service.

```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```

### Setup Environment Variables

1. Set the required environment variables:
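The full variable list is elided from this diff; as a minimal, hedged sketch, these are the variable names used by `set_env.sh` and the test scripts in this PR (the values below are placeholders):

```bash
# Sketch only: names taken from set_env.sh in this PR; values are placeholders.
export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN="your_hf_token"
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7777/v1/codetrans"
```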
@@ -95,15 +128,47 @@ Change the `LLM_MODEL_ID` below for your needs.

```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/cpu/xeon
docker compose up -d
```

If using vLLM as the LLM serving backend:

```bash
docker compose -f compose.yaml up -d
```

If using TGI as the LLM serving backend:

```bash
docker compose -f compose_tgi.yaml up -d
```

### Validate Microservices

1. TGI Service
1. LLM backend Service

In the first startup, this service will take more time to download, load and warm up the model. After it finishes, the service will be ready.

Try the command below to check whether the LLM serving is ready.

```bash
curl http://${host_ip}:8008/generate \
# vLLM service
docker logs codetrans-xeon-vllm-service 2>&1 | grep complete
# If the service is ready, you will get a response like the one below.
INFO: Application startup complete.
```

```bash
# TGI service
docker logs codetrans-xeon-tgi-service | grep Connected
# If the service is ready, you will get a response like the one below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```

Then try the `cURL` command below to validate services.

```bash
# either vLLM or TGI service
curl http://${host_ip}:8008/v1/chat/completions \
  -X POST \
  -d '{"inputs":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \
  -H 'Content-Type: application/json'
```
@@ -2,31 +2,32 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
tgi-service:
|
||||
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
|
||||
container_name: codetrans-tgi-service
|
||||
vllm-service:
|
||||
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
|
||||
container_name: codetrans-xeon-vllm-service
|
||||
ports:
|
||||
- "8008:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
host_ip: ${host_ip}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 10s
|
||||
retries: 100
|
||||
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
|
||||
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
|
||||
llm:
|
||||
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
|
||||
container_name: llm-textgen-server
|
||||
container_name: codetrans-xeon-llm-server
|
||||
depends_on:
|
||||
tgi-service:
|
||||
vllm-service:
|
||||
condition: service_healthy
|
||||
ports:
|
||||
- "9000:9000"
|
||||
@@ -35,18 +36,19 @@ services:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
|
||||
LLM_ENDPOINT: ${LLM_ENDPOINT}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
|
||||
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
restart: unless-stopped
|
||||
codetrans-xeon-backend-server:
|
||||
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
|
||||
container_name: codetrans-xeon-backend-server
|
||||
depends_on:
|
||||
- tgi-service
|
||||
- vllm-service
|
||||
- llm
|
||||
ports:
|
||||
- "7777:7777"
|
||||
- "${BACKEND_SERVICE_PORT:-7777}:7777"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
@@ -61,7 +63,7 @@ services:
|
||||
depends_on:
|
||||
- codetrans-xeon-backend-server
|
||||
ports:
|
||||
- "5173:5173"
|
||||
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
|
||||
CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml (new file, 95 lines)
@@ -0,0 +1,95 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
tgi-service:
|
||||
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
|
||||
container_name: codetrans-xeon-tgi-service
|
||||
ports:
|
||||
- "8008:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
host_ip: ${host_ip}
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 10s
|
||||
retries: 100
|
||||
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
|
||||
llm:
|
||||
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
|
||||
container_name: codetrans-xeon-llm-server
|
||||
depends_on:
|
||||
tgi-service:
|
||||
condition: service_healthy
|
||||
ports:
|
||||
- "9000:9000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
LLM_ENDPOINT: ${LLM_ENDPOINT}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
restart: unless-stopped
|
||||
codetrans-xeon-backend-server:
|
||||
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
|
||||
container_name: codetrans-xeon-backend-server
|
||||
depends_on:
|
||||
- tgi-service
|
||||
- llm
|
||||
ports:
|
||||
- "${BACKEND_SERVICE_PORT:-7777}:7777"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
|
||||
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
|
||||
ipc: host
|
||||
restart: always
|
||||
codetrans-xeon-ui-server:
|
||||
image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
|
||||
container_name: codetrans-xeon-ui-server
|
||||
depends_on:
|
||||
- codetrans-xeon-backend-server
|
||||
ports:
|
||||
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- BASE_URL=${BACKEND_SERVICE_ENDPOINT}
|
||||
ipc: host
|
||||
restart: always
|
||||
codetrans-xeon-nginx-server:
|
||||
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
|
||||
container_name: codetrans-xeon-nginx-server
|
||||
depends_on:
|
||||
- codetrans-xeon-backend-server
|
||||
- codetrans-xeon-ui-server
|
||||
ports:
|
||||
- "${NGINX_PORT:-80}:80"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP}
|
||||
- FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT}
|
||||
- BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME}
|
||||
- BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP}
|
||||
- BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT}
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
@@ -2,6 +2,8 @@

This document outlines the deployment process for a CodeTrans application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution using the `llm` microservice. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.

The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page.
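Whichever backend you choose, both compose files in this example publish the serving container on the same host port, so the downstream `llm` microservice is configured the same way. A hedged sketch, assuming the defaults from `set_env.sh` in this PR:

```bash
# Assumption: compose.yaml (vLLM) and compose_tgi.yaml (TGI) both map the serving
# container to host port 8008, matching set_env.sh in this PR.
export LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
```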
## 🚀 Build Docker Images

First of all, you need to build the Docker images locally. This step can be skipped once the Docker images are published to Docker Hub.
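The exact build commands are elided from this diff. As a hedged sketch, the Gaudi test script in this PR builds the images from `docker_image_build/build.yaml` with the service list `codetrans codetrans-ui llm-textgen vllm-gaudi nginx`:

```bash
# Sketch based on the CI test script in this PR; adjust paths and the service list as needed.
cd GenAIExamples/CodeTrans/docker_image_build
git clone --depth 1 https://github.com/opea-project/GenAIComps.git
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
docker compose -f build.yaml build codetrans codetrans-ui llm-textgen vllm-gaudi nginx --no-cache
```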
@@ -55,6 +57,37 @@ By default, the LLM model is set to a default value as listed below:

Change the `LLM_MODEL_ID` below for your needs.

Users in China who are unable to download models directly from Huggingface can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. vLLM/TGI can load the models either online or offline, as described below:

1. Online

```bash
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="mistralai/Mistral-7B-Instruct-v0.3"
# Start vLLM LLM Service
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```

2. Offline

- Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for model `mistralai/Mistral-7B-Instruct-v0.3`.

- Click the `Download this model` button, and choose one way to download the model to your local path `/path/to/model`.

- Run the following command to start the LLM service.

```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```

### Setup Environment Variables

1. Set the required environment variables:

@@ -87,12 +120,43 @@ Change the `LLM_MODEL_ID` below for your needs.

```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/hpu/gaudi
docker compose up -d
```

If using vLLM as the LLM serving backend:

```bash
docker compose -f compose.yaml up -d
```

If using TGI as the LLM serving backend:

```bash
docker compose -f compose_tgi.yaml up -d
```

### Validate Microservices

1. TGI Service
1. LLM backend Service

In the first startup, this service will take more time to download, load and warm up the model. After it finishes, the service will be ready.

Try the command below to check whether the LLM serving is ready.

```bash
# vLLM service
docker logs codetrans-gaudi-vllm-service 2>&1 | grep complete
# If the service is ready, you will get a response like the one below.
INFO: Application startup complete.
```

```bash
# TGI service
docker logs codetrans-gaudi-tgi-service | grep Connected
# If the service is ready, you will get a response like the one below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```

Then try the `cURL` command below to validate services.

```bash
curl http://${host_ip}:8008/generate \
```
@@ -2,39 +2,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
tgi-service:
|
||||
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
|
||||
container_name: codetrans-tgi-service
|
||||
vllm-service:
|
||||
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
|
||||
container_name: codetrans-gaudi-vllm-service
|
||||
ports:
|
||||
- "8008:80"
|
||||
volumes:
|
||||
- "./data:/data"
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
ENABLE_HPU_GRAPH: true
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
NUM_CARDS: ${NUM_CARDS}
|
||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "sleep 500 && exit 0"]
|
||||
interval: 1s
|
||||
timeout: 505s
|
||||
retries: 1
|
||||
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 10s
|
||||
retries: 100
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
|
||||
command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
|
||||
llm:
|
||||
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
|
||||
container_name: llm-textgen-gaudi-server
|
||||
container_name: codetrans-xeon-llm-server
|
||||
depends_on:
|
||||
tgi-service:
|
||||
vllm-service:
|
||||
condition: service_healthy
|
||||
ports:
|
||||
- "9000:9000"
|
||||
@@ -43,18 +42,19 @@ services:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
|
||||
LLM_ENDPOINT: ${LLM_ENDPOINT}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
|
||||
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
restart: unless-stopped
|
||||
codetrans-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
|
||||
container_name: codetrans-gaudi-backend-server
|
||||
depends_on:
|
||||
- tgi-service
|
||||
- vllm-service
|
||||
- llm
|
||||
ports:
|
||||
- "7777:7777"
|
||||
- "${BACKEND_SERVICE_PORT:-7777}:7777"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
@@ -69,7 +69,7 @@ services:
|
||||
depends_on:
|
||||
- codetrans-gaudi-backend-server
|
||||
ports:
|
||||
- "5173:5173"
|
||||
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
|
||||
CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml (new file, 99 lines)
@@ -0,0 +1,99 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
tgi-service:
|
||||
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
|
||||
container_name: codetrans-gaudi-tgi-service
|
||||
ports:
|
||||
- "8008:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
ENABLE_HPU_GRAPH: true
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096
|
||||
llm:
|
||||
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
|
||||
container_name: codetrans-gaudi-llm-server
|
||||
depends_on:
|
||||
- tgi-service
|
||||
ports:
|
||||
- "9000:9000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
LLM_ENDPOINT: ${LLM_ENDPOINT}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
restart: unless-stopped
|
||||
codetrans-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
|
||||
container_name: codetrans-gaudi-backend-server
|
||||
depends_on:
|
||||
- tgi-service
|
||||
- llm
|
||||
ports:
|
||||
- "${BACKEND_SERVICE_PORT:-7777}:7777"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
|
||||
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
|
||||
ipc: host
|
||||
restart: always
|
||||
codetrans-gaudi-ui-server:
|
||||
image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
|
||||
container_name: codetrans-gaudi-ui-server
|
||||
depends_on:
|
||||
- codetrans-gaudi-backend-server
|
||||
ports:
|
||||
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- BASE_URL=${BACKEND_SERVICE_ENDPOINT}
|
||||
ipc: host
|
||||
restart: always
|
||||
codetrans-gaudi-nginx-server:
|
||||
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
|
||||
container_name: codetrans-gaudi-nginx-server
|
||||
depends_on:
|
||||
- codetrans-gaudi-backend-server
|
||||
- codetrans-gaudi-ui-server
|
||||
ports:
|
||||
- "${NGINX_PORT:-80}:80"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP}
|
||||
- FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT}
|
||||
- BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME}
|
||||
- BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP}
|
||||
- BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT}
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
@@ -8,7 +8,12 @@ popd > /dev/null

export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7777/v1/codetrans"
@@ -23,6 +23,18 @@ services:
      dockerfile: comps/llms/src/text-generation/Dockerfile
    extends: codetrans
    image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
  vllm:
    build:
      context: vllm
      dockerfile: Dockerfile.cpu
    extends: codetrans
    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
  vllm-gaudi:
    build:
      context: vllm-fork
      dockerfile: Dockerfile.hpu
    extends: codetrans
    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
  nginx:
    build:
      context: GenAIComps
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,12 +30,12 @@ function build_docker_images() {
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="codetrans codetrans-ui llm-textgen nginx"
|
||||
service_list="codetrans codetrans-ui llm-textgen vllm-gaudi nginx"
|
||||
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
|
||||
|
||||
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
|
||||
docker images && sleep 1s
|
||||
}
|
||||
|
||||
@@ -44,7 +45,12 @@ function start_services() {
|
||||
export http_proxy=${http_proxy}
|
||||
export https_proxy=${http_proxy}
|
||||
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
|
||||
export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
|
||||
export LLM_ENDPOINT="http://${ip_address}:8008"
|
||||
export LLM_COMPONENT_NAME="OpeaTextGenService"
|
||||
export NUM_CARDS=1
|
||||
export BLOCK_SIZE=128
|
||||
export MAX_NUM_SEQS=256
|
||||
export MAX_SEQ_LEN_TO_CAPTURE=2048
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export MEGA_SERVICE_HOST_IP=${ip_address}
|
||||
export LLM_SERVICE_HOST_IP=${ip_address}
|
||||
@@ -64,13 +70,15 @@ function start_services() {
|
||||
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]]; do
|
||||
docker logs codetrans-tgi-service > ${LOG_PATH}/tgi_service_start.log
|
||||
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
|
||||
docker logs codetrans-gaudi-vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
|
||||
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
|
||||
break
|
||||
fi
|
||||
sleep 5s
|
||||
n=$((n+1))
|
||||
done
|
||||
|
||||
sleep 1m
|
||||
}
|
||||
|
||||
function validate_services() {
|
||||
@@ -102,27 +110,19 @@ function validate_services() {
|
||||
}
|
||||
|
||||
function validate_microservices() {
|
||||
# tgi for embedding service
|
||||
validate_services \
|
||||
"${ip_address}:8008/generate" \
|
||||
"generated_text" \
|
||||
"tgi" \
|
||||
"codetrans-tgi-service" \
|
||||
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
|
||||
|
||||
# llm microservice
|
||||
validate_services \
|
||||
"${ip_address}:9000/v1/chat/completions" \
|
||||
"data: " \
|
||||
"llm" \
|
||||
"llm-textgen-gaudi-server" \
|
||||
"codetrans-xeon-llm-server" \
|
||||
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
|
||||
}
|
||||
|
||||
function validate_megaservice() {
|
||||
# Curl the Mega Service
|
||||
validate_services \
|
||||
"${ip_address}:7777/v1/codetrans" \
|
||||
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
|
||||
"print" \
|
||||
"mega-codetrans" \
|
||||
"codetrans-gaudi-backend-server" \
|
||||
@@ -130,7 +130,7 @@ function validate_megaservice() {
|
||||
|
||||
# test the megaservice via nginx
|
||||
validate_services \
|
||||
"${ip_address}:80/v1/codetrans" \
|
||||
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
|
||||
"print" \
|
||||
"mega-codetrans-nginx" \
|
||||
"codetrans-gaudi-nginx-server" \
|
||||
@@ -169,7 +169,7 @@ function validate_frontend() {
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
||||
docker compose stop && docker compose rm -f
|
||||
docker compose -f compose.yaml stop && docker compose rm -f
|
||||
}
|
||||
|
||||
function main() {
|
||||
|
||||
@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
@@ -29,12 +30,16 @@ function build_docker_images() {
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone https://github.com/vllm-project/vllm.git && cd vllm
|
||||
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
|
||||
echo "Check out vLLM tag ${VLLM_VER}"
|
||||
git checkout ${VLLM_VER} &> /dev/null
|
||||
cd ../
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="codetrans codetrans-ui llm-textgen nginx"
|
||||
service_list="codetrans codetrans-ui llm-textgen vllm nginx"
|
||||
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
|
||||
|
||||
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
|
||||
docker images && sleep 1s
|
||||
}
|
||||
|
||||
@@ -43,7 +48,8 @@ function start_services() {
|
||||
export http_proxy=${http_proxy}
|
||||
export https_proxy=${http_proxy}
|
||||
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
|
||||
export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
|
||||
export LLM_ENDPOINT="http://${ip_address}:8008"
|
||||
export LLM_COMPONENT_NAME="OpeaTextGenService"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export MEGA_SERVICE_HOST_IP=${ip_address}
|
||||
export LLM_SERVICE_HOST_IP=${ip_address}
|
||||
@@ -59,17 +65,19 @@ function start_services() {
|
||||
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]]; do
|
||||
docker logs codetrans-tgi-service > ${LOG_PATH}/tgi_service_start.log
|
||||
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
|
||||
docker logs codetrans-xeon-vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
|
||||
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
|
||||
break
|
||||
fi
|
||||
sleep 5s
|
||||
n=$((n+1))
|
||||
done
|
||||
|
||||
sleep 1m
|
||||
}
|
||||
|
||||
function validate_services() {
|
||||
@@ -101,20 +109,12 @@ function validate_services() {
|
||||
}
|
||||
|
||||
function validate_microservices() {
|
||||
# tgi for embedding service
|
||||
validate_services \
|
||||
"${ip_address}:8008/generate" \
|
||||
"generated_text" \
|
||||
"tgi" \
|
||||
"codetrans-tgi-service" \
|
||||
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
|
||||
|
||||
# llm microservice
|
||||
validate_services \
|
||||
"${ip_address}:9000/v1/chat/completions" \
|
||||
"data: " \
|
||||
"llm" \
|
||||
"llm-textgen-server" \
|
||||
"codetrans-xeon-llm-server" \
|
||||
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
|
||||
|
||||
}
|
||||
@@ -122,7 +122,7 @@ function validate_microservices() {
|
||||
function validate_megaservice() {
|
||||
# Curl the Mega Service
|
||||
validate_services \
|
||||
"${ip_address}:7777/v1/codetrans" \
|
||||
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
|
||||
"print" \
|
||||
"mega-codetrans" \
|
||||
"codetrans-xeon-backend-server" \
|
||||
@@ -130,7 +130,7 @@ function validate_megaservice() {
|
||||
|
||||
# test the megaservice via nginx
|
||||
validate_services \
|
||||
"${ip_address}:80/v1/codetrans" \
|
||||
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
|
||||
"print" \
|
||||
"mega-codetrans-nginx" \
|
||||
"codetrans-xeon-nginx-server" \
|
||||
@@ -168,7 +168,7 @@ function validate_frontend() {
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/cpu/xeon/
|
||||
docker compose stop && docker compose rm -f
|
||||
docker compose -f compose.yaml stop && docker compose rm -f
|
||||
}
|
||||
|
||||
function main() {
|
||||
|
||||
CodeTrans/tests/test_compose_tgi_on_gaudi.sh (new file, 194 lines)
@@ -0,0 +1,194 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
IMAGE_REPO=${IMAGE_REPO:-"opea"}
|
||||
IMAGE_TAG=${IMAGE_TAG:-"latest"}
|
||||
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
ip_address=$(hostname -I | awk '{print $1}')
|
||||
|
||||
function build_docker_images() {
|
||||
opea_branch=${opea_branch:-"main"}
|
||||
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
|
||||
if [[ "${opea_branch}" != "main" ]]; then
|
||||
cd $WORKPATH
|
||||
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
|
||||
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
|
||||
find . -type f -name "Dockerfile*" | while read -r file; do
|
||||
echo "Processing file: $file"
|
||||
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
|
||||
done
|
||||
fi
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="codetrans codetrans-ui llm-textgen nginx"
|
||||
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
|
||||
|
||||
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
|
||||
docker images && sleep 1s
|
||||
}
|
||||
|
||||
function start_services() {
|
||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi/
|
||||
export http_proxy=${http_proxy}
|
||||
export https_proxy=${http_proxy}
|
||||
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
|
||||
export LLM_ENDPOINT="http://${ip_address}:8008"
|
||||
export LLM_COMPONENT_NAME="OpeaTextGenService"
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export MEGA_SERVICE_HOST_IP=${ip_address}
|
||||
export LLM_SERVICE_HOST_IP=${ip_address}
|
||||
export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7777/v1/codetrans"
|
||||
export FRONTEND_SERVICE_IP=${ip_address}
|
||||
export FRONTEND_SERVICE_PORT=5173
|
||||
export BACKEND_SERVICE_NAME=codetrans
|
||||
export BACKEND_SERVICE_IP=${ip_address}
|
||||
export BACKEND_SERVICE_PORT=7777
|
||||
export NGINX_PORT=80
|
||||
export host_ip=${ip_address}
|
||||
|
||||
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]]; do
|
||||
docker logs codetrans-gaudi-tgi-service > ${LOG_PATH}/tgi_service_start.log
|
||||
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
|
||||
break
|
||||
fi
|
||||
sleep 5s
|
||||
n=$((n+1))
|
||||
done
|
||||
|
||||
sleep 1m
|
||||
}
|
||||
|
||||
function validate_services() {
|
||||
local URL="$1"
|
||||
local EXPECTED_RESULT="$2"
|
||||
local SERVICE_NAME="$3"
|
||||
local DOCKER_NAME="$4"
|
||||
local INPUT_DATA="$5"
|
||||
|
||||
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
|
||||
if [ "$HTTP_STATUS" -eq 200 ]; then
|
||||
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
|
||||
|
||||
local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
|
||||
|
||||
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
|
||||
echo "[ $SERVICE_NAME ] Content is as expected."
|
||||
else
|
||||
echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
|
||||
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
|
||||
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
|
||||
exit 1
|
||||
fi
|
||||
sleep 5s
|
||||
}
|
||||
|
||||
function validate_microservices() {
|
||||
# tgi for llm service
|
||||
validate_services \
|
||||
"${ip_address}:8008/generate" \
|
||||
"generated_text" \
|
||||
"tgi" \
|
||||
"codetrans-gaudi-tgi-service" \
|
||||
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
|
||||
|
||||
# llm microservice
|
||||
validate_services \
|
||||
"${ip_address}:9000/v1/chat/completions" \
|
||||
"data: " \
|
||||
"llm" \
|
||||
"codetrans-gaudi-llm-server" \
|
||||
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
|
||||
|
||||
}
|
||||
|
||||
function validate_megaservice() {
    # Curl the Mega Service
    validate_services \
        "${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
        "print" \
        "mega-codetrans" \
        "codetrans-gaudi-backend-server" \
        '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'

    # test the megaservice via nginx
    validate_services \
        "${ip_address}:${NGINX_PORT}/v1/codetrans" \
        "print" \
        "mega-codetrans-nginx" \
        "codetrans-gaudi-nginx-server" \
        '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'

}

function validate_frontend() {
    cd $WORKPATH/ui/svelte
    local conda_env_name="OPEA_e2e"
    export PATH=${HOME}/miniforge3/bin/:$PATH
    if conda info --envs | grep -q "$conda_env_name"; then
        echo "$conda_env_name exists!"
    else
        conda create -n ${conda_env_name} python=3.12 -y
    fi
    source activate ${conda_env_name}

    sed -i "s/localhost/$ip_address/g" playwright.config.ts

    conda install -c conda-forge nodejs=22.6.0 -y
    npm install && npm ci && npx playwright install --with-deps
    node -v && npm -v && pip list

    exit_status=0
    npx playwright test || exit_status=$?

    if [ $exit_status -ne 0 ]; then
        echo "[TEST INFO]: ---------frontend test failed---------"
        exit $exit_status
    else
        echo "[TEST INFO]: ---------frontend test passed---------"
    fi
}

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi/
    docker compose -f compose_tgi.yaml stop && docker compose rm -f
}

function main() {

    stop_docker

    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_services

    validate_microservices
    validate_megaservice
    validate_frontend

    stop_docker
    echo y | docker system prune

}

main
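For reference, a minimal invocation sketch for the Gaudi test above, assuming it is the Gaudi counterpart of the Xeon script below (presumably CodeTrans/tests/test_compose_tgi_on_gaudi.sh) and is run from the CodeTrans/tests directory, since WORKPATH is derived from the parent of the current directory. IMAGE_REPO, IMAGE_TAG, opea_branch, and HUGGINGFACEHUB_API_TOKEN are the variables the script itself reads; the token value is a placeholder.

    cd CodeTrans/tests
    export HUGGINGFACEHUB_API_TOKEN=<your_hf_token>   # placeholder, required by the LLM services
    IMAGE_REPO=opea IMAGE_TAG=latest opea_branch=main bash test_compose_tgi_on_gaudi.sh
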
194
CodeTrans/tests/test_compose_tgi_on_xeon.sh
Normal file
@@ -0,0 +1,194 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')

function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
    # If opea_branch is not main, rewrite the GenAIComps clone branch in the Dockerfiles.
    if [[ "${opea_branch}" != "main" ]]; then
        cd $WORKPATH
        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
        find . -type f -name "Dockerfile*" | while read -r file; do
            echo "Processing file: $file"
            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
        done
    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="codetrans codetrans-ui llm-textgen nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon/
    export http_proxy=${http_proxy}
    export https_proxy=${http_proxy}
    export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
    export LLM_ENDPOINT="http://${ip_address}:8008"
    export LLM_COMPONENT_NAME="OpeaTextGenService"
    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
    export MEGA_SERVICE_HOST_IP=${ip_address}
    export LLM_SERVICE_HOST_IP=${ip_address}
    export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7777/v1/codetrans"
    export FRONTEND_SERVICE_IP=${ip_address}
    export FRONTEND_SERVICE_PORT=5173
    export BACKEND_SERVICE_NAME=codetrans
    export BACKEND_SERVICE_IP=${ip_address}
    export BACKEND_SERVICE_PORT=7777
    export NGINX_PORT=80
    export host_ip=${ip_address}

    sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

    # Start Docker Containers
    docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log

    n=0
    until [[ "$n" -ge 100 ]]; do
        docker logs codetrans-xeon-tgi-service > ${LOG_PATH}/tgi_service_start.log
        if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
            break
        fi
        sleep 5s
        n=$((n+1))
    done

    sleep 1m
}

function validate_services() {
    local URL="$1"
    local EXPECTED_RESULT="$2"
    local SERVICE_NAME="$3"
    local DOCKER_NAME="$4"
    local INPUT_DATA="$5"

    local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
    if [ "$HTTP_STATUS" -eq 200 ]; then
        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."

        local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)

        if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
            echo "[ $SERVICE_NAME ] Content is as expected."
        else
            echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
            docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
            exit 1
        fi
    else
        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
        docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
        exit 1
    fi
    sleep 5s
}

function validate_microservices() {
    # tgi llm serving endpoint
    validate_services \
        "${ip_address}:8008/generate" \
        "generated_text" \
        "tgi" \
        "codetrans-xeon-tgi-service" \
        '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'

    # llm microservice
    validate_services \
        "${ip_address}:9000/v1/chat/completions" \
        "data: " \
        "llm" \
        "codetrans-xeon-llm-server" \
        '{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'

}

function validate_megaservice() {
    # Curl the Mega Service
    validate_services \
        "${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
        "print" \
        "mega-codetrans" \
        "codetrans-xeon-backend-server" \
        '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'

    # test the megaservice via nginx
    validate_services \
        "${ip_address}:${NGINX_PORT}/v1/codetrans" \
        "print" \
        "mega-codetrans-nginx" \
        "codetrans-xeon-nginx-server" \
        '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'

}

function validate_frontend() {
    cd $WORKPATH/ui/svelte
    local conda_env_name="OPEA_e2e"
    export PATH=${HOME}/miniforge3/bin/:$PATH
    if conda info --envs | grep -q "$conda_env_name"; then
        echo "$conda_env_name exists!"
    else
        conda create -n ${conda_env_name} python=3.12 -y
    fi
    source activate ${conda_env_name}

    sed -i "s/localhost/$ip_address/g" playwright.config.ts

    conda install -c conda-forge nodejs=22.6.0 -y
    npm install && npm ci && npx playwright install --with-deps
    node -v && npm -v && pip list

    exit_status=0
    npx playwright test || exit_status=$?

    if [ $exit_status -ne 0 ]; then
        echo "[TEST INFO]: ---------frontend test failed---------"
        exit $exit_status
    else
        echo "[TEST INFO]: ---------frontend test passed---------"
    fi
}

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon/
    docker compose -f compose_tgi.yaml stop && docker compose rm -f
}

function main() {

    stop_docker

    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_services

    validate_microservices
    validate_megaservice
    validate_frontend

    stop_docker
    echo y | docker system prune

}

main
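The Xeon script mirrors the Gaudi flow. A possible invocation, with the same caveats (placeholder token, run from CodeTrans/tests); model_cache is optional and feeds the MODEL_CACHE default of ./data, and the cache path shown is hypothetical:

    export HUGGINGFACEHUB_API_TOKEN=<your_hf_token>   # placeholder
    model_cache=/path/to/model_cache IMAGE_REPO=opea IMAGE_TAG=latest bash test_compose_tgi_on_xeon.sh
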
Some files were not shown because too many files have changed in this diff.