# Files
# GenAIExamples/ChatQnA/benchmark_chatqna.yaml
# bjzhjing ed163087ba Provide unified scalable deployment and benchmarking support for exam… (#1315)
# Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>
# Signed-off-by: letonghan <letong.han@intel.com>
# Co-authored-by: letonghan <letong.han@intel.com>
# Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
# 2025-01-24 22:27:49 +08:00
#
# 84 lines
# 2.0 KiB
# YAML

# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Deployment settings: target device, release version, model location, and
# per-service sizing.
# NOTE(review): every list below has four entries, the same length as `node`;
# presumably entry i applies to the i-th node count -- confirm with the
# consuming tool.
deploy:
  device: gaudi
  # Quoted so the version is always read as a string.
  version: "1.1.0"
  modelUseHostPath: /mnt/models
  # Left empty here; do not commit a real token to version control.
  HUGGINGFACEHUB_API_TOKEN: ""
  node: [1, 2, 4, 8]
  namespace: ""

  # One entry per service. Empty strings are unset values.
  # NOTE(review): presumably the consumer falls back to its own defaults for
  # empty strings -- confirm.
  services:
    backend:
      instance_num: [2, 2, 4, 8]
      cores_per_instance: ""
      memory_capacity: ""

    teirerank:
      enabled: true
      model_id: ""
      replicaCount: [1, 1, 1, 1]
      cards_per_instance: 1

    tei:
      model_id: ""
      replicaCount: [1, 2, 4, 8]
      cores_per_instance: ""
      memory_capacity: ""

    llm:
      engine: tgi
      model_id: ""
      replicaCount: [7, 15, 31, 63]
      max_batch_size: [1, 2, 4, 8]
      max_input_length: ""
      max_total_tokens: ""
      max_batch_total_tokens: ""
      max_batch_prefill_tokens: ""
      cards_per_instance: 1

    data-prep:
      replicaCount: [1, 1, 1, 1]
      cores_per_instance: ""
      memory_capacity: ""

    retriever-usvc:
      replicaCount: [2, 2, 4, 8]
      cores_per_instance: ""
      memory_capacity: ""

    redis-vector-db:
      replicaCount: [1, 1, 1, 1]
      cores_per_instance: ""
      memory_capacity: ""

    chatqna-ui:
      replicaCount: [1, 1, 1, 1]

    nginx:
      replicaCount: [1, 1, 1, 1]

# Benchmark settings: request-generation parameters, the list of test cases
# to run, and LLM output sizing.
benchmark:
  # HTTP request behavior fields.
  concurrency: [1, 2, 4]
  # NOTE(review): "totoal" looks like a typo for "total", and "possion" below
  # for "poisson". The keys are kept byte-identical because the consuming tool
  # may look them up by these exact names -- confirm before renaming.
  totoal_query_num: [2048, 4096]
  duration: [5, 10]  # unit: minutes
  query_num_per_concurrency: [4, 8, 16]
  possion: true
  possion_arrival_rate: 1.0
  warmup_iterations: 10
  seed: 1024

  # Workload: all of the test cases listed here will run for the benchmark.
  test_cases:
    - chatqnafixed
    - chatqna_qlist_pubmed:
        # One of: pub_med10, pub_med100, pub_med1000.
        dataset: pub_med10
        user_queries: [1, 2, 4]
        # If specified, queries of this fixed token size are sent.
        query_token_size: 128

  llm:
    # LLM output token sizes to benchmark.
    max_token_size: [128, 256]