# GenAIExamples/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.yaml

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

test_suite_config:  # Overall configuration settings for the test suite
  # NOTE(review): values of the form ${NAME} look like placeholders that are substituted
  # before this file is parsed (presumably from environment variables) -- confirm with the runner.
  # They are left unquoted here; quoting them would turn substituted numbers into strings.
  examples: ["chatqna"]  # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna
  deployment_type: ${DEPLOYMENT_TYPE}  # Default is "k8s", can also be "docker"
  service_ip: ${SERVICE_IP}  # Leave as None for k8s, specify for Docker
  service_port: ${SERVICE_PORT}  # Leave as None for k8s, specify for Docker
  warm_ups: ${WARMUP}  # Number of test requests for warm-up
  run_time: 60m  # The max total run time for the test suite
  seed: null  # The seed for all RNGs; null means no seed value is provided in this file
  user_queries: ${USER_QUERIES}  # Number of test requests at each concurrency level
  query_timeout: 120  # Seconds to wait for a simulated user to complete any executing task before exiting (120 by default)
  random_prompt: false  # Use random prompts if true, fixed prompts if false
  collect_service_metric: false  # Collect service metrics if true, skip collection if false
  data_visualization: false  # Generate data visualization if true, skip it if false
  llm_model: "Intel/neural-chat-7b-v3-3"  # The LLM model used for the test
  test_output_dir: "${TEST_OUTPUT_DIR}"  # The directory to store the test output
  load_shape:  # Tenant concurrency pattern
    name: ${LOAD_SHAPE}  # poisson or constant (constant is the locust default load shape)
    params:  # Load-shape-specific parameters
      constant:  # Used only when load_shape.name is constant
        concurrent_level: ${CONCURRENT_LEVEL}  # If user_queries is specified, concurrent_level is the target number of requests per user. If not, it is the number of simulated users
      poisson:  # Used only when load_shape.name is poisson
        arrival_rate: ${ARRIVAL_RATE}  # Request arrival rate

test_cases:
  # Test cases for the chatqna example. Each entry carries a run_test flag and the
  # service_name it refers to; in this file only the e2e entry has run_test set to true.
  chatqna:
    embedding:
      run_test: false
      service_name: 'chatqna-embedding-usvc'  # Substitute the name of your own service
    embedserve:
      run_test: false
      service_name: 'chatqna-tei'  # Substitute the name of your own service
    retriever:
      run_test: false
      service_name: 'chatqna-retriever-usvc'  # Substitute the name of your own service
      parameters:
        search_type: 'similarity'
        k: 1
        fetch_k: 20
        lambda_mult: 0.5
        score_threshold: 0.2
    reranking:
      run_test: false
      service_name: 'chatqna-reranking-usvc'  # Substitute the name of your own service
      parameters:
        top_n: 1
    rerankserve:
      run_test: false
      service_name: 'chatqna-teirerank'  # Substitute the name of your own service
    llm:
      run_test: false
      service_name: 'chatqna-llm-uservice'  # Substitute the name of your own service
      parameters:
        max_tokens: 128
        temperature: 0.01
        top_k: 10
        top_p: 0.95
        repetition_penalty: 1.03
        stream: true
    llmserve:
      run_test: false
      service_name: 'chatqna-tgi'  # Substitute the name of your own service
    e2e:
      run_test: true
      service_name: 'chatqna'  # Substitute the name of your own service
      # NOTE(review): k sits directly under e2e rather than inside a parameters mapping
      # like the retriever entry -- presumably what the consumer expects; confirm.
      k: 1