From 3d8009aa914d0f48cf1019cff3613de0f99b7b74 Mon Sep 17 00:00:00 2001 From: "chen, suyue" Date: Fri, 28 Feb 2025 10:30:54 +0800 Subject: [PATCH] Fix benchmark scripts (#1517) - Align benchmark default config: 1. Update default helm charts version. 2. Add `# mandatory` comment. 3. Update default model ID for LLM. - Fix deploy issue: 1. Support different `replicaCount` for w/ w/o rerank test. 2. Add `max_num_seqs` for vllm. 3. Add resource setting for tune mode. - Fix Benchmark issue: 1. Update `user_queries` and `concurrency` setting. 2. Remove invalid parameters. 3. Fix `dataset` and `prompt` setting. And dataset ingest into db. 5. Fix the benchmark hang issue with large user queries. Update `"processes": 16` will fix this issue. 6. Update the eval_path setting logical. - Optimize benchmark readme. - Optimize the log path to make the logs more readable. Signed-off-by: chensuyue Signed-off-by: Cathy Zhang Signed-off-by: letonghan --- ChatQnA/benchmark_chatqna.yaml | 101 +++++++++----- README-deploy-benchmark.md | 143 ++++++++++++++++++-- benchmark.py | 235 ++++++++++++++++++++++++--------- deploy.py | 144 +++++++++++++------- deploy_and_benchmark.py | 224 ++++++++++++++++++++++++------- requirements.txt | 1 + 6 files changed, 641 insertions(+), 207 deletions(-) diff --git a/ChatQnA/benchmark_chatqna.yaml b/ChatQnA/benchmark_chatqna.yaml index c608b8afb..407d555ce 100644 --- a/ChatQnA/benchmark_chatqna.yaml +++ b/ChatQnA/benchmark_chatqna.yaml @@ -3,55 +3,89 @@ deploy: device: gaudi - version: 1.1.0 + version: 1.2.0 modelUseHostPath: /mnt/models - HUGGINGFACEHUB_API_TOKEN: "" + HUGGINGFACEHUB_API_TOKEN: "" # mandatory node: [1, 2, 4, 8] namespace: "" + timeout: 1000 # timeout in seconds for services to be ready, default 30 minutes + interval: 5 # interval in seconds between service ready checks, default 5 seconds services: backend: - instance_num: [2, 2, 4, 8] - cores_per_instance: "" - memory_capacity: "" + resources: + enabled: False + cores_per_instance: "16" + memory_capacity: "8000Mi" + replicaCount: [1, 2, 4, 8] teirerank: enabled: True model_id: "" + resources: + enabled: False + cards_per_instance: 1 replicaCount: [1, 1, 1, 1] - cards_per_instance: 1 tei: model_id: "" + resources: + enabled: False + cores_per_instance: "80" + memory_capacity: "20000Mi" replicaCount: [1, 2, 4, 8] - cores_per_instance: "" - memory_capacity: "" llm: - engine: tgi - model_id: "" - replicaCount: [7, 15, 31, 63] - max_batch_size: [1, 2, 4, 8] - max_input_length: "" - max_total_tokens: "" - max_batch_total_tokens: "" - max_batch_prefill_tokens: "" - cards_per_instance: 1 + engine: vllm # or tgi + model_id: "meta-llama/Meta-Llama-3-8B-Instruct" # mandatory + replicaCount: + with_teirerank: [7, 15, 31, 63] # When teirerank.enabled is True + without_teirerank: [8, 16, 32, 64] # When teirerank.enabled is False + resources: + enabled: False + cards_per_instance: 1 + model_params: + vllm: # VLLM specific parameters + batch_params: + enabled: True + max_num_seqs: [1, 2, 4, 8] # Each value triggers an LLM service upgrade + token_params: + enabled: False + max_input_length: "" + max_total_tokens: "" + max_batch_total_tokens: "" + max_batch_prefill_tokens: "" + tgi: # TGI specific parameters + batch_params: + enabled: True + max_batch_size: [1, 2, 4, 8] # Each value triggers an LLM service upgrade + token_params: + enabled: False + max_input_length: "1280" + max_total_tokens: "2048" + max_batch_total_tokens: "65536" + max_batch_prefill_tokens: "4096" data-prep: + resources: + enabled: False + cores_per_instance: "" 
+ memory_capacity: "" replicaCount: [1, 1, 1, 1] - cores_per_instance: "" - memory_capacity: "" retriever-usvc: - replicaCount: [2, 2, 4, 8] - cores_per_instance: "" - memory_capacity: "" + resources: + enabled: False + cores_per_instance: "8" + memory_capacity: "8000Mi" + replicaCount: [1, 2, 4, 8] redis-vector-db: + resources: + enabled: False + cores_per_instance: "" + memory_capacity: "" replicaCount: [1, 1, 1, 1] - cores_per_instance: "" - memory_capacity: "" chatqna-ui: replicaCount: [1, 1, 1, 1] @@ -61,22 +95,17 @@ deploy: benchmark: # http request behavior related fields - concurrency: [1, 2, 4] - totoal_query_num: [2048, 4096] - duration: [5, 10] # unit minutes - query_num_per_concurrency: [4, 8, 16] - possion: True - possion_arrival_rate: 1.0 + user_queries: [640] + concurrency: [128] + load_shape_type: "constant" # "constant" or "poisson" + poisson_arrival_rate: 1.0 # only used when load_shape_type is "poisson" warmup_iterations: 10 seed: 1024 # workload, all of the test cases will run for benchmark - test_cases: - - chatqnafixed - - chatqna_qlist_pubmed: - dataset: pub_med10 # pub_med10, pub_med100, pub_med1000 - user_queries: [1, 2, 4] - query_token_size: 128 # if specified, means fixed query token size will be sent out + bench_target: [chatqnafixed, chatqna_qlist_pubmed] # specify the bench_target for benchmark + dataset: ["/home/sdp/upload_file.txt", "/home/sdp/pubmed_10000.txt"] # specify the absolute path to the dataset file + prompt: [10, 1000] # set the prompt length for the chatqna_qlist_pubmed workload, set to 10 for chatqnafixed workload llm: # specify the llm output token size diff --git a/README-deploy-benchmark.md b/README-deploy-benchmark.md index 4b813cccc..9f1a13f8f 100644 --- a/README-deploy-benchmark.md +++ b/README-deploy-benchmark.md @@ -11,10 +11,9 @@ We aim to run these benchmarks and share them with the OPEA community for three ## Table of Contents - [Prerequisites](#prerequisites) -- [Overview](#overview) - - [Using deploy_and_benchmark.py](#using-deploy_and_benchmark.py-recommended) - [Data Preparation](#data-preparation) -- [Configuration](#configuration) +- [Running Deploy and Benchmark Tests](#running-deploy-and-benchmark-tests) +- [Troubleshooting](#troubleshooting) ## Prerequisites @@ -25,8 +24,50 @@ Before running the benchmarks, ensure you have: - Kubernetes installation: Use [kubespray](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md) or other official Kubernetes installation guides - (Optional) [Kubernetes set up guide on Intel Gaudi product](https://github.com/opea-project/GenAIInfra/blob/main/README.md#setup-kubernetes-cluster) -2. **Configuration YAML** - The configuration file (e.g., `./ChatQnA/benchmark_chatqna.yaml`) consists of two main sections: deployment and benchmarking. Required fields must be filled with valid values (like the Hugging Face token). For all other fields, you can either customize them according to your needs or leave them empty ("") to use the default values from the [helm charts](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts). +2. **Configuration YAML** + The configuration file (e.g., `./ChatQnA/benchmark_chatqna.yaml`) consists of two main sections: deployment and benchmarking. Required fields with `# mandatory` comment must be filled with valid values, such as `HUGGINGFACEHUB_API_TOKEN`. 
For all other fields, you can either customize them according to our needs or leave them empty ("") to use the default values from the [helm charts](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts). + + **Default Models**: + + - LLM: `meta-llama/Meta-Llama-3-8B-Instruct` (Required: must be specified as it's shared between deployment and benchmarking phases) + - Embedding: `BAAI/bge-base-en-v1.5` + - Reranking: `BAAI/bge-reranker-base` + + You can customize which models to use by setting the `model_id` field in the corresponding service section. Note that the LLM model must be specified in the configuration as it is used by both deployment and benchmarking processes. + + **Important Notes**: + + - For Gaudi deployments: + - LLM service runs on Gaudi devices + - If enabled, the reranking service (teirerank) also runs on Gaudi devices + - **Llama Model Access**: + - Downloading Llama models requires both: + 1. HuggingFace API token + 2. Special authorization from Meta + - Please visit [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) to request access + - Deployment will fail if model download is unsuccessful due to missing authorization + + **Node and Replica Configuration**: + + ```yaml + node: [1, 2, 4, 8] # Number of nodes to deploy + replicaCount: [1, 2, 4, 8] # Must align with node configuration + ``` + + The `replicaCount` values must align with the `node` configuration by index: + + - When deploying on 1 node → uses replicaCount[0] = 1 + - When deploying on 2 nodes → uses replicaCount[1] = 2 + - When deploying on 4 nodes → uses replicaCount[2] = 4 + - When deploying on 8 nodes → uses replicaCount[3] = 8 + + Note: Model parameters that accept lists (e.g., `max_batch_size`, `max_num_seqs`) are deployment parameters that affect model service behavior but not the number of service instances. When these parameters are lists, each value will trigger a service upgrade followed by a new round of testing, while maintaining the same number of service instances. + +3. **Install required Python packages** + Run the following command to install all necessary dependencies: + ```bash + pip install -r requirements.txt + ``` ## Data Preparation @@ -34,36 +75,114 @@ Before running benchmarks, you need to: 1. **Prepare Test Data** - - Download the retrieval file: + - Testing for general benchmark target: + + Download the retrieval file using the command below for data ingestion in RAG: + ```bash wget https://github.com/opea-project/GenAIEval/tree/main/evals/benchmark/data/upload_file.txt ``` - - For the `chatqna_qlist_pubmed` test case, prepare `pubmed_${max_lines}.txt` by following this [README](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/stresscli/README_Pubmed_qlist.md) + + - Testing for pubmed benchmark target: + + For the `chatqna_qlist_pubmed` test case, prepare `pubmed_${max_lines}.txt` by following this [README](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/stresscli/README_Pubmed_qlist.md) + + After the data is prepared, please update the `absolute path` of this file in the benchmark.yaml file. For example, in the `ChatQnA/benchmark_chatqna.yaml` file, `/home/sdp/upload_file.txt` should be replaced by your file path. 2. 
**Prepare Model Files (Recommended)** ```bash pip install -U "huggingface_hub[cli]" sudo mkdir -p /mnt/models sudo chmod 777 /mnt/models - huggingface-cli download --cache-dir /mnt/models Intel/neural-chat-7b-v3-3 + huggingface-cli download --cache-dir /mnt/models meta-llama/Meta-Llama-3-8B-Instruct ``` -## Overview +## Running Deploy and Benchmark Tests The benchmarking process consists of two main components: deployment and benchmarking. We provide `deploy_and_benchmark.py` as a unified entry point that combines both steps. -### Using deploy_and_benchmark.py (Recommended) +### Running the Tests -The script `deploy_and_benchmark.py` serves as the main entry point. Here's an example using ChatQnA configuration (you can replace it with any other example's configuration YAML file): +The script `deploy_and_benchmark.py` serves as the main entry point. You can use any example's configuration YAML file. Here are examples using ChatQnA configuration: 1. For a specific number of nodes: ```bash + # Default OOB (Out of Box) mode python deploy_and_benchmark.py ./ChatQnA/benchmark_chatqna.yaml --target-node 1 + + # Or specify test mode explicitly + python deploy_and_benchmark.py ./ChatQnA/benchmark_chatqna.yaml --target-node 1 --test-mode [oob|tune] ``` 2. For all node configurations: + ```bash + # Default OOB (Out of Box) mode python deploy_and_benchmark.py ./ChatQnA/benchmark_chatqna.yaml + + # Or specify test mode explicitly + python deploy_and_benchmark.py ./ChatQnA/benchmark_chatqna.yaml --test-mode [oob|tune] ``` - This will iterate through the node list in your configuration YAML file, performing deployment and benchmarking for each node count. + + This will process all node configurations defined in your YAML file. + +### Test Modes + +The script provides two test modes controlled by the `--test-mode` parameter: + +1. **OOB (Out of Box) Mode** - Default + + ```bash + --test-mode oob # or omit the parameter + ``` + + - Uses enabled configurations only: + - Resources: Only uses resources when `resources.enabled` is True + - Model parameters: + - Uses batch parameters when `batch_params.enabled` is True + - Uses token parameters when `token_params.enabled` is True + - Suitable for basic functionality testing with selected optimizations + +2. **Tune Mode** + ```bash + --test-mode tune + ``` + - Applies all configurations regardless of enabled status: + - Resource-related parameters: + - `resources.cores_per_instance`: CPU cores allocation + - `resources.memory_capacity`: Memory allocation + - `resources.cards_per_instance`: GPU/Accelerator cards allocation + - Model parameters: + - Batch parameters: + - `max_batch_size`: Maximum batch size (TGI engine) + - `max_num_seqs`: Maximum number of sequences (vLLM engine) + - Token parameters: + - `max_input_length`: Maximum input sequence length + - `max_total_tokens`: Maximum total tokens per request + - `max_batch_total_tokens`: Maximum total tokens in a batch + - `max_batch_prefill_tokens`: Maximum tokens in prefill phase + +Choose "oob" mode when you want to selectively enable optimizations, or "tune" mode when you want to apply all available optimizations regardless of their enabled status. 
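In both modes the decision reduces to one rule applied to each optional block (`resources`, `batch_params`, `token_params`): keep the block and drop its `enabled` flag when running in `tune` mode or when the flag is `True`, otherwise remove the block so the Helm chart defaults apply. Below is a minimal sketch of that rule; it is illustrative only — the real logic lives in `construct_deploy_config` in `deploy_and_benchmark.py` and operates on the nested `model_params` structure, so the flat layout and the function name `apply_test_mode` here are assumptions, not part of the patch.

```python
def apply_test_mode(service_config: dict, test_mode: str = "oob") -> dict:
    """Keep optional config blocks in 'tune' mode or when explicitly enabled."""
    filtered = dict(service_config)
    for block_name in ("resources", "batch_params", "token_params"):
        block = filtered.get(block_name)
        if not isinstance(block, dict):
            continue
        if test_mode == "tune" or block.get("enabled", False):
            # Keep the values, drop the bookkeeping flag before rendering Helm values.
            filtered[block_name] = {k: v for k, v in block.items() if k != "enabled"}
        else:
            # OOB mode with enabled: False -> fall back to the Helm chart defaults.
            filtered.pop(block_name)
    return filtered


# Example: a disabled resources block is ignored in OOB mode but applied in tune mode.
svc = {"resources": {"enabled": False, "cores_per_instance": "16"}, "replicaCount": 2}
assert "resources" not in apply_test_mode(svc, "oob")
assert apply_test_mode(svc, "tune")["resources"] == {"cores_per_instance": "16"}
```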
+ +### Troubleshooting + +**Helm Chart Directory Issues** + +- During execution, the script downloads and extracts the Helm chart to a directory named after your example +- The directory name is derived from your input YAML file path + - For example: if your input is `./ChatQnA/benchmark_chatqna.yaml`, the extracted directory will be `chatqna/` +- In some error cases, this directory might not be properly cleaned up +- If you encounter deployment issues, check if there's a leftover Helm chart directory: + + ```bash + # Example: for ./ChatQnA/benchmark_chatqna.yaml + ls -la chatqna/ + + # Clean up if needed + rm -rf chatqna/ + ``` + +- After cleaning up the directory, try running the deployment again + +Note: Always ensure there are no leftover Helm chart directories from previous failed runs before starting a new deployment. diff --git a/benchmark.py b/benchmark.py index fb20367c0..202a2cb01 100644 --- a/benchmark.py +++ b/benchmark.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -import sys from datetime import datetime +import requests import yaml from evals.benchmark.stresscli.commands.load_test import locust_runtests from kubernetes import client, config @@ -25,17 +25,15 @@ def construct_benchmark_config(test_suite_config): """Extract relevant data from the YAML based on the specified test cases.""" return { - "concurrency": test_suite_config.get("concurrency", []), - "totoal_query_num": test_suite_config.get("user_queries", []), - "duration:": test_suite_config.get("duration:", []), - "query_num_per_concurrency": test_suite_config.get("query_num_per_concurrency", []), - "possion": test_suite_config.get("possion", False), - "possion_arrival_rate": test_suite_config.get("possion_arrival_rate", 1.0), + "user_queries": test_suite_config.get("user_queries", [1]), + "concurrency": test_suite_config.get("concurrency", [1]), + "load_shape_type": test_suite_config.get("load_shape_type", "constant"), + "poisson_arrival_rate": test_suite_config.get("poisson_arrival_rate", 1.0), "warmup_iterations": test_suite_config.get("warmup_iterations", 10), "seed": test_suite_config.get("seed", None), - "test_cases": test_suite_config.get("test_cases", ["chatqnafixed"]), - "user_queries": test_suite_config.get("user_queries", [1]), - "query_token_size": test_suite_config.get("query_token_size", 128), + "bench_target": test_suite_config.get("bench_target", ["chatqnafixed"]), + "dataset": test_suite_config.get("dataset", ""), + "prompt": test_suite_config.get("prompt", [10]), "llm_max_token_size": test_suite_config.get("llm", {}).get("max_token_size", [128]), } @@ -97,17 +95,11 @@ def _get_service_ip(service_name, deployment_type="k8s", service_ip=None, servic return svc_ip, port -def _create_yaml_content(service, base_url, bench_target, test_phase, num_queries, test_params): +def _create_yaml_content(service, base_url, bench_target, test_phase, num_queries, test_params, concurrency=1): """Create content for the run.yaml file.""" - # If a load shape includes the parameter concurrent_level, - # the parameter will be passed to Locust to launch fixed - # number of simulated users. 
- concurrency = 1 - if num_queries >= 0: - concurrency = max(1, num_queries // test_params["concurrent_level"]) - else: - concurrency = test_params["concurrent_level"] + # calculate the number of concurrent users + concurrent_level = int(num_queries // concurrency) import importlib.util @@ -116,16 +108,21 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie print(spec) # get folder path of opea-eval - eval_path = None - import pkg_resources - - for dist in pkg_resources.working_set: - if "opea-eval" in dist.project_name: - eval_path = dist.location + eval_path = os.getenv("EVAL_PATH", "") if not eval_path: - print("Fail to load opea-eval package. Please install it first.") + import pkg_resources + + for dist in pkg_resources.working_set: + if "opea-eval" in dist.project_name: + eval_path = dist.location + break + if not eval_path: + print("Fail to find the opea-eval package. Please set/install it first.") exit(1) + load_shape = test_params["load_shape"] + load_shape["params"]["constant"] = {"concurrent_level": concurrent_level} + yaml_content = { "profile": { "storage": {"hostpath": test_params["test_output_dir"]}, @@ -133,8 +130,9 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie "tool": "locust", "locustfile": os.path.join(eval_path, "evals/benchmark/stresscli/locust/aistress.py"), "host": base_url, + "run-time": test_params["run_time"], "stop-timeout": test_params["query_timeout"], - "processes": 2, + "processes": 16, # set to 2 by default "namespace": test_params["namespace"], "bench-target": bench_target, "service-metric-collect": test_params["collect_service_metric"], @@ -145,42 +143,38 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie "seed": test_params.get("seed", None), "llm-model": test_params["llm_model"], "deployment-type": test_params["deployment_type"], - "load-shape": test_params["load_shape"], + "load-shape": load_shape, }, "runs": [{"name": test_phase, "users": concurrency, "max-request": num_queries}], } } - # For the following scenarios, test will stop after the specified run-time - if test_params["run_time"] is not None and test_phase != "warmup": - yaml_content["profile"]["global-settings"]["run-time"] = test_params["run_time"] - return yaml_content -def _create_stresscli_confs(case_params, test_params, test_phase, num_queries, base_url, ts) -> str: +def _create_stresscli_confs(case_params, test_params, test_phase, num_queries, base_url, ts, concurrency=1) -> str: """Create a stresscli configuration file and persist it on disk.""" stresscli_confs = [] # Get the workload - test_cases = test_params["test_cases"] - for test_case in test_cases: + bench_target = test_params["bench_target"] + for i, b_target in enumerate(bench_target): stresscli_conf = {} - print(test_case) - if isinstance(test_case, str): - bench_target = test_case - elif isinstance(test_case, dict): - bench_target = list(test_case.keys())[0] - dataset_conf = test_case[bench_target] - if bench_target == "chatqna_qlist_pubmed": - max_lines = dataset_conf["dataset"].split("pub_med")[-1] - stresscli_conf["envs"] = {"DATASET": f"pubmed_{max_lines}.txt", "MAX_LINES": max_lines} + print(f"[OPEA BENCHMARK] 🚀 Running test for {b_target} in phase {test_phase} for {num_queries} queries") + if len(test_params["dataset"]) > i: + stresscli_conf["envs"] = {"DATASET": test_params["dataset"][i], "MAX_LINES": str(test_params["prompt"][i])} + else: + stresscli_conf["envs"] = {"MAX_LINES": str(test_params["prompt"][i])} # 
Generate the content of stresscli configuration file - stresscli_yaml = _create_yaml_content(case_params, base_url, bench_target, test_phase, num_queries, test_params) + stresscli_yaml = _create_yaml_content( + case_params, base_url, b_target, test_phase, num_queries, test_params, concurrency + ) # Dump the stresscli configuration file service_name = case_params.get("service_name") + max_output = case_params.get("max_output") run_yaml_path = os.path.join( - test_params["test_output_dir"], f"run_{service_name}_{ts}_{test_phase}_{num_queries}_{bench_target}.yaml" + test_params["test_output_dir"], + f"run_{test_phase}_{service_name}_{num_queries}_{b_target}_{max_output}_{ts}.yaml", ) with open(run_yaml_path, "w") as yaml_file: yaml.dump(stresscli_yaml, yaml_file) @@ -207,15 +201,79 @@ def create_stresscli_confs(service, base_url, test_suite_config, index): stresscli_confs.extend(_create_stresscli_confs(service, test_suite_config, "benchmark", -1, base_url, index)) else: # Test stop is controlled by request count - for user_queries in user_queries_lst: + for i, user_query in enumerate(user_queries_lst): + concurrency_list = test_suite_config["concurrency"] + user_query *= test_suite_config["node_num"] stresscli_confs.extend( - _create_stresscli_confs(service, test_suite_config, "benchmark", user_queries, base_url, index) + _create_stresscli_confs( + service, + test_suite_config, + "benchmark", + user_query, + base_url, + index, + concurrency=concurrency_list[i], + ) ) return stresscli_confs -def _run_service_test(example, service, test_suite_config): +def ingest_data_to_db(service, dataset, namespace): + """Ingest data into the database.""" + for service_name in service.get("service_list"): + if "data" in service_name: + # Ingest data into the database + print(f"[OPEA BENCHMARK] 🚀 Ingesting data into the database for {service_name}...") + try: + svc_ip, port = _get_service_ip(service_name, "k8s", None, None, namespace) + url = f"http://{svc_ip}:{port}/v1/dataprep/ingest" + + files = {"files": open(dataset, "rb")} + + response = requests.post(url, files=files) + if response.status_code != 200: + print(f"Error ingesting data: {response.text}. Status code: {response.status_code}") + return False + if "Data preparation succeeded" not in response.text: + print(f"Error ingesting data: {response.text}. Response: {response}") + return False + + except Exception as e: + print(f"Error ingesting data: {e}") + return False + print(f"[OPEA BENCHMARK] 🚀 Data ingestion completed for {service_name}.") + break + return True + + +def clear_db(service, namespace): + """Delete all files from the database.""" + for service_name in service.get("service_list"): + if "data" in service_name: + # Delete data from the database + try: + svc_ip, port = _get_service_ip(service_name, "k8s", None, None, namespace) + url = f"http://{svc_ip}:{port}/v1/dataprep/delete" + data = {"file_path": "all"} + print(f"[OPEA BENCHMARK] 🚀 Deleting data from the database for {service_name} with {url}") + + response = requests.post(url, json=data, headers={"Content-Type": "application/json"}) + if response.status_code != 200: + print(f"Error deleting data: {response.text}. Status code: {response.status_code}") + return False + if "true" not in response.text: + print(f"Error deleting data: {response.text}. 
Response: {response}") + return False + except Exception as e: + print(f"Error deleting data: {e}") + return False + print(f"[OPEA BENCHMARK] 🚀 Data deletion completed for {service_name}.") + break + return True + + +def _run_service_test(example, service, test_suite_config, namespace): """Run the test for a specific service and example.""" print(f"[OPEA BENCHMARK] 🚀 Example: [ {example} ] Service: [ {service.get('service_name')} ], Running test...") @@ -251,44 +309,94 @@ def _run_service_test(example, service, test_suite_config): run_yaml_path = stresscli_conf["run_yaml_path"] print(f"[OPEA BENCHMARK] 🚀 The {index} time test is running, run yaml: {run_yaml_path}...") os.environ["MAX_TOKENS"] = str(service.get("max_output")) + + dataset = None if stresscli_conf.get("envs") is not None: for key, value in stresscli_conf.get("envs").items(): os.environ[key] = value + if key == "DATASET": + dataset = value - output_folders.append(locust_runtests(None, run_yaml_path)) + if dataset: + # Ingest data into the database for single run of benchmark + result = ingest_data_to_db(service, dataset, namespace) + if not result: + print(f"[OPEA BENCHMARK] 🚀 Data ingestion failed for {service_name}.") + exit(1) + else: + print(f"[OPEA BENCHMARK] 🚀 Dataset is not specified for {service_name}. Check the benchmark.yaml again.") + + # Run the benchmark test and append the output folder to the list + print("[OPEA BENCHMARK] 🚀 Start locust_runtests at", datetime.now().strftime("%Y%m%d_%H%M%S")) + locust_output = locust_runtests(None, run_yaml_path) + print(f"[OPEA BENCHMARK] 🚀 locust_output origin name is {locust_output}") + # Rename the output folder to include the index + new_output_path = os.path.join( + os.path.dirname(run_yaml_path), f"{os.path.splitext(os.path.basename(run_yaml_path))[0]}_output" + ) + os.rename(locust_output, new_output_path) + print(f"[OPEA BENCHMARK] 🚀 locust new_output_path is {new_output_path}") + + output_folders.append(new_output_path) + print("[OPEA BENCHMARK] 🚀 End locust_runtests at", datetime.now().strftime("%Y%m%d_%H%M%S")) + + # Delete all files from the database after the test + result = clear_db(service, namespace) + print("[OPEA BENCHMARK] 🚀 End of clean up db", datetime.now().strftime("%Y%m%d_%H%M%S")) + if not result: + print(f"[OPEA BENCHMARK] 🚀 Data deletion failed for {service_name}.") + exit(1) print(f"[OPEA BENCHMARK] 🚀 Test completed for {service_name} at {url}") return output_folders -def run_benchmark(benchmark_config, chart_name, namespace, llm_model=None, report=False): +def run_benchmark(benchmark_config, chart_name, namespace, node_num=1, llm_model=None, report=False, output_dir=None): + """Run the benchmark test for the specified helm chart and configuration. + + Args: + benchmark_config (dict): The benchmark configuration. + chart_name (str): The name of the helm chart. + namespace (str): The namespace to deploy the chart. + node_num (int): The number of nodes of current deployment. + llm_model (str): The LLM model to use for the test. + report (bool): Whether to generate a report after the test. + output_dir (str): Directory to store the test output. If None, uses default directory. 
+ """ # If llm_model is None or an empty string, set to default value if not llm_model: - llm_model = "Qwen/Qwen2.5-Coder-7B-Instruct" + llm_model = "meta-llama/Meta-Llama-3-8B-Instruct" # Extract data parsed_data = construct_benchmark_config(benchmark_config) test_suite_config = { "user_queries": parsed_data["user_queries"], # num of user queries "random_prompt": False, # whether to use random prompt, set to False by default - "run_time": "60m", # The max total run time for the test suite, set to 60m by default + "run_time": "30m", # The max total run time for the test suite, set to 60m by default "collect_service_metric": False, # whether to collect service metrics, set to False by default "llm_model": llm_model, # The LLM model used for the test "deployment_type": "k8s", # Default is "k8s", can also be "docker" "service_ip": None, # Leave as None for k8s, specify for Docker "service_port": None, # Leave as None for k8s, specify for Docker - "test_output_dir": os.getcwd() + "/benchmark_output", # The directory to store the test output + "test_output_dir": ( + output_dir if output_dir else os.getcwd() + "/benchmark_output" + ), # Use output_dir if provided + "node_num": node_num, "load_shape": { - "name": "constant", - "params": {"constant": {"concurrent_level": 4}, "poisson": {"arrival_rate": 1.0}}, + "name": parsed_data["load_shape_type"], + "params": { + "poisson": {"arrival_rate": parsed_data["poisson_arrival_rate"]}, + }, }, - "concurrent_level": 4, - "arrival_rate": 1.0, + "concurrency": parsed_data["concurrency"], + "arrival_rate": parsed_data["poisson_arrival_rate"], "query_timeout": 120, "warm_ups": parsed_data["warmup_iterations"], "seed": parsed_data["seed"], "namespace": namespace, - "test_cases": parsed_data["test_cases"], + "bench_target": parsed_data["bench_target"], + "dataset": parsed_data["dataset"], + "prompt": parsed_data["prompt"], "llm_max_token_size": parsed_data["llm_max_token_size"], } @@ -313,15 +421,14 @@ def run_benchmark(benchmark_config, chart_name, namespace, llm_model=None, repor "chatqna-retriever-usvc", "chatqna-tei", "chatqna-teirerank", - "chatqna-tgi", + "chatqna-vllm", ], - "test_cases": parsed_data["test_cases"], # Activate if random_prompt=true: leave blank = default dataset(WebQuestions) or sharegpt "prompts": query_data, "max_output": llm_max_token, # max number of output tokens "k": 1, # number of retrieved documents } - output_folder = _run_service_test(chart_name, case_data, test_suite_config) + output_folder = _run_service_test(chart_name, case_data, test_suite_config, namespace) print(f"[OPEA BENCHMARK] 🚀 Test Finished. 
Output saved in {output_folder}.") @@ -339,5 +446,5 @@ def run_benchmark(benchmark_config, chart_name, namespace, llm_model=None, repor if __name__ == "__main__": - benchmark_config = load_yaml("./benchmark.yaml") - run_benchmark(benchmark_config=benchmark_config, chart_name="chatqna", namespace="deploy-benchmark") + benchmark_config = load_yaml("./ChatQnA/benchmark_chatqna.yaml") + run_benchmark(benchmark_config=benchmark_config, chart_name="chatqna", namespace="benchmark") diff --git a/deploy.py b/deploy.py index 21dd278cc..bd3a8a87d 100644 --- a/deploy.py +++ b/deploy.py @@ -49,12 +49,14 @@ def configure_replica(values, deploy_config): return values -def get_output_filename(num_nodes, with_rerank, example_type, device, action_type): +def get_output_filename(num_nodes, with_rerank, example_type, device, action_type, batch_size=None): """Generate output filename based on configuration.""" rerank_suffix = "with-rerank-" if with_rerank else "" action_suffix = "deploy-" if action_type == 0 else "update-" if action_type == 1 else "" + # Only include batch_suffix if batch_size is not None + batch_suffix = f"batch{batch_size}-" if batch_size else "" - return f"{example_type}-{num_nodes}-{device}-{action_suffix}{rerank_suffix}values.yaml" + return f"{example_type}-{rerank_suffix}{device}-{action_suffix}node{num_nodes}-{batch_suffix}values.yaml" def configure_resources(values, deploy_config): @@ -62,30 +64,31 @@ def configure_resources(values, deploy_config): resource_configs = [] for service_name, config in deploy_config["services"].items(): + # Skip if resources configuration doesn't exist or is not enabled + resources_config = config.get("resources", {}) + if not resources_config: + continue + resources = {} - if deploy_config["device"] == "gaudi" and config.get("cards_per_instance", 0) > 1: + if deploy_config["device"] == "gaudi" and resources_config.get("cards_per_instance", 0) > 1: resources = { - "limits": {"habana.ai/gaudi": config["cards_per_instance"]}, - "requests": {"habana.ai/gaudi": config["cards_per_instance"]}, + "limits": {"habana.ai/gaudi": resources_config["cards_per_instance"]}, + "requests": {"habana.ai/gaudi": resources_config["cards_per_instance"]}, } else: - limits = {} - requests = {} + # Only add CPU if cores_per_instance has a valid value + cores = resources_config.get("cores_per_instance") + if cores is not None and cores != "": + resources = {"limits": {"cpu": cores}, "requests": {"cpu": cores}} - # Only add CPU if cores_per_instance has a value - if config.get("cores_per_instance"): - limits["cpu"] = config["cores_per_instance"] - requests["cpu"] = config["cores_per_instance"] - - # Only add memory if memory_capacity has a value - if config.get("memory_capacity"): - limits["memory"] = config["memory_capacity"] - requests["memory"] = config["memory_capacity"] - - # Only create resources if we have any limits/requests - if limits and requests: - resources["limits"] = limits - resources["requests"] = requests + # Only add memory if memory_capacity has a valid value + memory = resources_config.get("memory_capacity") + if memory is not None and memory != "": + if not resources: + resources = {"limits": {"memory": memory}, "requests": {"memory": memory}} + else: + resources["limits"]["memory"] = memory + resources["requests"]["memory"] = memory if resources: if service_name == "llm": @@ -116,48 +119,64 @@ def configure_resources(values, deploy_config): def configure_extra_cmd_args(values, deploy_config): """Configure extra command line arguments for services.""" + 
batch_size = None for service_name, config in deploy_config["services"].items(): - extra_cmd_args = [] + if service_name == "llm": + extra_cmd_args = [] + engine = config.get("engine", "tgi") + model_params = config.get("model_params", {}) - for param in [ - "max_batch_size", - "max_input_length", - "max_total_tokens", - "max_batch_total_tokens", - "max_batch_prefill_tokens", - ]: - if config.get(param): - extra_cmd_args.extend([f"--{param.replace('_', '-')}", str(config[param])]) + # Get engine-specific parameters + engine_params = model_params.get(engine, {}) - if extra_cmd_args: - if service_name == "llm": - engine = config.get("engine", "tgi") + # Get batch parameters and token parameters configuration + batch_params = engine_params.get("batch_params", {}) + token_params = engine_params.get("token_params", {}) + + # Get batch size based on engine type + if engine == "tgi": + batch_size = batch_params.get("max_batch_size") + elif engine == "vllm": + batch_size = batch_params.get("max_num_seqs") + batch_size = batch_size if batch_size and batch_size != "" else None + + # Add all parameters that exist in batch_params + for param, value in batch_params.items(): + if value is not None and value != "": + extra_cmd_args.extend([f"--{param.replace('_', '-')}", str(value)]) + + # Add all parameters that exist in token_params + for param, value in token_params.items(): + if value is not None and value != "": + extra_cmd_args.extend([f"--{param.replace('_', '-')}", str(value)]) + + if extra_cmd_args: if engine not in values: values[engine] = {} values[engine]["extraCmdArgs"] = extra_cmd_args - else: - if service_name not in values: - values[service_name] = {} - values[service_name]["extraCmdArgs"] = extra_cmd_args + print(f"extraCmdArgs: {extra_cmd_args}") - return values + return values, batch_size def configure_models(values, deploy_config): """Configure model settings for services.""" for service_name, config in deploy_config["services"].items(): - # Skip if no model_id defined or service is disabled - if not config.get("model_id") or config.get("enabled") is False: + # Get model_id and check if it's valid (not None or empty string) + model_id = config.get("model_id") + if not model_id or model_id == "" or config.get("enabled") is False: continue if service_name == "llm": # For LLM service, use its engine as the key + # Check if engine is valid (not None or empty string) engine = config.get("engine", "tgi") - values[engine]["LLM_MODEL_ID"] = config.get("model_id") + if engine and engine != "": + values[engine]["LLM_MODEL_ID"] = model_id elif service_name == "tei": - values[service_name]["EMBEDDING_MODEL_ID"] = config.get("model_id") + values[service_name]["EMBEDDING_MODEL_ID"] = model_id elif service_name == "teirerank": - values[service_name]["RERANK_MODEL_ID"] = config.get("model_id") + values[service_name]["RERANK_MODEL_ID"] = model_id return values @@ -209,13 +228,13 @@ def generate_helm_values(example_type, deploy_config, chart_dir, action_type, no values = configure_rerank(values, with_rerank, deploy_config, example_type, node_selector or {}) values = configure_replica(values, deploy_config) values = configure_resources(values, deploy_config) - values = configure_extra_cmd_args(values, deploy_config) + values, batch_size = configure_extra_cmd_args(values, deploy_config) values = configure_models(values, deploy_config) device = deploy_config.get("device", "unknown") # Generate and write YAML file - filename = get_output_filename(num_nodes, with_rerank, example_type, device, action_type) + 
filename = get_output_filename(num_nodes, with_rerank, example_type, device, action_type, batch_size) yaml_string = yaml.dump(values, default_flow_style=False) filepath = os.path.join(chart_dir, filename) @@ -376,12 +395,24 @@ def install_helm_release(release_name, chart_name, namespace, hw_values_file, de def uninstall_helm_release(release_name, namespace=None): - """Uninstall a Helm release and clean up resources, optionally delete the namespace if not 'default'.""" + """Uninstall a Helm release and clean up resources, optionally delete the namespace if not 'default'. + + First checks if the release exists before attempting to uninstall. + """ # Default to 'default' namespace if none is specified if not namespace: namespace = "default" try: + # Check if the release exists + check_command = ["helm", "list", "--namespace", namespace, "--filter", release_name, "--output", "json"] + output = run_kubectl_command(check_command) + releases = json.loads(output) + + if not releases: + print(f"Helm release {release_name} not found in namespace {namespace}. Nothing to uninstall.") + return + # Uninstall the Helm release command = ["helm", "uninstall", release_name, "--namespace", namespace] print(f"Uninstalling Helm release {release_name} in namespace {namespace}...") @@ -399,6 +430,8 @@ def uninstall_helm_release(release_name, namespace=None): except subprocess.CalledProcessError as e: print(f"Error occurred while uninstalling Helm release or deleting namespace: {e}") + except json.JSONDecodeError as e: + print(f"Error parsing helm list output: {e}") def update_service(release_name, chart_name, namespace, hw_values_file, deploy_values_file, update_values_file): @@ -449,7 +482,7 @@ def read_deploy_config(config_path): return None -def check_deployment_ready(release_name, namespace, timeout=300, interval=5, logfile="deployment.log"): +def check_deployment_ready(release_name, namespace, timeout=1000, interval=5, logfile="deployment.log"): """Wait until all pods in the deployment are running and ready. 
Args: @@ -586,6 +619,18 @@ def main(): parser.add_argument("--update-service", action="store_true", help="Update the deployment with new configuration.") parser.add_argument("--check-ready", action="store_true", help="Check if all services in the deployment are ready.") parser.add_argument("--chart-dir", default=".", help="Path to the untarred Helm chart directory.") + parser.add_argument( + "--timeout", + type=int, + default=1000, + help="Maximum time to wait for deployment readiness in seconds (default: 1000)", + ) + parser.add_argument( + "--interval", + type=int, + default=5, + help="Interval between readiness checks in seconds (default: 5)", + ) args = parser.parse_args() @@ -597,7 +642,7 @@ def main(): clear_labels_from_nodes(args.label, args.node_names) return elif args.check_ready: - is_ready = check_deployment_ready(args.chart_name, args.namespace) + is_ready = check_deployment_ready(args.chart_name, args.namespace, args.timeout, args.interval) return is_ready elif args.uninstall: uninstall_helm_release(args.chart_name, args.namespace) @@ -659,6 +704,7 @@ def main(): update_service( args.chart_name, args.chart_name, args.namespace, hw_values_file, args.user_values, values_file_path ) + print(f"values_file_path: {values_file_path}") return except Exception as e: parser.error(f"Failed to update deployment: {str(e)}") diff --git a/deploy_and_benchmark.py b/deploy_and_benchmark.py index 1dc4c4308..f210f215d 100644 --- a/deploy_and_benchmark.py +++ b/deploy_and_benchmark.py @@ -23,13 +23,14 @@ def read_yaml(file_path): return None -def construct_deploy_config(deploy_config, target_node, max_batch_size=None): - """Construct a new deploy config based on the target node number and optional max_batch_size. +def construct_deploy_config(deploy_config, target_node, batch_param_value=None, test_mode="oob"): + """Construct a new deploy config based on the target node number and optional batch parameter value. 
Args: deploy_config: Original deploy config dictionary target_node: Target node number to match in the node array - max_batch_size: Optional specific max_batch_size value to use + batch_param_value: Optional specific batch parameter value to use + test_mode: Test mode, either 'oob' or 'tune' Returns: A new deploy config with single values for node and instance_num @@ -51,21 +52,79 @@ def construct_deploy_config(deploy_config, target_node, max_batch_size=None): # Set the single node value new_config["node"] = target_node - # Update instance_num for each service based on the same index - for service_name, service_config in new_config.get("services", {}).items(): - if "replicaCount" in service_config: - instance_nums = service_config["replicaCount"] - if isinstance(instance_nums, list): - if len(instance_nums) != len(nodes): - raise ValueError( - f"instance_num array length ({len(instance_nums)}) for service {service_name} " - f"doesn't match node array length ({len(nodes)})" - ) - service_config["replicaCount"] = instance_nums[node_index] + # First determine which llm replicaCount to use based on teirerank.enabled + services = new_config.get("services", {}) + teirerank_enabled = services.get("teirerank", {}).get("enabled", True) - # Update max_batch_size if specified - if max_batch_size is not None and "llm" in new_config["services"]: - new_config["services"]["llm"]["max_batch_size"] = max_batch_size + # Process each service's configuration + for service_name, service_config in services.items(): + # Handle replicaCount + if "replicaCount" in service_config: + if service_name == "llm" and isinstance(service_config["replicaCount"], dict): + replica_counts = service_config["replicaCount"] + service_config["replicaCount"] = ( + replica_counts["with_teirerank"] if teirerank_enabled else replica_counts["without_teirerank"] + ) + + if isinstance(service_config["replicaCount"], list): + if len(service_config["replicaCount"]) < len(nodes): + raise ValueError( + f"replicaCount array length ({len(service_config['replicaCount'])}) for service {service_name} " + f"smaller than node array length ({len(nodes)})" + ) + service_config["replicaCount"] = service_config["replicaCount"][node_index] + + # Handle resources based on test_mode + if "resources" in service_config: + resources = service_config["resources"] + if test_mode == "tune" or resources.get("enabled", False): + # Keep resource configuration but remove enabled field + resources.pop("enabled", None) + else: + # Remove resource configuration in OOB mode when disabled + service_config.pop("resources") + + # Handle model parameters for LLM service + if service_name == "llm" and "model_params" in service_config: + model_params = service_config["model_params"] + engine = service_config.get("engine", "tgi") + + # Get engine-specific parameters + engine_params = model_params.get(engine, {}) + + # Handle batch parameters + if "batch_params" in engine_params: + batch_params = engine_params["batch_params"] + if test_mode == "tune" or batch_params.get("enabled", False): + # Keep batch parameters configuration but remove enabled field + batch_params.pop("enabled", None) + + # Update batch parameter value if specified + if batch_param_value is not None: + if engine == "tgi": + batch_params["max_batch_size"] = str(batch_param_value) + elif engine == "vllm": + batch_params["max_num_seqs"] = str(batch_param_value) + else: + engine_params.pop("batch_params") + + # Handle token parameters + if "token_params" in engine_params: + token_params = 
engine_params["token_params"] + if test_mode == "tune" or token_params.get("enabled", False): + # Keep token parameters configuration but remove enabled field + token_params.pop("enabled", None) + else: + # Remove token parameters in OOB mode when disabled + engine_params.pop("token_params") + + # Update model_params with engine-specific parameters only + model_params.clear() + model_params[engine] = engine_params + + # Remove model_params if empty or if engine_params is empty + if not model_params or not engine_params: + service_config.pop("model_params") return new_config @@ -84,13 +143,18 @@ def pull_helm_chart(chart_pull_url, version, chart_name): return untar_dir -def main(yaml_file, target_node=None): +def main(yaml_file, target_node=None, test_mode="oob"): """Main function to process deployment configuration. Args: yaml_file: Path to the YAML configuration file target_node: Optional target number of nodes to deploy. If not specified, will process all nodes. + test_mode: Test mode, either "oob" (out of box) or "tune". Defaults to "oob". """ + if test_mode not in ["oob", "tune"]: + print("Error: test_mode must be either 'oob' or 'tune'") + return None + config = read_yaml(yaml_file) if config is None: print("Failed to read YAML file.") @@ -116,7 +180,7 @@ def main(yaml_file, target_node=None): # Pull the Helm chart chart_pull_url = f"oci://ghcr.io/opea-project/charts/{chart_name}" - version = deploy_config.get("version", "1.1.0") + version = deploy_config.get("version", "0-latest") chart_dir = pull_helm_chart(chart_pull_url, version, chart_name) if not chart_dir: return @@ -140,20 +204,61 @@ def main(yaml_file, target_node=None): continue try: - # Process max_batch_sizes - max_batch_sizes = deploy_config.get("services", {}).get("llm", {}).get("max_batch_size", []) - if not isinstance(max_batch_sizes, list): - max_batch_sizes = [max_batch_sizes] + # Process batch parameters based on engine type + services = deploy_config.get("services", {}) + llm_config = services.get("llm", {}) + + if "model_params" in llm_config: + model_params = llm_config["model_params"] + engine = llm_config.get("engine", "tgi") + + # Get engine-specific parameters + engine_params = model_params.get(engine, {}) + + # Handle batch parameters + batch_params = [] + if "batch_params" in engine_params: + key = "max_batch_size" if engine == "tgi" else "max_num_seqs" + batch_params = engine_params["batch_params"].get(key, []) + param_name = key + + if not isinstance(batch_params, list): + batch_params = [batch_params] + + # Skip multiple iterations if batch parameter is empty + if batch_params == [""] or not batch_params: + batch_params = [None] + else: + batch_params = [None] + param_name = "batch_param" + + # Get timeout and interval from deploy config for check-ready + timeout = deploy_config.get("timeout", 1000) # default 1000s + interval = deploy_config.get("interval", 5) # default 5s values_file_path = None - for i, max_batch_size in enumerate(max_batch_sizes): - print(f"\nProcessing max_batch_size: {max_batch_size}") + # Create benchmark output directory + benchmark_dir = os.path.join(os.getcwd(), "benchmark_output") + os.makedirs(benchmark_dir, exist_ok=True) + + for i, batch_param in enumerate(batch_params): + print(f"\nProcessing {test_mode} mode {param_name}: {batch_param}") + # Create subdirectory for this iteration with test mode in the name + iteration_dir = os.path.join( + benchmark_dir, + f"benchmark_{test_mode}_node{node}_batch{batch_param if batch_param is not None else 'default'}", + ) + 
os.makedirs(iteration_dir, exist_ok=True) # Construct new deploy config - new_deploy_config = construct_deploy_config(deploy_config, node, max_batch_size) + new_deploy_config = construct_deploy_config(deploy_config, node, batch_param, test_mode) # Write the new deploy config to a temporary file - temp_config_file = f"temp_deploy_config_{node}_{max_batch_size}.yaml" + temp_config_file = ( + f"temp_deploy_config_{node}.yaml" + if batch_param is None + else f"temp_deploy_config_{node}_{batch_param}.yaml" + ) try: with open(temp_config_file, "w") as f: yaml.dump(new_deploy_config, f) @@ -178,6 +283,8 @@ def main(yaml_file, target_node=None): if match: values_file_path = match.group(1) print(f"Captured values_file_path: {values_file_path}") + # Copy values file to iteration directory + shutil.copy2(values_file_path, iteration_dir) else: print("values_file_path not found in the output") @@ -198,12 +305,20 @@ def main(yaml_file, target_node=None): values_file_path, "--update-service", ] - result = subprocess.run(cmd, check=True) + result = subprocess.run(cmd, check=True, capture_output=True, text=True) if result.returncode != 0: - print( - f"Update failed for {node} nodes configuration with max_batch_size {max_batch_size}" - ) - break # Skip remaining max_batch_sizes for this node + print(f"Update failed for {node} nodes configuration with {param_name} {batch_param}") + break # Skip remaining {param_name} for this node + + # Update values_file_path from the output + match = re.search(r"values_file_path: (\S+)", result.stdout) + if match: + values_file_path = match.group(1) + print(f"Updated values_file_path: {values_file_path}") + # Copy values file to iteration directory + shutil.copy2(values_file_path, iteration_dir) + else: + print("values_file_path not found in the output") # Wait for deployment to be ready print("\nWaiting for deployment to be ready...") @@ -215,26 +330,42 @@ def main(yaml_file, target_node=None): "--namespace", namespace, "--check-ready", + "--timeout", + str(timeout), + "--interval", + str(interval), ] try: - result = subprocess.run(cmd, check=True) - print("Deployments are ready!") + result = subprocess.run( + cmd, check=False + ) # Changed to check=False to handle return code manually + if result.returncode == 0: + print("Deployments are ready!") + # Run benchmark only if deployment is ready + run_benchmark( + benchmark_config=benchmark_config, + chart_name=chart_name, + namespace=namespace, + node_num=node, + llm_model=deploy_config.get("services", {}).get("llm", {}).get("model_id", ""), + output_dir=iteration_dir, + ) + else: + print( + f"Deployments are not ready after timeout period during " + f"{'deployment' if i == 0 else 'update'} for {node} nodes. " + f"Skipping remaining iterations." 
+ ) + break # Exit the batch parameter loop for current node except subprocess.CalledProcessError as e: - print(f"Deployments status failed with returncode: {e.returncode}") - - # Run benchmark - run_benchmark( - benchmark_config=benchmark_config, - chart_name=chart_name, - namespace=namespace, - llm_model=deploy_config.get("services", {}).get("llm", {}).get("model_id", ""), - ) + print(f"Error while checking deployment status: {str(e)}") + break # Exit the batch parameter loop for current node except Exception as e: print( - f"Error during {'deployment' if i == 0 else 'update'} for {node} nodes with max_batch_size {max_batch_size}: {str(e)}" + f"Error during {'deployment' if i == 0 else 'update'} for {node} nodes with {param_name} {batch_param}: {str(e)}" ) - break # Skip remaining max_batch_sizes for this node + break # Skip remaining {param_name} for this node finally: # Clean up the temporary file if os.path.exists(temp_config_file): @@ -287,6 +418,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Deploy and benchmark with specific node configuration.") parser.add_argument("yaml_file", help="Path to the YAML configuration file") parser.add_argument("--target-node", type=int, help="Optional: Target number of nodes to deploy.", default=None) + parser.add_argument("--test-mode", type=str, help="Test mode, either 'oob' (out of box) or 'tune'.", default="oob") args = parser.parse_args() - main(args.yaml_file, args.target_node) + main(args.yaml_file, args.target_node, args.test_mode) diff --git a/requirements.txt b/requirements.txt index 44f6445aa..637668c3d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ kubernetes locust numpy opea-eval>=1.2 +prometheus_client pytest pyyaml requests
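Taken together, the patched scripts establish the following per-node loop: deploy once per node count, upgrade the LLM service once per batch value (`max_num_seqs` for vLLM, `max_batch_size` for TGI), wait for readiness, then benchmark (ingest the dataset, drive locust, clear the database). The sketch below condenses that loop under stated assumptions: `deploy_and_update`, `check_ready`, and `benchmark_once` are placeholders standing in for the `deploy.py` / `benchmark.py` entry points, not real APIs from the patch.

```python
from typing import Callable, Iterable, Optional


def batch_values(llm_cfg: dict) -> Iterable[Optional[int]]:
    """Batch-parameter sweep: max_num_seqs for vLLM, max_batch_size for TGI."""
    engine = llm_cfg.get("engine", "tgi")
    key = "max_num_seqs" if engine == "vllm" else "max_batch_size"
    values = llm_cfg.get("model_params", {}).get(engine, {}).get("batch_params", {}).get(key, [])
    if not isinstance(values, list):
        values = [values]
    return values or [None]


def run_all(
    deploy_cfg: dict,
    deploy_and_update: Callable[[int, Optional[int]], None],
    check_ready: Callable[[int, int], bool],
    benchmark_once: Callable[[int, Optional[int]], None],
) -> None:
    """One deployment per node count, one service upgrade + benchmark per batch value."""
    for node in deploy_cfg["node"]:
        for batch in batch_values(deploy_cfg["services"]["llm"]):
            deploy_and_update(node, batch)  # helm install / upgrade driven by deploy.py
            if not check_ready(deploy_cfg.get("timeout", 1000), deploy_cfg.get("interval", 5)):
                break  # deployment never became ready; skip remaining batch values for this node
            benchmark_once(node, batch)  # ingest dataset, run locust, clear the vector DB
```

In the patch itself these roles are filled by subprocess calls into `deploy.py` (`--check-ready` with `--timeout`/`--interval`) and by `run_benchmark()` from `benchmark.py`, which also writes each iteration's outputs under `benchmark_output/benchmark_<mode>_node<N>_batch<value>/`.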