refactor python script into deploy_and_benchmark.py

Signed-off-by: letonghan <letong.han@intel.com>
2025-01-23 14:41:11 +08:00
parent eba1c300b3
commit 78a1efd7f0
5 changed files with 1211 additions and 391 deletions
--- a/ChatQnA/chatqna.yaml
+++ b/ChatQnA/chatqna.yaml
@@ -0,0 +1,90 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+deploy:
+  device: gaudi
+  version: 1.1.0
+  modelUseHostPath: /mnt/models
+  HUGGINGFACEHUB_API_TOKEN: ""
+  node: [1, 2, 4]
+  namespace: "default"
+  cards_per_node: 8
+
+  services:
+    backend:
+      instance_num: [2, 2, 4]
+      cores_per_instance: ""
+      memory_capacity: ""
+
+    teirerank:
+      enabled: True
+      model_id: ""
+      instance_num: [1, 1, 1]
+      cards_per_instance: 1
+
+    tei:
+      model_id: ""
+      instance_num: [1, 2, 4]
+      cores_per_instance: ""
+      memory_capacity: ""
+
+    llm:
+      engine: tgi
+      model_id: ""
+      instance_num: [7, 15, 31]
+      max_batch_size: [1, 2, 4, 8]
+      max_input_length: ""
+      max_total_tokens: ""
+      max_batch_total_tokens: ""
+      max_batch_prefill_tokens: ""
+      cards_per_instance: 1
+
+    data-prep:
+      instance_num: [1, 1, 1]
+      cores_per_instance: ""
+      memory_capacity: ""
+
+    retriever-usvc:
+      instance_num: [2, 2, 4]
+      cores_per_instance: ""
+      memory_capacity: ""
+
+    redis-vector-db:
+      instance_num: [1, 1, 1]
+      cores_per_instance: ""
+      memory_capacity: ""
+
+    chatqna-ui:
+      instance_num: [1, 1, 1]
+
+    nginx:
+      instance_num: [1, 1, 1]
+
+benchmark:
+  # http request behavior related fields
+  concurrency:               [1, 2, 4]
+  totoal_query_num:          [2048, 4096]
+  duration:                  [5, 10] # unit minutes
+  query_num_per_concurrency: [4, 8, 16]
+  possion:                   True
+  possion_arrival_rate:      1.0
+  warmup_iterations:         10
+  seed:                      1024
+
+  # dataset relted fields
+  dataset:                   pub_med10 # [dummy_english, dummy_chinese, pub_med100] predefined keywords for supported dataset
+  user_queries:              [1, 2, 4]
+  query_token_size:          128                   # if specified, means fixed query token size will be sent out
+
+  # advance settings in each component which will impact perf.
+  dataprep:                  # not target this time
+    chunk_size:              [1024]
+    chunk_overlap:           [1000]
+  retriever:                   # not target this time
+    algo:                    IVF
+    fetch_k:                 2
+    k:                       1
+  rerank:
+    top_n:                   2
+  llm:
+    max_token_size:          128   # specify the output token size
--- a/benchmark.py
+++ b/benchmark.py
@@ -1,360 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
- 
-import os
-from datetime import datetime
- 
-import yaml
-import sys
-eval_path = '/home/sdp/GenAIEval/'
-sys.path.append(eval_path)
-from evals.benchmark.stresscli.commands.load_test import locust_runtests
-from kubernetes import client, config
-
-
-def load_yaml(file_path):
-    with open(file_path, "r") as f:
-        data = yaml.safe_load(f)
-    return data
-
- 
-def get_service_cluster_ip(service_name, namespace="default"):
-    # Load the Kubernetes configuration
-    config.load_kube_config()  # or use config.load_incluster_config() if running inside a Kubernetes pod
- 
-    # Create an API client for the core API (which handles services)
-    v1 = client.CoreV1Api()
- 
-    try:
-        # Get the service object
-        service = v1.read_namespaced_service(name=service_name, namespace=namespace)
- 
-        # Extract the Cluster IP
-        cluster_ip = service.spec.cluster_ip
- 
-        # Extract the port number (assuming the first port, modify if necessary)
-        if service.spec.ports:
-            port_number = service.spec.ports[0].port  # Get the first port number
-        else:
-            port_number = None
- 
-        return cluster_ip, port_number
-    except client.exceptions.ApiException as e:
-        print(f"Error fetching service: {e}")
-        return None
-
-
-service_endpoints = {
-    "chatqna": "/v1/chatqna",
-    "codegen": "/v1/codegen",
-    "codetrans": "/v1/codetrans",
-    "faqgen": "/v1/faqgen",
-    "audioqna": "/v1/audioqna",
-    "visualqna": "/v1/visualqna",
-}
- 
- 
-def extract_benchmark_config_data(content):
-    """Extract relevant data from the YAML based on the specified test cases."""
-    # Extract test suite configuration
-    test_suite_config = content.get("benchmark", {})
- 
-    return {
-        # no examples param
-        "example_name": test_suite_config.get("example_name", "chatqna"),
-        "concurrency": test_suite_config.get("concurrency", []),
-        "totoal_query_num": test_suite_config.get("user_queries", []),
-        "duration:": test_suite_config.get("duration:", []),
-        "query_num_per_concurrency": test_suite_config.get("query_num_per_concurrency", []),
-        "possion": test_suite_config.get("possion", False),
-        "possion_arrival_rate": test_suite_config.get("possion_arrival_rate", 1.0),
-        "warmup_iterations": test_suite_config.get("warmup_iterations", 10),
-        "seed": test_suite_config.get("seed", None),
-        "dataset": test_suite_config.get("dataset", []),
-        "user_queries": test_suite_config.get("user_queries", [1]),
-        "query_token_size": test_suite_config.get("query_token_size"),
-        # new params
-        "dataprep_chunk_size": test_suite_config.get("data_prep", {}).get("chunk_size", [1024]),
-        "dataprep_chunk_overlap": test_suite_config.get("data_prep", {}).get("chunk_overlap", [1000]),
-        "retriever_algo": test_suite_config.get("retriever", {}).get("algo", 'IVF'),
-        "retriever_fetch_k": test_suite_config.get("retriever", {}).get("fetch_k", 2),
-        "rerank_top_n": test_suite_config.get("rerank", {}).get("top_n", 2),
-        "llm_max_token_size": test_suite_config.get("llm", {}).get("max_token_size", 1024),
-    }
- 
- 
-def create_run_yaml_content(service, base_url, bench_target, test_phase, num_queries, test_params):
-    """Create content for the run.yaml file."""
- 
-    # If a load shape includes the parameter concurrent_level,
-    # the parameter will be passed to Locust to launch fixed
-    # number of simulated users.
-    concurrency = 1
-    if num_queries >= 0:
-        concurrency = max(1, num_queries // test_params["concurrent_level"])
-    else:
-        concurrency = test_params["concurrent_level"]
- 
-    yaml_content = {
-        "profile": {
-            "storage": {"hostpath": test_params["test_output_dir"]},
-            "global-settings": {
-                "tool": "locust",
-                "locustfile": os.path.join(eval_path, "evals/benchmark/stresscli/locust/aistress.py"),
-                "host": base_url,
-                "stop-timeout": test_params["query_timeout"],
-                "processes": 2,
-                "namespace": test_params["namespace"],
-                "bench-target": bench_target,
-                "service-metric-collect": test_params["collect_service_metric"],
-                "service-list": service.get("service_list", []),
-                "dataset": service.get("dataset", "default"),
-                "prompts": service.get("prompts", None),
-                "max-output": service.get("max_output", 128),
-                "seed": test_params.get("seed", None),
-                "llm-model": test_params["llm_model"],
-                "deployment-type": test_params["deployment_type"],
-                "load-shape": test_params["load_shape"],
-            },
-            "runs": [{"name": test_phase, "users": concurrency, "max-request": num_queries}],
-        }
-    }
- 
-    # For the following scenarios, test will stop after the specified run-time
-    # 1) run_time is not specified in benchmark.yaml
-    # 2) Not a warm-up run
-    # TODO: According to Locust's doc, run-time should default to run forever,
-    # however the default is 48 hours.
-    if test_params["run_time"] is not None and test_phase != "warmup":
-        yaml_content["profile"]["global-settings"]["run-time"] = test_params["run_time"]
- 
-    return yaml_content
- 
- 
-def generate_stresscli_run_yaml(
-    example, case_type, case_params, test_params, test_phase, num_queries, base_url, ts
-) -> str:
-    """Create a stresscli configuration file and persist it on disk.
- 
-    Parameters
-    ----------
-        example : str
-            The name of the example.
-        case_type : str
-            The type of the test case
-        case_params : dict
-            The parameters of single test case.
-        test_phase : str [warmup|benchmark]
-            Current phase of the test.
-        num_queries : int
-            The number of test requests sent to SUT
-        base_url : str
-            The root endpoint of SUT
-        test_params : dict
-            The parameters of the test
-        ts : str
-            Timestamp
- 
-    Returns
-    -------
-        run_yaml_path : str
-            The path of the generated YAML file.
-    """
-    # Get the workload
-    dataset = test_params["dataset"]
-    if "pub_med" in dataset:
-        bench_target = "chatqna_qlist_pubmed"
-        max_lines = dataset.split("pub_med")[-1]
-        os.environ['DATASET'] = f"pubmed_{max_lines}.txt"
-        os.environ['MAX_LINES'] = max_lines
-
-    # Generate the content of stresscli configuration file
-    stresscli_yaml = create_run_yaml_content(case_params, base_url, bench_target, test_phase, num_queries, test_params)
- 
-    # Dump the stresscli configuration file
-    service_name = case_params.get("service_name")
-    run_yaml_path = os.path.join(
-        test_params["test_output_dir"], f"run_{service_name}_{ts}_{test_phase}_{num_queries}.yaml"
-    )
-    with open(run_yaml_path, "w") as yaml_file:
-        yaml.dump(stresscli_yaml, yaml_file)
- 
-    return run_yaml_path
- 
- 
-def create_and_save_run_yaml(example, deployment_type, service_type, service, base_url, test_suite_config, index):
-    """Create and save the run.yaml file for the service being tested."""
-    os.makedirs(test_suite_config["test_output_dir"], exist_ok=True)
-
-    run_yaml_paths = []
- 
-    # Add YAML configuration of stresscli for warm-ups
-    warm_ups = test_suite_config["warm_ups"]
-    if warm_ups is not None and warm_ups > 0:
-        run_yaml_paths.append(
-            generate_stresscli_run_yaml(
-                example, service_type, service, test_suite_config, "warmup", warm_ups, base_url, index
-            )
-        )
-
-    # Add YAML configuration of stresscli for benchmark
-    user_queries_lst = test_suite_config["user_queries"]
-    if user_queries_lst is None or len(user_queries_lst) == 0:
-        # Test stop is controlled by run time
-        run_yaml_paths.append(
-            generate_stresscli_run_yaml(
-                example, service_type, service, test_suite_config, "benchmark", -1, base_url, index
-            )
-        )
-    else:
-        # Test stop is controlled by request count
-        for user_queries in user_queries_lst:
-            run_yaml_paths.append(
-                generate_stresscli_run_yaml(
-                    example, service_type, service, test_suite_config, "benchmark", user_queries, base_url, index
-                )
-            )
- 
-    return run_yaml_paths
- 
- 
-def get_service_ip(service_name, deployment_type="k8s", service_ip=None, service_port=None, namespace="default"):
-    """Get the service IP and port based on the deployment type.
- 
-    Args:
-        service_name (str): The name of the service.
-        deployment_type (str): The type of deployment ("k8s" or "docker").
-        service_ip (str): The IP address of the service (required for Docker deployment).
-        service_port (int): The port of the service (required for Docker deployment).
- 
-    Returns:
-        (str, int): The service IP and port.
-    """
-    if deployment_type == "k8s":
-        # Kubernetes IP and port retrieval logic
-        svc_ip, port = get_service_cluster_ip(service_name, namespace)
-    elif deployment_type == "docker":
-        # For Docker deployment, service_ip and service_port must be specified
-        if not service_ip or not service_port:
-            raise ValueError(
-                "For Docker deployment, service_ip and service_port must be provided in the configuration."
-            )
-        svc_ip = service_ip
-        port = service_port
-    else:
-        raise ValueError("Unsupported deployment type. Use 'k8s' or 'docker'.")
- 
-    return svc_ip, port
- 
- 
-def run_service_test(example, service_type, service, test_suite_config):
- 
-    # Get the service name
-    service_name = service.get("service_name")
- 
-    # Get the deployment type from the test suite configuration
-    deployment_type = test_suite_config.get("deployment_type", "k8s")
- 
-    # Get the service IP and port based on deployment type
-    svc_ip, port = get_service_ip(
-        service_name,
-        deployment_type,
-        test_suite_config.get("service_ip"),
-        test_suite_config.get("service_port"),
-        test_suite_config.get("namespace"),
-    )
- 
-    base_url = f"http://{svc_ip}:{port}"
-    endpoint = service_endpoints[example]
-    url = f"{base_url}{endpoint}"
-    print(f"[OPEA BENCHMARK] 🚀 Running test for {service_name} at {url}")
- 
-    # Generate a unique index based on the current time
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- 
-    # Create the run.yaml for the service
-    run_yaml_paths = create_and_save_run_yaml(
-        example, deployment_type, service_type, service, base_url, test_suite_config, timestamp
-    )
-
-    # Run the test using locust_runtests function
-    output_folders = []
-    for index, run_yaml_path in enumerate(run_yaml_paths, start=1):
-        print(f"[OPEA BENCHMARK] 🚀 The {index} time test is running, run yaml: {run_yaml_path}...")
-        output_folders.append(locust_runtests(None, run_yaml_path))
- 
-    print(f"[OPEA BENCHMARK] 🚀 Test completed for {service_name} at {url}")
- 
-    return output_folders
- 
- 
-def process_service(example, service_type, case_data, test_suite_config):
-    print(f"[OPEA BENCHMARK] 🚀 Example: [ {example} ] Service: [ {case_data.get('service_name')} ], Running test...")
-    return run_service_test(example, service_type, case_data, test_suite_config)
- 
- 
-def run_benchmark(benchmark_config, report=False):
-    # Extract data
-    parsed_data = extract_benchmark_config_data(benchmark_config)
-    os.environ['MAX_TOKENS'] = str(parsed_data['llm_max_token_size'])
-    test_suite_config = {
-        "user_queries": parsed_data["user_queries"],  # num of user queries set to 1 by default
-        "random_prompt": False, # whether to use random prompt, set to False by default
-        "run_time": "60m", # The max total run time for the test suite, set to 60m by default
-        "collect_service_metric": False, # whether to collect service metrics, set to False by default
-        "llm_model": "Qwen/Qwen2.5-Coder-7B-Instruct", # The LLM model used for the test
-        "deployment_type": "k8s", # Default is "k8s", can also be "docker"
-        "service_ip": None, # Leave as None for k8s, specify for Docker
-        "service_port": None, # Leave as None for k8s, specify for Docker
-        "test_output_dir": "/home/sdp/letong/GenAIExamples/benchmark_output", # The directory to store the test output
-        "load_shape": {"name":"constant", "params":{"constant":{"concurrent_level": 4},"poisson":{"arrival_rate":1.0}}},
-        "concurrent_level": 4,
-        "arrival_rate": 1.0,
-        "query_timeout": 120,
-        "warm_ups": 0,
-        "seed": None,
-        "namespace": "default",
-        "dataset": parsed_data["dataset"],
-        "data_ratio": parsed_data["data_ratio"]
-    }
-
-    service_type="e2e"
-    dataset = None
-    query_data = None
-    case_data = {
-        "run_test": True,
-        "service_name": "chatqna-backend-server-svc",
-        "service_list": [
-            "chatqna-backend-server-svc",
-            "embedding-dependency-svc",
-            "embedding-svc",
-            "llm-dependency-svc",
-            "llm-svc",
-            "retriever-svc",
-            "vector-db"
-        ],
-        "dataset": dataset, # Activate if random_prompt=true: leave blank = default dataset(WebQuestions) or sharegpt
-        "prompts": query_data,
-        "max_output": parsed_data['llm_max_token_size'],  # max number of output tokens
-        "k": 1 # number of retrieved documents
-    }
-    output_folder = process_service(parsed_data["example_name"], service_type, case_data, test_suite_config)
-    print(f"[OPEA BENCHMARK] 🚀 Test Finished. Output saved in {output_folder}.")
-
-    if report:
-        print(output_folder)
-        all_results = dict()
-        for folder in output_folder:
-            from evals.benchmark.stresscli.commands.report import get_report_results
-
-            results = get_report_results(folder)
-            all_results[folder] = results
-            print(f"results = {results}\n")
- 
-        return all_results
-
- 
-if __name__ == "__main__":
-    benchmark_config = load_yaml("./benchmark.yaml")
-    run_benchmark(benchmark_config)
- 
--- a/benchmark.yaml
+++ b/benchmark.yaml
@@ -1,31 +0,0 @@
-benchmark:
-  # example name
-  example_name: chatqna
-
-  # http request behavior related fields
-  concurrency:               [1, 2, 4]
-  totoal_query_num:          [2048, 4096]
-  duration:                  [5, 10] # unit minutes
-  query_num_per_concurrency: [4, 8, 16]
-  possion:                   True
-  possion_arrival_rate:      1.0
-  warmup_iterations:         10
-  seed:                      1024
-
-  # dataset relted fields
-  dataset:                pub_med10 # [dummy_english, dummy_chinese, pub_med100] predefined keywords for supported dataset
-  user_queries:           [1, 2, 4]
-  query_token_size:       128                   # if specified, means fixed query token size will be sent out
-
-  # advance settings in each component which will impact perf.
-  dataprep:                  # not target this time
-    chunk_size:              [1024]
-    chunk_overlap:           [1000]
-  retriever:                   # not target this time
-    algo:                    IVF
-    fetch_k:                 2
-    k:                       1
-  rerank:
-    top_n:                   2
-  llm:
-    max_token_size:          128   # specify the output token size
--- a/deploy_and_benchmark.py
+++ b/deploy_and_benchmark.py
--- a/benchmark_requirements.txt
+++ b/benchmark_requirements.txt