diff --git a/DocSum/benchmark_docsum.yaml b/DocSum/benchmark_docsum.yaml
new file mode 100644
index 000000000..66aab5ba6
--- /dev/null
+++ b/DocSum/benchmark_docsum.yaml
@@ -0,0 +1,87 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+deploy:
+  device: gaudi
+  version: 1.2.0
+  modelUseHostPath: /mnt/models
+  HUGGINGFACEHUB_API_TOKEN: "" # mandatory
+  node: [1]
+  namespace: ""
+  node_name: []
+  timeout: 1000 # timeout in seconds for services to be ready, default 30 minutes
+  interval: 5 # interval in seconds between service ready checks, default 5 seconds
+
+  services:
+    backend:
+      resources:
+        enabled: False
+        cores_per_instance: "16"
+        memory_capacity: "8000Mi"
+      replicaCount: [1]
+
+    teirerank:
+      enabled: False
+
+    llm:
+      engine: vllm # or tgi
+      model_id: "meta-llama/Llama-3.2-3B-Instruct" # mandatory
+      replicaCount:
+        without_teirerank: [1] # When teirerank.enabled is False
+      resources:
+        enabled: False
+        cards_per_instance: 1
+      model_params:
+        vllm: # VLLM specific parameters
+          batch_params:
+            enabled: True
+            max_num_seqs: "8" # Each value triggers an LLM service upgrade
+          token_params:
+            enabled: True
+            max_input_length: ""
+            max_total_tokens: ""
+            max_batch_total_tokens: ""
+            max_batch_prefill_tokens: ""
+        tgi: # TGI specific parameters
+          batch_params:
+            enabled: True
+            max_batch_size: [1] # Each value triggers an LLM service upgrade
+          token_params:
+            enabled: False
+            max_input_length: "1280"
+            max_total_tokens: "2048"
+            max_batch_total_tokens: "65536"
+            max_batch_prefill_tokens: "4096"
+
+    docsum-ui:
+      replicaCount: [1]
+
+    whisper:
+      replicaCount: [1]
+
+    llm-uservice:
+      model_id: "meta-llama/Llama-3.2-3B-Instruct" # mandatory
+      replicaCount: [1]
+
+    nginx:
+      replicaCount: [1]
+
+benchmark:
+  # http request behavior related fields
+  user_queries: [16]
+  concurrency: [4]
+  load_shape_type: "constant" # "constant" or "poisson"
+  poisson_arrival_rate: 1.0 # only used when load_shape_type is "poisson"
+  warmup_iterations: 10
+  seed: 1024
+  collect_service_metric: True
+
+  # workload, all of the test cases will run for benchmark
+  bench_target: ["docsumfixed"] # specify the bench_target for benchmark
+  dataset: "/home/sdp/upload.txt" # specify the absolute path to the dataset file
+  summary_type: "stuff"
+  stream: True
+
+  llm:
+    # specify the llm output token size
+    max_token_size: [1024]
diff --git a/benchmark.py b/benchmark.py
index 410ef1fed..3b5af5be8 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -12,6 +12,7 @@ from kubernetes import client, config
 
 # only support chatqna for now
 service_endpoints = {
     "chatqna": "/v1/chatqna",
+    "docsum": "/v1/docsum",
 }
 
@@ -35,6 +36,9 @@ def construct_benchmark_config(test_suite_config):
         "dataset": test_suite_config.get("dataset", ""),
         "prompt": test_suite_config.get("prompt", [10]),
         "llm_max_token_size": test_suite_config.get("llm", {}).get("max_token_size", [128]),
+        "collect_service_metric": test_suite_config.get("collect_service_metric", False),
+        "summary_type": test_suite_config.get("summary_type", "auto"),
+        "stream": test_suite_config.get("stream", "auto"),
     }
 
 
@@ -144,6 +148,8 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
                 "llm-model": test_params["llm_model"],
                 "deployment-type": test_params["deployment_type"],
                 "load-shape": load_shape,
+                "summary_type": test_params.get("summary_type", "auto"),
+                "stream": test_params.get("stream", True),
             },
             "runs": [{"name": test_phase, "users": concurrency, "max-request": num_queries}],
         }
@@ -373,7 +379,9 @@ def run_benchmark(benchmark_config, chart_name, namespace, node_num=1, llm_model
         "user_queries": parsed_data["user_queries"],  # num of user queries
         "random_prompt": False,  # whether to use random prompt, set to False by default
         "run_time": "30m",  # The max total run time for the test suite, set to 60m by default
-        "collect_service_metric": False,  # whether to collect service metrics, set to False by default
+        "collect_service_metric": (
+            parsed_data["collect_service_metric"] if parsed_data["collect_service_metric"] else False
+        ),  # Metrics collection set to False by default
         "llm_model": llm_model,  # The LLM model used for the test
         "deployment_type": "k8s",  # Default is "k8s", can also be "docker"
         "service_ip": None,  # Leave as None for k8s, specify for Docker
@@ -398,9 +406,15 @@ def run_benchmark(benchmark_config, chart_name, namespace, node_num=1, llm_model
         "dataset": parsed_data["dataset"],
         "prompt": parsed_data["prompt"],
         "llm_max_token_size": parsed_data["llm_max_token_size"],
+        "summary_type": parsed_data["summary_type"],
+        "stream": parsed_data["stream"],
     }
 
-    dataset = None
+    if parsed_data["dataset"]:  # This checks if user provided dataset/document for DocSum service
+        dataset = parsed_data["dataset"]
+    else:
+        dataset = None
+
     query_data = None
     os.environ["MODEL_NAME"] = test_suite_config.get("llm_model", "meta-llama/Meta-Llama-3-8B-Instruct")
     # Do benchmark in for-loop for different llm_max_token_size
@@ -428,6 +442,21 @@ def run_benchmark(benchmark_config, chart_name, namespace, node_num=1, llm_model
                 "max_output": llm_max_token,  # max number of output tokens
                 "k": 1,  # number of retrieved documents
             }
+        if chart_name == "docsum":
+            case_data = {
+                "run_test": True,
+                "service_name": "docsum",
+                "service_list": [
+                    "docsum",
+                    "docsum-llm-uservice",
+                    "docsum-vllm",
+                ],
+                "stream": parsed_data["stream"],
+                "max_output": llm_max_token,  # max number of output tokens
+                "summary_type": parsed_data["summary_type"],  # Summary_type for DocSum
+                "dataset": dataset,  # Dataset used for document summary
+            }
+
         output_folder = _run_service_test(chart_name, case_data, test_suite_config, namespace)
 
     print(f"[OPEA BENCHMARK] 🚀 Test Finished. Output saved in {output_folder}.")
diff --git a/deploy.py b/deploy.py
index bd3a8a87d..6c7da7474 100644
--- a/deploy.py
+++ b/deploy.py
@@ -177,6 +177,8 @@ def configure_models(values, deploy_config):
             values[service_name]["EMBEDDING_MODEL_ID"] = model_id
         elif service_name == "teirerank":
            values[service_name]["RERANK_MODEL_ID"] = model_id
+        elif service_name == "llm-uservice":
+            values[service_name]["LLM_MODEL_ID"] = model_id
 
     return values
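Note for reviewers: the sketch below is not part of this diff. It is a minimal, standalone illustration of how the new DocSum-specific keys in `DocSum/benchmark_docsum.yaml` are expected to flow into the test-suite config, mirroring the lookups `construct_benchmark_config()` performs after this change and using the same defaults. The file path and the final `print` are illustrative assumptions only.

```python
# Minimal sketch (assumes it is run from the repo root, so that
# DocSum/benchmark_docsum.yaml from this PR is on disk; PyYAML is already a
# dependency of benchmark.py). It only reads the config and echoes the
# DocSum-related fields with the same defaults used in the diff above.
import yaml

with open("DocSum/benchmark_docsum.yaml") as f:
    config = yaml.safe_load(f)

benchmark = config.get("benchmark", {})
docsum_fields = {
    "bench_target": benchmark.get("bench_target", []),          # ["docsumfixed"] in the sample config
    "dataset": benchmark.get("dataset", ""),                     # absolute path to the document to summarize
    "summary_type": benchmark.get("summary_type", "auto"),       # "stuff" in the sample config
    "stream": benchmark.get("stream", "auto"),                    # same default as the diff
    "collect_service_metric": benchmark.get("collect_service_metric", False),
    "llm_max_token_size": benchmark.get("llm", {}).get("max_token_size", [128]),
}
print(docsum_fields)
```

A non-empty `dataset` is what switches `benchmark.py` into the DocSum path above (otherwise `dataset` stays `None`), so this is a quick way to confirm the YAML is picked up as intended before launching a full run.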