Files
GenAIExamples/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/deploy.py
2025-01-06 09:16:20 +08:00

279 lines
11 KiB
Python

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import argparse
import glob
import json
import os
import shutil
import subprocess
import sys
from generate_helm_values import generate_helm_values
def run_kubectl_command(command):
"""Run a kubectl command and return the output."""
try:
result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
return result.stdout
except subprocess.CalledProcessError as e:
print(f"Error running command: {command}\n{e.stderr}")
exit(1)
def get_all_nodes():
"""Get the list of all nodes in the Kubernetes cluster."""
command = ["kubectl", "get", "nodes", "-o", "json"]
output = run_kubectl_command(command)
nodes = json.loads(output)
return [node["metadata"]["name"] for node in nodes["items"]]
def add_label_to_node(node_name, label):
"""Add a label to the specified node."""
command = ["kubectl", "label", "node", node_name, label, "--overwrite"]
print(f"Labeling node {node_name} with {label}...")
run_kubectl_command(command)
print(f"Label {label} added to node {node_name} successfully.")
def add_labels_to_nodes(node_count=None, label=None, node_names=None):
"""Add a label to the specified number of nodes or to specified nodes."""
if node_names:
# Add label to the specified nodes
for node_name in node_names:
add_label_to_node(node_name, label)
else:
# Fetch the node list and label the specified number of nodes
all_nodes = get_all_nodes()
if node_count is None or node_count > len(all_nodes):
print(f"Error: Node count exceeds the number of available nodes ({len(all_nodes)} available).")
sys.exit(1)
selected_nodes = all_nodes[:node_count]
for node_name in selected_nodes:
add_label_to_node(node_name, label)
def clear_labels_from_nodes(label, node_names=None):
"""Clear the specified label from specific nodes if provided, otherwise from all nodes."""
label_key = label.split("=")[0] # Extract key from 'key=value' format
# If specific nodes are provided, use them; otherwise, get all nodes
nodes_to_clear = node_names if node_names else get_all_nodes()
for node_name in nodes_to_clear:
# Check if the node has the label by inspecting its metadata
command = ["kubectl", "get", "node", node_name, "-o", "json"]
node_info = run_kubectl_command(command)
node_metadata = json.loads(node_info)
# Check if the label exists on this node
labels = node_metadata["metadata"].get("labels", {})
if label_key in labels:
# Remove the label from the node
command = ["kubectl", "label", "node", node_name, f"{label_key}-"]
print(f"Removing label {label_key} from node {node_name}...")
run_kubectl_command(command)
print(f"Label {label_key} removed from node {node_name} successfully.")
else:
print(f"Label {label_key} not found on node {node_name}, skipping.")
def install_helm_release(release_name, chart_name, namespace, values_file, device_type):
"""Deploy a Helm release with a specified name and chart.
Parameters:
- release_name: The name of the Helm release.
- chart_name: The Helm chart name or path, e.g., "opea/chatqna".
- namespace: The Kubernetes namespace for deployment.
- values_file: The user values file for deployment.
- device_type: The device type (e.g., "gaudi") for specific configurations (optional).
"""
# Check if the namespace exists; if not, create it
try:
# Check if the namespace exists
command = ["kubectl", "get", "namespace", namespace]
subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except subprocess.CalledProcessError:
# Namespace does not exist, create it
print(f"Namespace '{namespace}' does not exist. Creating it...")
command = ["kubectl", "create", "namespace", namespace]
subprocess.run(command, check=True)
print(f"Namespace '{namespace}' created successfully.")
# Handle gaudi-specific values file if device_type is "gaudi"
hw_values_file = None
untar_dir = None
if device_type == "gaudi":
print("Device type is gaudi. Pulling Helm chart to get gaudi-values.yaml...")
# Combine chart_name with fixed prefix
chart_pull_url = f"oci://ghcr.io/opea-project/charts/{chart_name}"
# Pull and untar the chart
subprocess.run(["helm", "pull", chart_pull_url, "--untar"], check=True)
# Find the untarred directory
untar_dirs = glob.glob(f"{chart_name}*")
if untar_dirs:
untar_dir = untar_dirs[0]
hw_values_file = os.path.join(untar_dir, "gaudi-values.yaml")
print("gaudi-values.yaml pulled and ready for use.")
else:
print(f"Error: Could not find untarred directory for {chart_name}")
return
# Prepare the Helm install command
command = ["helm", "install", release_name, chart_name, "--namespace", namespace]
# Append additional values file for gaudi if it exists
if hw_values_file:
command.extend(["-f", hw_values_file])
# Append the main values file
command.extend(["-f", values_file])
# Execute the Helm install command
try:
print(f"Running command: {' '.join(command)}") # Print full command for debugging
subprocess.run(command, check=True)
print("Deployment initiated successfully.")
except subprocess.CalledProcessError as e:
print(f"Error occurred while deploying Helm release: {e}")
# Cleanup: Remove the untarred directory
if untar_dir and os.path.isdir(untar_dir):
print(f"Removing temporary directory: {untar_dir}")
shutil.rmtree(untar_dir)
print("Temporary directory removed successfully.")
def uninstall_helm_release(release_name, namespace=None):
"""Uninstall a Helm release and clean up resources, optionally delete the namespace if not 'default'."""
# Default to 'default' namespace if none is specified
if not namespace:
namespace = "default"
try:
# Uninstall the Helm release
command = ["helm", "uninstall", release_name, "--namespace", namespace]
print(f"Uninstalling Helm release {release_name} in namespace {namespace}...")
run_kubectl_command(command)
print(f"Helm release {release_name} uninstalled successfully.")
# If the namespace is specified and not 'default', delete it
if namespace != "default":
print(f"Deleting namespace {namespace}...")
delete_namespace_command = ["kubectl", "delete", "namespace", namespace]
run_kubectl_command(delete_namespace_command)
print(f"Namespace {namespace} deleted successfully.")
else:
print("Namespace is 'default', skipping deletion.")
except subprocess.CalledProcessError as e:
print(f"Error occurred while uninstalling Helm release or deleting namespace: {e}")
def main():
parser = argparse.ArgumentParser(description="Manage Helm Deployment.")
parser.add_argument(
"--release-name",
type=str,
default="chatqna",
help="The Helm release name created during deployment (default: chatqna).",
)
parser.add_argument(
"--chart-name",
type=str,
default="chatqna",
help="The chart name to deploy, composed of repo name and chart name (default: chatqna).",
)
parser.add_argument("--namespace", default="default", help="Kubernetes namespace (default: default).")
parser.add_argument("--hf-token", help="Hugging Face API token.")
parser.add_argument(
"--model-dir", help="Model directory, mounted as volumes for service access to pre-downloaded models"
)
parser.add_argument("--user-values", help="Path to a user-specified values.yaml file.")
parser.add_argument(
"--create-values-only", action="store_true", help="Only create the values.yaml file without deploying."
)
parser.add_argument("--uninstall", action="store_true", help="Uninstall the Helm release.")
parser.add_argument("--num-nodes", type=int, default=1, help="Number of nodes to use (default: 1).")
parser.add_argument("--node-names", nargs="*", help="Optional specific node names to label.")
parser.add_argument("--add-label", action="store_true", help="Add label to specified nodes if this flag is set.")
parser.add_argument(
"--delete-label", action="store_true", help="Delete label from specified nodes if this flag is set."
)
parser.add_argument(
"--label", default="node-type=opea-benchmark", help="Label to add/delete (default: node-type=opea-benchmark)."
)
parser.add_argument("--with-rerank", action="store_true", help="Include rerank service in the deployment.")
parser.add_argument(
"--tuned",
action="store_true",
help="Modify resources for services and change extraCmdArgs when creating values.yaml.",
)
parser.add_argument(
"--device-type",
type=str,
choices=["cpu", "gaudi"],
default="gaudi",
help="Specify the device type for deployment (choices: 'cpu', 'gaudi'; default: gaudi).",
)
args = parser.parse_args()
# Adjust num-nodes based on node-names if specified
if args.node_names:
num_node_names = len(args.node_names)
if args.num_nodes != 1 and args.num_nodes != num_node_names:
parser.error("--num-nodes must match the number of --node-names if both are specified.")
else:
args.num_nodes = num_node_names
# Node labeling management
if args.add_label:
add_labels_to_nodes(args.num_nodes, args.label, args.node_names)
return
elif args.delete_label:
clear_labels_from_nodes(args.label, args.node_names)
return
# Uninstall Helm release if specified
if args.uninstall:
uninstall_helm_release(args.release_name, args.namespace)
return
# Prepare values.yaml if not uninstalling
if args.user_values:
values_file_path = args.user_values
else:
if not args.hf_token:
parser.error("--hf-token are required")
node_selector = {args.label.split("=")[0]: args.label.split("=")[1]}
values_file_path = generate_helm_values(
with_rerank=args.with_rerank,
num_nodes=args.num_nodes,
hf_token=args.hf_token,
model_dir=args.model_dir,
node_selector=node_selector,
tune=args.tuned,
)
# Read back the generated YAML file for verification
with open(values_file_path, "r") as file:
print("Generated YAML contents:")
print(file.read())
# Deploy unless --create-values-only is specified
if not args.create_values_only:
install_helm_release(args.release_name, args.chart_name, args.namespace, values_file_path, args.device_type)
if __name__ == "__main__":
main()