Add Telemetry support for AgentQnA using Grafana, Prometheus and Jaeger (#1732)
Signed-off-by: louie tsai <louie.tsai@intel.com> Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
tei-embedding-service:
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
tei-reranking-service:
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:1.67.0
|
||||
container_name: jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
- "4317:4317"
|
||||
- "4318:4318"
|
||||
- "9411:9411"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
COLLECTOR_ZIPKIN_HOST_PORT: 9411
|
||||
restart: unless-stopped
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.52.0
|
||||
container_name: prometheus
|
||||
user: root
|
||||
volumes:
|
||||
- ./prometheus.yaml:/etc/prometheus/prometheus.yaml
|
||||
- ./prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yaml'
|
||||
ports:
|
||||
- '9091:9090'
|
||||
ipc: host
|
||||
restart: unless-stopped
|
||||
grafana:
|
||||
image: grafana/grafana:11.0.0
|
||||
container_name: grafana
|
||||
volumes:
|
||||
- ./grafana_data:/var/lib/grafana
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
user: root
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_PASSWORD: admin
|
||||
GF_RENDERING_CALLBACK_URL: http://grafana:3000/
|
||||
GF_LOG_FILTERS: rendering:debug
|
||||
depends_on:
|
||||
- prometheus
|
||||
ports:
|
||||
- '3000:3000'
|
||||
ipc: host
|
||||
restart: unless-stopped
|
||||
node-exporter:
|
||||
image: prom/node-exporter
|
||||
container_name: node-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- --collector.filesystem.ignored-mount-points
|
||||
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
|
||||
ports:
|
||||
- 9100:9100
|
||||
restart: always
|
||||
deploy:
|
||||
mode: global
|
||||
gaudi-exporter:
|
||||
image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.19.2-32
|
||||
container_name: gaudi-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
- /dev:/dev
|
||||
ports:
|
||||
- 41612:41611
|
||||
restart: always
|
||||
deploy:
|
||||
mode: global
|
||||
worker-rag-agent:
|
||||
environment:
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
worker-sql-agent:
|
||||
environment:
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
supervisor-react-agent:
|
||||
environment:
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
@@ -0,0 +1,10 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
rm *.json
|
||||
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/chatqna_megaservice_grafana.json
|
||||
mv chatqna_megaservice_grafana.json agentqna_microervices_grafana.json
|
||||
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
|
||||
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
|
||||
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
|
||||
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/gaudi_grafana.json
|
||||
@@ -0,0 +1,14 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
@@ -0,0 +1,54 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# config file version
|
||||
apiVersion: 1
|
||||
|
||||
# list of datasources that should be deleted from the database
|
||||
deleteDatasources:
|
||||
- name: Prometheus
|
||||
orgId: 1
|
||||
|
||||
# list of datasources to insert/update depending
|
||||
# what's available in the database
|
||||
datasources:
|
||||
# <string, required> name of the datasource. Required
|
||||
- name: Prometheus
|
||||
# <string, required> datasource type. Required
|
||||
type: prometheus
|
||||
# <string, required> access mode. direct or proxy. Required
|
||||
access: proxy
|
||||
# <int> org id. will default to orgId 1 if not specified
|
||||
orgId: 1
|
||||
# <string> url
|
||||
url: http://prometheus:9090
|
||||
# <string> database password, if used
|
||||
password:
|
||||
# <string> database user, if used
|
||||
user:
|
||||
# <string> database name, if used
|
||||
database:
|
||||
# <bool> enable/disable basic auth
|
||||
basicAuth: false
|
||||
# <string> basic auth username, if used
|
||||
basicAuthUser:
|
||||
# <string> basic auth password, if used
|
||||
basicAuthPassword:
|
||||
# <bool> enable/disable with credentials headers
|
||||
withCredentials:
|
||||
# <bool> mark as default datasource. Max one per org
|
||||
isDefault: true
|
||||
# <map> fields that will be converted to json and stored in json_data
|
||||
jsonData:
|
||||
httpMethod: GET
|
||||
graphiteVersion: "1.1"
|
||||
tlsAuth: false
|
||||
tlsAuthWithCACert: false
|
||||
# <string> json object of data that will be encrypted.
|
||||
secureJsonData:
|
||||
tlsCACert: "..."
|
||||
tlsClientCert: "..."
|
||||
tlsClientKey: "..."
|
||||
version: 1
|
||||
# <bool> allow users to edit datasources from the UI.
|
||||
editable: true
|
||||
55
AgentQnA/docker_compose/intel/hpu/gaudi/prometheus.yaml
Normal file
55
AgentQnA/docker_compose/intel/hpu/gaudi/prometheus.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
global:
|
||||
scrape_interval: 5s
|
||||
external_labels:
|
||||
monitor: "my-monitor"
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["prometheus:9090"]
|
||||
- job_name: "vllm"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["vllm-gaudi-server:8000"]
|
||||
- job_name: "tgi"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["tgi-gaudi-server:80"]
|
||||
- job_name: "tei-embedding"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["tei-embedding-server:80"]
|
||||
- job_name: "tei-reranking"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["tei-reranking-server:80"]
|
||||
- job_name: "retriever"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["retriever:7000"]
|
||||
- job_name: "dataprep-redis-service"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["dataprep-redis-service:5000"]
|
||||
- job_name: "prometheus-node-exporter"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["node-exporter:9100"]
|
||||
- job_name: "prometheus-gaudi-exporter"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["gaudi-exporter:41611"]
|
||||
- job_name: "supervisor-react-agent"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["react-agent-endpoint:9090"]
|
||||
- job_name: "worker-rag-agent"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["rag-agent-endpoint:9095"]
|
||||
- job_name: "worker-sql-agent"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["sql-agent-endpoint:9096"]
|
||||
@@ -64,6 +64,9 @@ export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
|
||||
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
|
||||
export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get"
|
||||
export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete"
|
||||
# Set OpenTelemetry Tracing Endpoint
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
|
||||
|
||||
export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,127.0.0.1,localhost,0.0.0.0,$host_ip"
|
||||
export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,node-exporter,gaudi-exporter,127.0.0.1,localhost,0.0.0.0,$host_ip,,$JAEGER_IP"
|
||||
|
||||
Reference in New Issue
Block a user