Merge branch '1.12.1-otel-ee' into deploy/enterprise

2026-03-06 15:45:14 +00:00 · 2026-02-28 17:41:17 -08:00
parent 7a8c96b4b7 5e57f73598
commit 9bd938b4e1
4 changed files with 453 additions and 758 deletions
--- a/api/core/ops/ops_trace_manager.py
+++ b/api/core/ops/ops_trace_manager.py
@@ -945,6 +945,17 @@ class TraceTask:
                        "embedding_model_provider": row[2] or "",
                    }

+        # Extract rerank model info from retrieval_model kwargs
+        rerank_model_provider = ""
+        rerank_model_name = ""
+        if "retrieval_model" in kwargs:
+            retrieval_model = kwargs["retrieval_model"]
+            if isinstance(retrieval_model, dict):
+                reranking_model = retrieval_model.get("reranking_model")
+                if isinstance(reranking_model, dict):
+                    rerank_model_provider = reranking_model.get("reranking_provider_name", "")
+                    rerank_model_name = reranking_model.get("reranking_model_name", "")
+
        metadata = {
            "message_id": message_id,
            "ls_provider": message_data.model_provider,
@@ -961,6 +972,8 @@ class TraceTask:
            "app_name": app_name,
            "workspace_name": workspace_name,
            "embedding_models": embedding_models,
+            "rerank_model_provider": rerank_model_provider,
+            "rerank_model_name": rerank_model_name,
        }
        if node_execution_id := kwargs.get("node_execution_id"):
            metadata["node_execution_id"] = node_execution_id
--- a/api/enterprise/telemetry/DATA_DICTIONARY.md
+++ b/api/enterprise/telemetry/DATA_DICTIONARY.md
--- a/api/enterprise/telemetry/README.md
+++ b/api/enterprise/telemetry/README.md
@@ -0,0 +1,116 @@
+# Dify Enterprise Telemetry
+
+This document provides an overview of the Dify Enterprise OpenTelemetry (OTEL) exporter and how to configure it for integration with observability stacks like Prometheus, Grafana, Jaeger, or Honeycomb.
+
+## Overview
+
+Dify Enterprise uses a "slim span + rich companion log" architecture to provide high-fidelity observability without overwhelming trace storage.
+
+- **Traces (Spans)**: Capture the structure, identity, and timing of high-level operations (Workflows and Nodes).
+- **Structured Logs**: Provide deep context (inputs, outputs, metadata) for every event, correlated to spans via `trace_id` and `span_id`.
+- **Metrics**: Provide counters and histograms for usage, performance, and error tracking. Metrics are never sampled, so they reflect 100% of events regardless of the trace sampling rate.
+
+### Signal Architecture
+
+```mermaid
+graph TD
+    A[Workflow Run] -->|Span| B(dify.workflow.run)
+    A -->|Log| C(dify.workflow.run detail)
+    B ---|trace_id| C
+    
+    D[Node Execution] -->|Span| E(dify.node.execution)
+    D -->|Log| F(dify.node.execution detail)
+    E ---|span_id| F
+    
+    G[Message/Tool/etc] -->|Log| H(dify.* event)
+    G -->|Metric| I(dify.* counter/histogram)
+```
+
+## Configuration
+
+The Enterprise OTEL exporter is configured via environment variables.
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `ENTERPRISE_ENABLED` | Master switch for all enterprise features. | `false` |
+| `ENTERPRISE_TELEMETRY_ENABLED` | Master switch for enterprise telemetry. | `false` |
+| `ENTERPRISE_OTLP_ENDPOINT` | OTLP collector endpoint (e.g., `http://otel-collector:4318`). | - |
+| `ENTERPRISE_OTLP_HEADERS` | Custom headers for OTLP requests (e.g., `x-scope-orgid=tenant1`). | - |
+| `ENTERPRISE_OTLP_PROTOCOL` | OTLP transport protocol (`http` or `grpc`). | `http` |
+| `ENTERPRISE_OTLP_API_KEY` | Bearer token for authentication. | - |
+| `ENTERPRISE_INCLUDE_CONTENT` | Whether to include sensitive content (inputs/outputs) in logs. | `true` |
+| `ENTERPRISE_SERVICE_NAME` | Service name reported to OTEL. | `dify` |
+| `ENTERPRISE_OTEL_SAMPLING_RATE` | Sampling rate for traces (0.0 to 1.0). Metrics are always 100%. | `1.0` |
+
+## Correlation Model
+
+Dify uses deterministic ID generation to ensure signals are correlated across different services and asynchronous tasks.
+
+### ID Generation Rules
+- `trace_id`: Derived from the correlation ID (workflow_run_id or node_execution_id for drafts) using `int(UUID(correlation_id))`
+- `span_id`: Derived from the source ID using `SHA256(source_id)[:8]`
+
+### Scenario A: Simple Workflow
+A single workflow run with multiple nodes. All spans and logs share the same `trace_id` (derived from `workflow_run_id`).
+
+```
+trace_id = UUID(workflow_run_id)
+├── [root span] dify.workflow.run (span_id = hash(workflow_run_id))
+│   ├── [child] dify.node.execution - "Start" (span_id = hash(node_exec_id_1))
+│   ├── [child] dify.node.execution - "LLM" (span_id = hash(node_exec_id_2))
+│   └── [child] dify.node.execution - "End" (span_id = hash(node_exec_id_3))
+```
+
+### Scenario B: Nested Sub-Workflow
+A workflow calling another workflow via a Tool or Sub-workflow node. The child workflow's spans are linked to the parent via `parent_span_id`, and both workflows share the same `trace_id` (derived from the outer `workflow_run_id`).
+
+```
+trace_id = UUID(outer_workflow_run_id)     ← shared across both workflows
+├── [root] dify.workflow.run (outer) (span_id = hash(outer_workflow_run_id))
+│   ├── dify.node.execution - "Start Node"
+│   ├── dify.node.execution - "Tool Node" (triggers sub-workflow)
+│   │   └── [child] dify.workflow.run (inner) (span_id = hash(inner_workflow_run_id))
+│   │       ├── dify.node.execution - "Inner Start"
+│   │       └── dify.node.execution - "Inner End"
+│   └── dify.node.execution - "End Node"
+```
+
+**Key attributes for nested workflows:**
+- Inner workflow's `dify.parent.trace_id` = outer `workflow_run_id`
+- Inner workflow's `dify.parent.node.execution_id` = tool node's `execution_id`
+- Inner workflow's `dify.parent.workflow.run_id` = outer `workflow_run_id`
+- Inner workflow's `dify.parent.app.id` = outer `app_id`
+
+### Scenario C: Draft Node Execution
+A single node run in isolation (debugger/preview mode). It creates its own trace where the node span is the root.
+
+```
+trace_id = UUID(node_execution_id)   ← own trace, NOT part of any workflow
+└── dify.node.execution.draft (span_id = hash(node_execution_id))
+```
+
+**Key difference:** Draft executions use `node_execution_id` as the correlation ID, so they are NOT children of any workflow trace.
+
+## Content Gating
+
+When `ENTERPRISE_INCLUDE_CONTENT` is set to `false`, sensitive content attributes (inputs, outputs, queries) are replaced with reference strings (e.g., `ref:workflow_run_id=...`) to prevent data leakage to the OTEL collector.
+
+**Reference String Format:**
+
+```
+ref:{id_type}={uuid}
+```
+
+**Examples:**
+
+```
+ref:workflow_run_id=550e8400-e29b-41d4-a716-446655440000
+ref:node_execution_id=660e8400-e29b-41d4-a716-446655440001
+ref:message_id=770e8400-e29b-41d4-a716-446655440002
+```
+
+To retrieve the actual content when content is gated (`ENTERPRISE_INCLUDE_CONTENT=false`), query the Dify database using the UUID from the reference string.
+
+## Reference
+
+For a complete list of telemetry signals, attributes, and data structures, see [DATA_DICTIONARY.md](./DATA_DICTIONARY.md).
--- a/api/enterprise/telemetry/enterprise_trace.py
+++ b/api/enterprise/telemetry/enterprise_trace.py
@@ -419,9 +419,11 @@ class EnterpriseOtelTrace:
                **labels,
                type=request_type,
                status=info.status,
+                model_name=info.model_name or "",
            ),
        )
        duration_labels = dict(labels)
+        duration_labels["model_name"] = info.model_name or ""
        plugin_name = metadata.get("plugin_name")
        if plugin_name and info.node_type in {"tool", "knowledge-retrieval"}:
            duration_labels["plugin_name"] = plugin_name
@@ -434,6 +436,7 @@ class EnterpriseOtelTrace:
                self._labels(
                    **labels,
                    type=request_type,
+                    model_name=info.model_name or "",
                ),
            )

@@ -674,6 +677,8 @@ class EnterpriseOtelTrace:
            self._labels(
                **labels,
                type="suggested_question",
+                model_provider=info.model_provider or "",
+                model_name=info.model_id or "",
            ),
        )

@@ -738,6 +743,13 @@ class EnterpriseOtelTrace:
            attrs["dify.dataset.embedding_providers"] = self._maybe_json(providers)
            attrs["dify.dataset.embedding_models"] = self._maybe_json(models)

+        # Add rerank model to logs
+        rerank_provider = metadata.get("rerank_model_provider", "")
+        rerank_model = metadata.get("rerank_model_name", "")
+        if rerank_provider or rerank_model:
+            attrs["dify.retrieval.rerank_provider"] = rerank_provider
+            attrs["dify.retrieval.rerank_model"] = rerank_model
+
        ref = f"ref:message_id={info.message_id}"
        retrieval_inputs = self._safe_payload_value(info.inputs)
        attrs["dify.retrieval.query"] = self._content_or_ref(retrieval_inputs, ref)
@@ -766,12 +778,25 @@ class EnterpriseOtelTrace:
        )

        for did in dataset_ids:
+            # Get embedding model for this specific dataset
+            ds_embedding_info = embedding_models.get(did, {})
+            embedding_provider = ds_embedding_info.get("embedding_model_provider", "")
+            embedding_model = ds_embedding_info.get("embedding_model", "")
+
+            # Get rerank model (same for all datasets in this retrieval)
+            rerank_provider = metadata.get("rerank_model_provider", "")
+            rerank_model = metadata.get("rerank_model_name", "")
+
            self._exporter.increment_counter(
                EnterpriseTelemetryCounter.DATASET_RETRIEVALS,
                1,
                self._labels(
                    **labels,
                    dataset_id=did,
+                    embedding_model_provider=embedding_provider,
+                    embedding_model=embedding_model,
+                    rerank_model_provider=rerank_provider,
+                    rerank_model=rerank_model,
                ),
            )