# dapr-agents/dapr_agents/observability/context_storage.py
#
# 267 lines
# 11 KiB
# Python

"""
Thread-safe context storage for Dapr Workflow OpenTelemetry context propagation.

This module provides a global storage mechanism for OpenTelemetry contexts
that can cross Dapr Workflow boundaries without modifying function signatures.

The key challenge with Dapr Workflows is that OpenTelemetry context doesn't
naturally propagate across workflow task boundaries due to the Dapr runtime's
serialization/deserialization process. This storage provides a non-invasive
solution by storing W3C Trace Context data that can be retrieved by workflow
tasks using their instance ID.

Architecture:
- Store W3C context during workflow task creation (in instrumentor.py)
- Retrieve context during workflow task execution (in workflow_task.py)
- Use thread-safe storage to handle concurrent workflow executions
- Clean up stored contexts explicitly (cleanup_workflow_context) once a
  workflow completes to prevent memory leaks; nothing is evicted automatically
"""
import logging
import threading
from typing import Dict, Optional, Any
logger = logging.getLogger(__name__)
class WorkflowContextStorage:
    """
    Thread-safe, in-memory store of W3C Trace Context data for workflow instances.

    OpenTelemetry contexts that must survive a Dapr Workflow execution boundary
    are kept here, keyed by workflow instance ID, so that the code executing a
    workflow task can look up the context captured when the task was created.

    Key features:
    - Every access to the underlying dict is guarded by a ``threading.RLock``
    - Instance ID-based storage for precise context retrieval
    - W3C Trace Context format support (traceparent/tracestate)
    - Explicit cleanup via ``cleanup_context`` (nothing is evicted automatically)
    - Debug statistics for monitoring storage usage

    Usage pattern:
    1. Store context during workflow task creation
    2. Retrieve context during workflow task execution
    3. Clean up context when the workflow completes to prevent memory leaks
    """

    def __init__(self):
        """
        Initialize an empty store and the re-entrant lock that guards it.
        """
        # instance_id -> W3C context dict (stored by reference, never copied)
        self._storage: Dict[str, Dict[str, Any]] = {}
        self._lock = threading.RLock()

    def store_context(self, instance_id: str, otel_context: Dict[str, Any]) -> None:
        """
        Store W3C Trace Context data for a workflow instance.

        Any context previously stored under the same ``instance_id`` is
        replaced. The dict is stored as-is (no copy), so later mutations by the
        caller are visible to readers.

        Args:
            instance_id (str): Unique workflow instance ID used as the storage key
            otel_context (Dict[str, Any]): W3C context data (traceparent,
                tracestate, and any debug info) to associate with the instance
        """
        with self._lock:
            self._storage[instance_id] = otel_context
        # Logging does not touch shared state, so it happens outside the lock.
        logger.debug("🔗 Stored context for instance %s", instance_id)

    def get_context(self, instance_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve the stored W3C Trace Context data for a workflow instance.

        Args:
            instance_id (str): Unique workflow instance ID used as the storage key

        Returns:
            Optional[Dict[str, Any]]: The stored context dict, or None if nothing
                is stored for the instance (a warning is logged in that case)
        """
        with self._lock:
            context = self._storage.get(instance_id)

        # Compare against None rather than truthiness: an empty dict is still a
        # stored context and must not be reported as missing.
        if context is not None:
            logger.debug("🔗 Retrieved context for instance %s", instance_id)
        else:
            logger.warning("⚠️ No context found for instance %s", instance_id)
        return context

    def create_resumed_workflow_context(
        self, instance_id: str, agent_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Create and store a new trace context for a resumed workflow.

        The storage is in-memory only, so contexts are lost when the app
        restarts. This method starts a fresh AGENT span for the resumed
        workflow, captures its W3C headers, and stores them so the workflow can
        still be traced, even though it is not connected to the original trace.

        This is best-effort: any failure (for example OpenTelemetry not being
        importable) is logged and reported through the returned dict instead of
        being raised, and nothing is stored in that case.

        Args:
            instance_id (str): Unique workflow instance ID
            agent_name (Optional[str]): Agent name used for the span name and the
                ``agent.name`` attribute. When falsy, the span is named after
                ``"DurableAgent"``; when None, ``agent.name`` is not set.

        Returns:
            Dict[str, Any]: New W3C context data for the resumed workflow with
                keys ``traceparent``, ``tracestate``, ``instance_id``,
                ``resumed`` and ``debug_info``. On failure, ``traceparent`` and
                ``tracestate`` are None and an ``error`` key holds the message.
        """
        try:
            # Imported lazily so this module can be imported without the
            # tracing dependencies; all imports are resolved before the span is
            # started so an import failure cannot leave a stray span behind.
            from opentelemetry import trace
            from opentelemetry.trace.propagation.tracecontext import (
                TraceContextTextMapPropagator,
            )

            from .constants import OPENINFERENCE_SPAN_KIND

            tracer = trace.get_tracer(__name__)
            agent_display_name = agent_name or "DurableAgent"
            span_name = f"{agent_display_name}.ToolCallingWorkflow"

            carrier: Dict[str, str] = {}
            with tracer.start_as_current_span(span_name) as span:
                span.set_attribute(OPENINFERENCE_SPAN_KIND, "AGENT")
                span.set_attribute("workflow.instance_id", instance_id)
                span.set_attribute("workflow.resumed", True)
                # None is not a valid OpenTelemetry attribute value; only
                # record the agent name when the caller actually supplied one.
                if agent_name is not None:
                    span.set_attribute("agent.name", agent_name)

                # Must run while the new span is current so the injected
                # headers describe this span.
                TraceContextTextMapPropagator().inject(carrier)

            context_data = {
                "traceparent": carrier.get("traceparent"),
                "tracestate": carrier.get("tracestate"),
                "instance_id": instance_id,
                "resumed": True,
                "debug_info": f"New trace created for resumed workflow {instance_id}",
            }

            self.store_context(instance_id, context_data)
            logger.info(
                "Created new trace context for resumed workflow %s", instance_id
            )
            return context_data
        except Exception as e:
            # Deliberately broad: tracing problems must never break the
            # workflow itself, so the failure is reported in the result.
            logger.error(
                "Failed to create resumed workflow context for %s: %s",
                instance_id,
                e,
            )
            return {
                "traceparent": None,
                "tracestate": None,
                "instance_id": instance_id,
                "resumed": True,
                "error": str(e),
            }

    def cleanup_context(self, instance_id: str) -> None:
        """
        Remove the stored context for a workflow instance, if there is one.

        Should be called when workflow execution finishes or fails so the
        storage does not grow indefinitely. Calling it for an unknown instance
        ID is a no-op.

        Args:
            instance_id (str): Unique workflow instance ID to clean up
        """
        with self._lock:
            removed = instance_id in self._storage
            if removed:
                del self._storage[instance_id]

        if removed:
            logger.debug("🧹 Cleaned up context for instance %s", instance_id)

    def get_storage_stats(self) -> Dict[str, Any]:
        """
        Return a snapshot of what is currently stored, for debugging and monitoring.

        Returns:
            Dict[str, Any]: Storage statistics including:
                - stored_instances: Number of currently stored contexts
                - instance_ids: List of workflow instance IDs with stored contexts
        """
        with self._lock:
            instance_ids = list(self._storage)
        return {
            "stored_instances": len(instance_ids),
            "instance_ids": instance_ids,
        }
# Global instance for workflow context storage across the application.
# Created once at import time; the module-level convenience functions below
# all read and write this single shared WorkflowContextStorage object.
_context_storage = WorkflowContextStorage()
def store_workflow_context(instance_id: str, otel_context: Dict[str, Any]) -> None:
    """
    Record a workflow instance's W3C Trace Context in the module-wide storage.

    Thin wrapper that forwards both arguments to ``store_context`` on the
    shared ``_context_storage`` object, so callers do not need to hold a
    reference to the storage themselves.

    Args:
        instance_id (str): Unique workflow instance ID from
            WorkflowActivityContext.workflow_id
        otel_context (Dict[str, Any]): W3C context data from
            extract_otel_context() containing traceparent, tracestate, and
            debug components
    """
    shared_store = _context_storage
    shared_store.store_context(instance_id, otel_context)
def get_workflow_context(instance_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up a workflow instance's W3C Trace Context in the module-wide storage.

    Thin wrapper that forwards to ``get_context`` on the shared
    ``_context_storage`` object and hands back whatever it returns.

    Args:
        instance_id (str): Unique workflow instance ID from
            WorkflowActivityContext.workflow_id

    Returns:
        Optional[Dict[str, Any]]: W3C context data with traceparent/tracestate
            headers for creating child spans, or None if not found
    """
    stored_context = _context_storage.get_context(instance_id)
    return stored_context
def cleanup_workflow_context(instance_id: str) -> None:
    """
    Drop a workflow instance's stored context from the module-wide storage.

    Thin wrapper that forwards to ``cleanup_context`` on the shared
    ``_context_storage`` object. Call it once a workflow has completed so that
    long-running applications with many workflow executions do not accumulate
    stale entries.

    Args:
        instance_id (str): Unique workflow instance ID to clean up from storage
    """
    shared_store = _context_storage
    shared_store.cleanup_context(instance_id)
def get_context_storage_stats() -> Dict[str, Any]:
    """
    Report what the module-wide storage currently holds.

    Thin wrapper that forwards to ``get_storage_stats`` on the shared
    ``_context_storage`` object. Useful when debugging context propagation
    issues or checking that completed workflows are being cleaned up.

    Returns:
        Dict[str, Any]: Storage statistics including count of stored instances
            and list of active workflow instance IDs
    """
    stats = _context_storage.get_storage_stats()
    return stats