mirror of https://github.com/dapr/dapr-agents.git
267 lines
11 KiB
Python
267 lines
11 KiB
Python
"""
|
|
Thread-safe context storage for Dapr Workflow OpenTelemetry context propagation.
|
|
|
|
This module provides a global storage mechanism for OpenTelemetry contexts
|
|
that can cross Dapr Workflow boundaries without modifying function signatures.
|
|
|
|
The key challenge with Dapr Workflows is that OpenTelemetry context doesn't
|
|
naturally propagate across workflow task boundaries due to the Dapr runtime's
|
|
serialization/deserialization process. This storage provides a non-invasive
|
|
solution by storing W3C Trace Context data that can be retrieved by workflow
|
|
tasks using their instance ID.
|
|
|
|
Architecture:
|
|
- Store W3C context during workflow task creation (in instrumentor.py)
|
|
- Retrieve context during workflow task execution (in workflow_task.py)
|
|
- Use thread-safe storage to handle concurrent workflow executions
|
|
- Automatic cleanup to prevent memory leaks from completed workflows
|
|
"""
|
|
|
|
import logging
|
|
import threading
|
|
from typing import Dict, Optional, Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class WorkflowContextStorage:
    """
    Thread-safe, in-memory storage for workflow OpenTelemetry contexts.

    Contexts are kept in a plain dictionary keyed by workflow instance ID and
    every access is guarded by a re-entrant lock, so several threads may store,
    read and clean up contexts concurrently. The stored values are dictionaries
    of W3C Trace Context data (``traceparent``/``tracestate`` plus optional
    debug fields).

    Key features:
    - Thread-safe operations using RLock for concurrent workflow execution
    - Instance ID-based storage for precise context retrieval
    - W3C Trace Context format support (traceparent/tracestate)
    - Explicit cleanup to keep the storage from growing without bound
    - Debug statistics for monitoring storage usage

    Intended usage pattern:
    1. Store context when a workflow task is created
    2. Retrieve context when the workflow task executes
    3. Clean up context when the workflow completes

    Nothing is persisted: the contents are lost when the process exits.
    """

    def __init__(self) -> None:
        """
        Initialize an empty storage together with the lock that guards it.

        An RLock is used so that a method holding the lock may call another
        locking method on the same instance without deadlocking.
        """
        self._storage: Dict[str, Dict[str, Any]] = {}
        self._lock = threading.RLock()

    def store_context(self, instance_id: str, otel_context: Dict[str, Any]) -> None:
        """
        Store OpenTelemetry context data for a workflow instance.

        Any context previously stored under the same instance ID is replaced.
        The dictionary is stored by reference, not copied.

        Args:
            instance_id (str): Unique workflow instance ID used as the storage key
            otel_context (Dict[str, Any]): W3C context data (traceparent,
                tracestate and any debug info) to keep for the instance
        """
        with self._lock:
            self._storage[instance_id] = otel_context
            logger.debug(f"🔗 Stored context for instance {instance_id}")

    def get_context(self, instance_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve the stored OpenTelemetry context data for a workflow instance.

        The entry is left in the storage; use ``cleanup_context`` to remove it.
        A warning is logged when nothing is stored for the instance.

        Args:
            instance_id (str): Unique workflow instance ID used as the storage key

        Returns:
            Optional[Dict[str, Any]]: The stored W3C context data, or None if no
                context is stored for the instance
        """
        with self._lock:
            context = self._storage.get(instance_id)
            # Test for presence, not truthiness: an empty dict is still a
            # stored context and must not be reported as missing.
            if context is not None:
                logger.debug(f"🔗 Retrieved context for instance {instance_id}")
            else:
                logger.warning(f"⚠️ No context found for instance {instance_id}")
            return context

    def create_resumed_workflow_context(
        self, instance_id: str, agent_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Create and store a new trace context for a resumed workflow.

        Because the storage is in-memory only, contexts are lost when the app
        restarts. This method starts a fresh AGENT span for the workflow,
        captures its W3C Trace Context headers and stores them under the
        instance ID, so the resumed workflow can still be traced even though
        it is not connected to the original trace.

        This is best-effort: any failure (for example OpenTelemetry not being
        importable) is logged and a placeholder context is returned instead of
        raising. The placeholder is not stored.

        Args:
            instance_id (str): Unique workflow instance ID
            agent_name (Optional[str]): Agent name used for the span name and the
                ``agent.name`` attribute. When omitted, the span is named after
                "DurableAgent" and no ``agent.name`` attribute is set.

        Returns:
            Dict[str, Any]: New W3C context data for the resumed workflow with
                the keys traceparent, tracestate, instance_id, resumed and
                debug_info. On failure traceparent and tracestate are None and
                an ``error`` key holds the exception text.
        """
        try:
            # Imported lazily so that a missing tracing dependency only
            # degrades this method instead of breaking module import.
            from opentelemetry import trace
            from opentelemetry.trace.propagation.tracecontext import (
                TraceContextTextMapPropagator,
            )

            from .constants import OPENINFERENCE_SPAN_KIND

            tracer = trace.get_tracer(__name__)

            # Create AGENT span with proper agent name for resumed workflow
            agent_display_name = agent_name or "DurableAgent"
            span_name = f"{agent_display_name}.ToolCallingWorkflow"
            with tracer.start_as_current_span(span_name) as span:
                span.set_attribute(OPENINFERENCE_SPAN_KIND, "AGENT")
                span.set_attribute("workflow.instance_id", instance_id)
                span.set_attribute("workflow.resumed", True)
                # None is not a valid span attribute value, so only record
                # the agent name when the caller actually supplied one.
                if agent_name is not None:
                    span.set_attribute("agent.name", agent_name)

                # Inject while the new span is current so the carrier
                # receives this span's traceparent/tracestate.
                carrier: Dict[str, str] = {}
                TraceContextTextMapPropagator().inject(carrier)

                context_data = {
                    "traceparent": carrier.get("traceparent"),
                    "tracestate": carrier.get("tracestate"),
                    "instance_id": instance_id,
                    "resumed": True,
                    "debug_info": f"New trace created for resumed workflow {instance_id}",
                }

                # Store the new context
                self.store_context(instance_id, context_data)
                logger.info(
                    f"Created new trace context for resumed workflow {instance_id}"
                )

                return context_data

        except Exception as e:
            # Deliberately broad: tracing must never break workflow execution.
            logger.error(
                f"Failed to create resumed workflow context for {instance_id}: {e}"
            )
            return {
                "traceparent": None,
                "tracestate": None,
                "instance_id": instance_id,
                "resumed": True,
                "error": str(e),
            }

    def cleanup_context(self, instance_id: str) -> None:
        """
        Remove the stored context for a workflow instance.

        Should be called once a workflow has finished or failed so the storage
        does not grow indefinitely. Calling it for an instance ID that has no
        stored context is a silent no-op.

        Args:
            instance_id (str): Unique workflow instance ID to clean up
        """
        with self._lock:
            if instance_id in self._storage:
                del self._storage[instance_id]
                logger.debug(f"🧹 Cleaned up context for instance {instance_id}")

    def get_storage_stats(self) -> Dict[str, Any]:
        """
        Get statistics about stored contexts for debugging and monitoring.

        The result is a snapshot taken under the lock; the returned list is a
        copy and does not change when the storage changes afterwards.

        Returns:
            Dict[str, Any]: Storage statistics including:
                - stored_instances: Number of currently stored contexts
                - instance_ids: List of workflow instance IDs with stored contexts
        """
        with self._lock:
            return {
                "stored_instances": len(self._storage),
                "instance_ids": list(self._storage.keys()),
            }
|
|
|
|
|
|
# Global instance for workflow context storage across the application.
# The module-level convenience functions below all delegate to this single
# object, so every caller in the process shares the same in-memory store.
_context_storage = WorkflowContextStorage()
|
|
|
|
|
|
def store_workflow_context(instance_id: str, otel_context: Dict[str, Any]) -> None:
    """
    Record trace context for a workflow instance in the shared global storage.

    Thin module-level wrapper: both arguments are forwarded unchanged to the
    ``store_context`` method of the global ``WorkflowContextStorage`` instance.

    Args:
        instance_id (str): Workflow instance ID to file the context under
        otel_context (Dict[str, Any]): W3C context data (traceparent, tracestate
            and any debug components) to keep for that instance
    """
    save = _context_storage.store_context
    save(instance_id, otel_context)
|
|
|
|
|
|
def get_workflow_context(instance_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up the trace context of a workflow instance in the shared global storage.

    Thin module-level wrapper around the ``get_context`` method of the global
    ``WorkflowContextStorage`` instance; the lookup result is passed through
    unchanged.

    Args:
        instance_id (str): Workflow instance ID whose context is requested

    Returns:
        Optional[Dict[str, Any]]: The stored W3C context data
            (traceparent/tracestate), or None if nothing is stored for the
            instance
    """
    stored_context = _context_storage.get_context(instance_id)
    return stored_context
|
|
|
|
|
|
def cleanup_workflow_context(instance_id: str) -> None:
    """
    Drop the trace context of a workflow instance from the shared global storage.

    Thin module-level wrapper around the ``cleanup_context`` method of the
    global ``WorkflowContextStorage`` instance. Call it once a workflow has
    completed so that finished workflows do not accumulate in memory.

    Args:
        instance_id (str): Workflow instance ID whose stored context is removed
    """
    discard = _context_storage.cleanup_context
    discard(instance_id)
|
|
|
|
|
|
def get_context_storage_stats() -> Dict[str, Any]:
    """
    Report what the shared global storage currently holds.

    Thin module-level wrapper around the ``get_storage_stats`` method of the
    global ``WorkflowContextStorage`` instance. Handy when debugging context
    propagation or checking that completed workflows were cleaned up.

    Returns:
        Dict[str, Any]: Storage statistics, i.e. the number of stored
            instances and the list of workflow instance IDs that have a
            stored context
    """
    stats = _context_storage.get_storage_stats()
    return stats
|