From 7c135b93bd4a46ae7c04474e2d76c279a386ef1c Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 29 Jul 2022 21:49:47 -0500 Subject: [PATCH] Easier to follow local vs remote span tracing The `incoming-federation-request` vs `process-federation_request` was first introduced in https://github.com/matrix-org/synapse/pull/11870 - Span for remote trace: `incoming-federation-request` - `child_of` reference: `origin_span_context` - `follows_from` reference: `servlet_span` - Span for local trace: `process-federation-request` - `child_of` reference: `servlet_span` (by the nature of it being active) - `follows_from` reference: `incoming-federation-request` --- synapse/federation/transport/server/_base.py | 80 +++++++++++--------- synapse/logging/tracing.py | 62 +++++++++++++-- 2 files changed, 98 insertions(+), 44 deletions(-) diff --git a/synapse/federation/transport/server/_base.py b/synapse/federation/transport/server/_base.py index b35fefc7dc..45d8b2e8ef 100644 --- a/synapse/federation/transport/server/_base.py +++ b/synapse/federation/transport/server/_base.py @@ -27,6 +27,8 @@ from synapse.http.site import SynapseRequest from synapse.logging.context import run_in_background from synapse.logging.tracing import ( get_active_span, + Link, + create_non_recording_span, set_attribute, span_context_from_request, start_active_span, @@ -313,55 +315,61 @@ class BaseFederationServlet: # if the origin is authenticated and whitelisted, use its span context # as the parent. - context = None + origin_span_context = None if origin and whitelisted_homeserver(origin): - context = span_context_from_request(request) + origin_span_context = span_context_from_request(request) - if context: - servlet_span = get_active_span() - # a scope which uses the origin's context as a parent - processing_start_time = time.time() - scope = start_active_span_follows_from( + if origin_span_context: + local_servlet_span = get_active_span() + # Create a span which uses the `origin_span_context` as a parent + # so we can see how the incoming payload was processed while + # we're looking at the outgoing trace. Since the parent is set + # to a remote span (from the origin), it won't show up in the + # local trace which is why we create another span below for the + # local trace. A span can only have one parent so we have to + # create two separate ones. + remote_parent_span = start_active_span( "incoming-federation-request", - child_of=context, - contexts=(servlet_span,), - start_time=processing_start_time, + context=origin_span_context, + # Cross-link back to the local trace so we can jump + # to the incoming side from the remote origin trace. + links=[Link(local_servlet_span.get_span_context())], + ) + + # Create a local span to appear in the local trace + local_parent_span = start_active_span( + "process-federation-request", + # Cross-link back to the remote outgoing trace so we jump over + # there. + links=[Link(remote_parent_span.get_span_context())], ) else: - # just use our context as a parent - scope = start_active_span( - "incoming-federation-request", + # Otherwise just use our local context as a parent + local_parent_span = start_active_span( + "process-federation-request", ) - try: - with scope: - if origin and self.RATELIMIT: - with ratelimiter.ratelimit(origin) as d: - await d - if request._disconnected: - logger.warning( - "client disconnected before we started processing " - "request" - ) - return None - response = await func( - origin, content, request.args, *args, **kwargs + # Don't need to record anything for the remote + remote_parent_span = create_non_recording_span() + + with remote_parent_span, local_parent_span: + if origin and self.RATELIMIT: + with ratelimiter.ratelimit(origin) as d: + await d + if request._disconnected: + logger.warning( + "client disconnected before we started processing " + "request" ) - else: + return None response = await func( origin, content, request.args, *args, **kwargs ) - finally: - # if we used the origin's context as the parent, add a new span using - # the servlet span as a parent, so that we have a link - if context: - scope2 = start_active_span_follows_from( - "process-federation_request", - contexts=(scope.span,), - start_time=processing_start_time, + else: + response = await func( + origin, content, request.args, *args, **kwargs ) - scope2.close() return response diff --git a/synapse/logging/tracing.py b/synapse/logging/tracing.py index 5aa05ee9a2..59f4ab66d6 100644 --- a/synapse/logging/tracing.py +++ b/synapse/logging/tracing.py @@ -162,6 +162,7 @@ Gotchas than one caller? Will all of those calling functions have be in a context with an active span? """ +from abc import ABC import contextlib import enum import inspect @@ -204,7 +205,7 @@ if TYPE_CHECKING: # Helper class -# Always returns the fixed value given for any accessed property +# This will always returns the fixed value given for any accessed property class _DummyLookup(object): def __init__(self, value): self.value = value @@ -213,6 +214,21 @@ class _DummyLookup(object): return object.__getattribute__(self, "value") +class DummyLink(ABC): + def __init__(self): + self.not_implemented_message = ( + "opentelemetry wasn't imported so this is just a dummy link placeholder" + ) + + @property + def context(self): + raise NotImplementedError(self.not_implemented_message) + + @property + def context(self): + raise NotImplementedError(self.not_implemented_message) + + # These dependencies are optional so they can fail to import # and we try: @@ -229,12 +245,13 @@ try: SpanKind = opentelemetry.trace.SpanKind SpanAttributes = opentelemetry.semconv.trace.SpanAttributes StatusCode = opentelemetry.trace.StatusCode - + Link = opentelemetry.trace.Link except ImportError: opentelemetry = None # type: ignore[assignment] SpanKind = _DummyLookup(0) SpanAttributes = _DummyLookup("fake-attribute") StatusCode = _DummyLookup(0) + Link = DummyLink logger = logging.getLogger(__name__) @@ -435,6 +452,15 @@ def whitelisted_homeserver(destination: str) -> bool: # Start spans and scopes +def create_non_recording_span(): + if opentelemetry is None: + return contextlib.nullcontext() # type: ignore[unreachable] + + return opentelemetry.trace.NonRecordingSpan( + opentelemetry.trace.INVALID_SPAN_CONTEXT + ) + + def start_active_span( name: str, *, @@ -446,11 +472,15 @@ def start_active_span( record_exception: bool = True, set_status_on_exception: bool = True, end_on_exit: bool = True, + # For testing only + tracer: Optional["opentelemetry.sdk.trace.TracerProvider"] = None, ) -> Iterator["opentelemetry.trace.span.Span"]: if opentelemetry is None: return contextlib.nullcontext() # type: ignore[unreachable] - tracer = opentelemetry.trace.get_tracer(__name__) + if tracer is None: + tracer = opentelemetry.trace.get_tracer(__name__) + return tracer.start_as_current_span( name=name, context=context, @@ -464,6 +494,8 @@ def start_active_span( ) +# TODO: I don't think we even need this function over the normal `start_active_span`. +# The only difference is the `inherit_force_tracing` stuff. def start_active_span_follows_from( operation_name: str, contexts: Collection, @@ -491,8 +523,21 @@ def start_active_span_follows_from( forced, the new span will also have tracing forced. tracer: override the opentracing tracer. By default the global tracer is used. """ - # TODO - pass + if opentelemetry is None: + return contextlib.nullcontext() # type: ignore[unreachable] + + links = [opentelemetry.trace.Link(context) for context in contexts] + span = start_active_span( + name=operation_name, + links=links, + ) + + if inherit_force_tracing and any( + is_context_forced_tracing(ctx) for ctx in contexts + ): + force_tracing(span) + + return span def start_active_span_from_edu( @@ -529,7 +574,7 @@ def set_attribute(key: str, value: Union[str, bool, int, float]) -> None: @ensure_active_span("set the status") def set_status( - status: "opentelemetry.trace.StatusCode", exc: Optional(Exception) + status: "opentelemetry.trace.StatusCode", exc: Optional[Exception] ) -> None: """Sets a tag on the active span""" active_span = get_active_span() @@ -825,15 +870,16 @@ def trace_servlet( } request_name = request.request_metrics.name - context = span_context_from_request(request) if extract_context else None + span_context = span_context_from_request(request) if extract_context else None # we configure the scope not to finish the span immediately on exit, and instead # pass the span into the SynapseRequest, which will finish it once we've finished # sending the response to the client. + with start_active_span( request_name, kind=SpanKind.SERVER, - context=context, + context=span_context, end_on_exit=False, ) as span: request.set_tracing_span(span)