Easier to follow local vs remote span tracing

The `incoming-federation-request` vs `process-federation_request` was first introduced in
https://github.com/matrix-org/synapse/pull/11870

 - Span for remote trace: `incoming-federation-request`
    - `child_of` reference: `origin_span_context`
    - `follows_from` reference: `servlet_span`
 - Span for local trace: `process-federation-request`
    - `child_of` reference: `servlet_span` (by the nature of it being active)
    - `follows_from` reference: `incoming-federation-request`
This commit is contained in:
Eric Eastwood 2022-07-29 21:49:47 -05:00
parent 786dd9b4b1
commit 7c135b93bd
2 changed files with 98 additions and 44 deletions

View file

@ -27,6 +27,8 @@ from synapse.http.site import SynapseRequest
from synapse.logging.context import run_in_background from synapse.logging.context import run_in_background
from synapse.logging.tracing import ( from synapse.logging.tracing import (
get_active_span, get_active_span,
Link,
create_non_recording_span,
set_attribute, set_attribute,
span_context_from_request, span_context_from_request,
start_active_span, start_active_span,
@ -313,29 +315,45 @@ class BaseFederationServlet:
# if the origin is authenticated and whitelisted, use its span context # if the origin is authenticated and whitelisted, use its span context
# as the parent. # as the parent.
context = None origin_span_context = None
if origin and whitelisted_homeserver(origin): if origin and whitelisted_homeserver(origin):
context = span_context_from_request(request) origin_span_context = span_context_from_request(request)
if context: if origin_span_context:
servlet_span = get_active_span() local_servlet_span = get_active_span()
# a scope which uses the origin's context as a parent # Create a span which uses the `origin_span_context` as a parent
processing_start_time = time.time() # so we can see how the incoming payload was processed while
scope = start_active_span_follows_from( # we're looking at the outgoing trace. Since the parent is set
# to a remote span (from the origin), it won't show up in the
# local trace which is why we create another span below for the
# local trace. A span can only have one parent so we have to
# create two separate ones.
remote_parent_span = start_active_span(
"incoming-federation-request", "incoming-federation-request",
child_of=context, context=origin_span_context,
contexts=(servlet_span,), # Cross-link back to the local trace so we can jump
start_time=processing_start_time, # to the incoming side from the remote origin trace.
links=[Link(local_servlet_span.get_span_context())],
)
# Create a local span to appear in the local trace
local_parent_span = start_active_span(
"process-federation-request",
# Cross-link back to the remote outgoing trace so we jump over
# there.
links=[Link(remote_parent_span.get_span_context())],
) )
else: else:
# just use our context as a parent # Otherwise just use our local context as a parent
scope = start_active_span( local_parent_span = start_active_span(
"incoming-federation-request", "process-federation-request",
) )
try: # Don't need to record anything for the remote
with scope: remote_parent_span = create_non_recording_span()
with remote_parent_span, local_parent_span:
if origin and self.RATELIMIT: if origin and self.RATELIMIT:
with ratelimiter.ratelimit(origin) as d: with ratelimiter.ratelimit(origin) as d:
await d await d
@ -352,16 +370,6 @@ class BaseFederationServlet:
response = await func( response = await func(
origin, content, request.args, *args, **kwargs origin, content, request.args, *args, **kwargs
) )
finally:
# if we used the origin's context as the parent, add a new span using
# the servlet span as a parent, so that we have a link
if context:
scope2 = start_active_span_follows_from(
"process-federation_request",
contexts=(scope.span,),
start_time=processing_start_time,
)
scope2.close()
return response return response

View file

@ -162,6 +162,7 @@ Gotchas
than one caller? Will all of those calling functions have be in a context than one caller? Will all of those calling functions have be in a context
with an active span? with an active span?
""" """
from abc import ABC
import contextlib import contextlib
import enum import enum
import inspect import inspect
@ -204,7 +205,7 @@ if TYPE_CHECKING:
# Helper class # Helper class
# Always returns the fixed value given for any accessed property # This will always returns the fixed value given for any accessed property
class _DummyLookup(object): class _DummyLookup(object):
def __init__(self, value): def __init__(self, value):
self.value = value self.value = value
@ -213,6 +214,21 @@ class _DummyLookup(object):
return object.__getattribute__(self, "value") return object.__getattribute__(self, "value")
class DummyLink(ABC):
def __init__(self):
self.not_implemented_message = (
"opentelemetry wasn't imported so this is just a dummy link placeholder"
)
@property
def context(self):
raise NotImplementedError(self.not_implemented_message)
@property
def context(self):
raise NotImplementedError(self.not_implemented_message)
# These dependencies are optional so they can fail to import # These dependencies are optional so they can fail to import
# and we # and we
try: try:
@ -229,12 +245,13 @@ try:
SpanKind = opentelemetry.trace.SpanKind SpanKind = opentelemetry.trace.SpanKind
SpanAttributes = opentelemetry.semconv.trace.SpanAttributes SpanAttributes = opentelemetry.semconv.trace.SpanAttributes
StatusCode = opentelemetry.trace.StatusCode StatusCode = opentelemetry.trace.StatusCode
Link = opentelemetry.trace.Link
except ImportError: except ImportError:
opentelemetry = None # type: ignore[assignment] opentelemetry = None # type: ignore[assignment]
SpanKind = _DummyLookup(0) SpanKind = _DummyLookup(0)
SpanAttributes = _DummyLookup("fake-attribute") SpanAttributes = _DummyLookup("fake-attribute")
StatusCode = _DummyLookup(0) StatusCode = _DummyLookup(0)
Link = DummyLink
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -435,6 +452,15 @@ def whitelisted_homeserver(destination: str) -> bool:
# Start spans and scopes # Start spans and scopes
def create_non_recording_span():
if opentelemetry is None:
return contextlib.nullcontext() # type: ignore[unreachable]
return opentelemetry.trace.NonRecordingSpan(
opentelemetry.trace.INVALID_SPAN_CONTEXT
)
def start_active_span( def start_active_span(
name: str, name: str,
*, *,
@ -446,11 +472,15 @@ def start_active_span(
record_exception: bool = True, record_exception: bool = True,
set_status_on_exception: bool = True, set_status_on_exception: bool = True,
end_on_exit: bool = True, end_on_exit: bool = True,
# For testing only
tracer: Optional["opentelemetry.sdk.trace.TracerProvider"] = None,
) -> Iterator["opentelemetry.trace.span.Span"]: ) -> Iterator["opentelemetry.trace.span.Span"]:
if opentelemetry is None: if opentelemetry is None:
return contextlib.nullcontext() # type: ignore[unreachable] return contextlib.nullcontext() # type: ignore[unreachable]
if tracer is None:
tracer = opentelemetry.trace.get_tracer(__name__) tracer = opentelemetry.trace.get_tracer(__name__)
return tracer.start_as_current_span( return tracer.start_as_current_span(
name=name, name=name,
context=context, context=context,
@ -464,6 +494,8 @@ def start_active_span(
) )
# TODO: I don't think we even need this function over the normal `start_active_span`.
# The only difference is the `inherit_force_tracing` stuff.
def start_active_span_follows_from( def start_active_span_follows_from(
operation_name: str, operation_name: str,
contexts: Collection, contexts: Collection,
@ -491,8 +523,21 @@ def start_active_span_follows_from(
forced, the new span will also have tracing forced. forced, the new span will also have tracing forced.
tracer: override the opentracing tracer. By default the global tracer is used. tracer: override the opentracing tracer. By default the global tracer is used.
""" """
# TODO if opentelemetry is None:
pass return contextlib.nullcontext() # type: ignore[unreachable]
links = [opentelemetry.trace.Link(context) for context in contexts]
span = start_active_span(
name=operation_name,
links=links,
)
if inherit_force_tracing and any(
is_context_forced_tracing(ctx) for ctx in contexts
):
force_tracing(span)
return span
def start_active_span_from_edu( def start_active_span_from_edu(
@ -529,7 +574,7 @@ def set_attribute(key: str, value: Union[str, bool, int, float]) -> None:
@ensure_active_span("set the status") @ensure_active_span("set the status")
def set_status( def set_status(
status: "opentelemetry.trace.StatusCode", exc: Optional(Exception) status: "opentelemetry.trace.StatusCode", exc: Optional[Exception]
) -> None: ) -> None:
"""Sets a tag on the active span""" """Sets a tag on the active span"""
active_span = get_active_span() active_span = get_active_span()
@ -825,15 +870,16 @@ def trace_servlet(
} }
request_name = request.request_metrics.name request_name = request.request_metrics.name
context = span_context_from_request(request) if extract_context else None span_context = span_context_from_request(request) if extract_context else None
# we configure the scope not to finish the span immediately on exit, and instead # we configure the scope not to finish the span immediately on exit, and instead
# pass the span into the SynapseRequest, which will finish it once we've finished # pass the span into the SynapseRequest, which will finish it once we've finished
# sending the response to the client. # sending the response to the client.
with start_active_span( with start_active_span(
request_name, request_name,
kind=SpanKind.SERVER, kind=SpanKind.SERVER,
context=context, context=span_context,
end_on_exit=False, end_on_exit=False,
) as span: ) as span:
request.set_tracing_span(span) request.set_tracing_span(span)