Easier to follow local vs remote span tracing

The `incoming-federation-request` vs `process-federation_request` was first introduced in
https://github.com/matrix-org/synapse/pull/11870

 - Span for remote trace: `incoming-federation-request`
    - `child_of` reference: `origin_span_context`
    - `follows_from` reference: `servlet_span`
 - Span for local trace: `process-federation-request`
    - `child_of` reference: `servlet_span` (by the nature of it being active)
    - `follows_from` reference: `incoming-federation-request`
This commit is contained in:
Eric Eastwood 2022-07-29 21:49:47 -05:00
parent 786dd9b4b1
commit 7c135b93bd
2 changed files with 98 additions and 44 deletions

View file

@ -27,6 +27,8 @@ from synapse.http.site import SynapseRequest
from synapse.logging.context import run_in_background
from synapse.logging.tracing import (
get_active_span,
Link,
create_non_recording_span,
set_attribute,
span_context_from_request,
start_active_span,
@ -313,55 +315,61 @@ class BaseFederationServlet:
# if the origin is authenticated and whitelisted, use its span context
# as the parent.
context = None
origin_span_context = None
if origin and whitelisted_homeserver(origin):
context = span_context_from_request(request)
origin_span_context = span_context_from_request(request)
if context:
servlet_span = get_active_span()
# a scope which uses the origin's context as a parent
processing_start_time = time.time()
scope = start_active_span_follows_from(
if origin_span_context:
local_servlet_span = get_active_span()
# Create a span which uses the `origin_span_context` as a parent
# so we can see how the incoming payload was processed while
# we're looking at the outgoing trace. Since the parent is set
# to a remote span (from the origin), it won't show up in the
# local trace which is why we create another span below for the
# local trace. A span can only have one parent so we have to
# create two separate ones.
remote_parent_span = start_active_span(
"incoming-federation-request",
child_of=context,
contexts=(servlet_span,),
start_time=processing_start_time,
context=origin_span_context,
# Cross-link back to the local trace so we can jump
# to the incoming side from the remote origin trace.
links=[Link(local_servlet_span.get_span_context())],
)
# Create a local span to appear in the local trace
local_parent_span = start_active_span(
"process-federation-request",
# Cross-link back to the remote outgoing trace so we jump over
# there.
links=[Link(remote_parent_span.get_span_context())],
)
else:
# just use our context as a parent
scope = start_active_span(
"incoming-federation-request",
# Otherwise just use our local context as a parent
local_parent_span = start_active_span(
"process-federation-request",
)
try:
with scope:
if origin and self.RATELIMIT:
with ratelimiter.ratelimit(origin) as d:
await d
if request._disconnected:
logger.warning(
"client disconnected before we started processing "
"request"
)
return None
response = await func(
origin, content, request.args, *args, **kwargs
# Don't need to record anything for the remote
remote_parent_span = create_non_recording_span()
with remote_parent_span, local_parent_span:
if origin and self.RATELIMIT:
with ratelimiter.ratelimit(origin) as d:
await d
if request._disconnected:
logger.warning(
"client disconnected before we started processing "
"request"
)
else:
return None
response = await func(
origin, content, request.args, *args, **kwargs
)
finally:
# if we used the origin's context as the parent, add a new span using
# the servlet span as a parent, so that we have a link
if context:
scope2 = start_active_span_follows_from(
"process-federation_request",
contexts=(scope.span,),
start_time=processing_start_time,
else:
response = await func(
origin, content, request.args, *args, **kwargs
)
scope2.close()
return response

View file

@ -162,6 +162,7 @@ Gotchas
than one caller? Will all of those calling functions have be in a context
with an active span?
"""
from abc import ABC
import contextlib
import enum
import inspect
@ -204,7 +205,7 @@ if TYPE_CHECKING:
# Helper class
# Always returns the fixed value given for any accessed property
# This will always returns the fixed value given for any accessed property
class _DummyLookup(object):
def __init__(self, value):
self.value = value
@ -213,6 +214,21 @@ class _DummyLookup(object):
return object.__getattribute__(self, "value")
class DummyLink(ABC):
def __init__(self):
self.not_implemented_message = (
"opentelemetry wasn't imported so this is just a dummy link placeholder"
)
@property
def context(self):
raise NotImplementedError(self.not_implemented_message)
@property
def context(self):
raise NotImplementedError(self.not_implemented_message)
# These dependencies are optional so they can fail to import
# and we
try:
@ -229,12 +245,13 @@ try:
SpanKind = opentelemetry.trace.SpanKind
SpanAttributes = opentelemetry.semconv.trace.SpanAttributes
StatusCode = opentelemetry.trace.StatusCode
Link = opentelemetry.trace.Link
except ImportError:
opentelemetry = None # type: ignore[assignment]
SpanKind = _DummyLookup(0)
SpanAttributes = _DummyLookup("fake-attribute")
StatusCode = _DummyLookup(0)
Link = DummyLink
logger = logging.getLogger(__name__)
@ -435,6 +452,15 @@ def whitelisted_homeserver(destination: str) -> bool:
# Start spans and scopes
def create_non_recording_span():
if opentelemetry is None:
return contextlib.nullcontext() # type: ignore[unreachable]
return opentelemetry.trace.NonRecordingSpan(
opentelemetry.trace.INVALID_SPAN_CONTEXT
)
def start_active_span(
name: str,
*,
@ -446,11 +472,15 @@ def start_active_span(
record_exception: bool = True,
set_status_on_exception: bool = True,
end_on_exit: bool = True,
# For testing only
tracer: Optional["opentelemetry.sdk.trace.TracerProvider"] = None,
) -> Iterator["opentelemetry.trace.span.Span"]:
if opentelemetry is None:
return contextlib.nullcontext() # type: ignore[unreachable]
tracer = opentelemetry.trace.get_tracer(__name__)
if tracer is None:
tracer = opentelemetry.trace.get_tracer(__name__)
return tracer.start_as_current_span(
name=name,
context=context,
@ -464,6 +494,8 @@ def start_active_span(
)
# TODO: I don't think we even need this function over the normal `start_active_span`.
# The only difference is the `inherit_force_tracing` stuff.
def start_active_span_follows_from(
operation_name: str,
contexts: Collection,
@ -491,8 +523,21 @@ def start_active_span_follows_from(
forced, the new span will also have tracing forced.
tracer: override the opentracing tracer. By default the global tracer is used.
"""
# TODO
pass
if opentelemetry is None:
return contextlib.nullcontext() # type: ignore[unreachable]
links = [opentelemetry.trace.Link(context) for context in contexts]
span = start_active_span(
name=operation_name,
links=links,
)
if inherit_force_tracing and any(
is_context_forced_tracing(ctx) for ctx in contexts
):
force_tracing(span)
return span
def start_active_span_from_edu(
@ -529,7 +574,7 @@ def set_attribute(key: str, value: Union[str, bool, int, float]) -> None:
@ensure_active_span("set the status")
def set_status(
status: "opentelemetry.trace.StatusCode", exc: Optional(Exception)
status: "opentelemetry.trace.StatusCode", exc: Optional[Exception]
) -> None:
"""Sets a tag on the active span"""
active_span = get_active_span()
@ -825,15 +870,16 @@ def trace_servlet(
}
request_name = request.request_metrics.name
context = span_context_from_request(request) if extract_context else None
span_context = span_context_from_request(request) if extract_context else None
# we configure the scope not to finish the span immediately on exit, and instead
# pass the span into the SynapseRequest, which will finish it once we've finished
# sending the response to the client.
with start_active_span(
request_name,
kind=SpanKind.SERVER,
context=context,
context=span_context,
end_on_exit=False,
) as span:
request.set_tracing_span(span)