mirror of
https://github.com/element-hq/synapse
synced 2024-09-19 22:15:11 +00:00
Easier to follow local vs remote span tracing
The `incoming-federation-request` vs `process-federation_request` was first introduced in https://github.com/matrix-org/synapse/pull/11870 - Span for remote trace: `incoming-federation-request` - `child_of` reference: `origin_span_context` - `follows_from` reference: `servlet_span` - Span for local trace: `process-federation-request` - `child_of` reference: `servlet_span` (by the nature of it being active) - `follows_from` reference: `incoming-federation-request`
This commit is contained in:
parent
786dd9b4b1
commit
7c135b93bd
2 changed files with 98 additions and 44 deletions
|
@ -27,6 +27,8 @@ from synapse.http.site import SynapseRequest
|
|||
from synapse.logging.context import run_in_background
|
||||
from synapse.logging.tracing import (
|
||||
get_active_span,
|
||||
Link,
|
||||
create_non_recording_span,
|
||||
set_attribute,
|
||||
span_context_from_request,
|
||||
start_active_span,
|
||||
|
@ -313,55 +315,61 @@ class BaseFederationServlet:
|
|||
|
||||
# if the origin is authenticated and whitelisted, use its span context
|
||||
# as the parent.
|
||||
context = None
|
||||
origin_span_context = None
|
||||
if origin and whitelisted_homeserver(origin):
|
||||
context = span_context_from_request(request)
|
||||
origin_span_context = span_context_from_request(request)
|
||||
|
||||
if context:
|
||||
servlet_span = get_active_span()
|
||||
# a scope which uses the origin's context as a parent
|
||||
processing_start_time = time.time()
|
||||
scope = start_active_span_follows_from(
|
||||
if origin_span_context:
|
||||
local_servlet_span = get_active_span()
|
||||
# Create a span which uses the `origin_span_context` as a parent
|
||||
# so we can see how the incoming payload was processed while
|
||||
# we're looking at the outgoing trace. Since the parent is set
|
||||
# to a remote span (from the origin), it won't show up in the
|
||||
# local trace which is why we create another span below for the
|
||||
# local trace. A span can only have one parent so we have to
|
||||
# create two separate ones.
|
||||
remote_parent_span = start_active_span(
|
||||
"incoming-federation-request",
|
||||
child_of=context,
|
||||
contexts=(servlet_span,),
|
||||
start_time=processing_start_time,
|
||||
context=origin_span_context,
|
||||
# Cross-link back to the local trace so we can jump
|
||||
# to the incoming side from the remote origin trace.
|
||||
links=[Link(local_servlet_span.get_span_context())],
|
||||
)
|
||||
|
||||
# Create a local span to appear in the local trace
|
||||
local_parent_span = start_active_span(
|
||||
"process-federation-request",
|
||||
# Cross-link back to the remote outgoing trace so we jump over
|
||||
# there.
|
||||
links=[Link(remote_parent_span.get_span_context())],
|
||||
)
|
||||
|
||||
else:
|
||||
# just use our context as a parent
|
||||
scope = start_active_span(
|
||||
"incoming-federation-request",
|
||||
# Otherwise just use our local context as a parent
|
||||
local_parent_span = start_active_span(
|
||||
"process-federation-request",
|
||||
)
|
||||
|
||||
try:
|
||||
with scope:
|
||||
if origin and self.RATELIMIT:
|
||||
with ratelimiter.ratelimit(origin) as d:
|
||||
await d
|
||||
if request._disconnected:
|
||||
logger.warning(
|
||||
"client disconnected before we started processing "
|
||||
"request"
|
||||
)
|
||||
return None
|
||||
response = await func(
|
||||
origin, content, request.args, *args, **kwargs
|
||||
# Don't need to record anything for the remote
|
||||
remote_parent_span = create_non_recording_span()
|
||||
|
||||
with remote_parent_span, local_parent_span:
|
||||
if origin and self.RATELIMIT:
|
||||
with ratelimiter.ratelimit(origin) as d:
|
||||
await d
|
||||
if request._disconnected:
|
||||
logger.warning(
|
||||
"client disconnected before we started processing "
|
||||
"request"
|
||||
)
|
||||
else:
|
||||
return None
|
||||
response = await func(
|
||||
origin, content, request.args, *args, **kwargs
|
||||
)
|
||||
finally:
|
||||
# if we used the origin's context as the parent, add a new span using
|
||||
# the servlet span as a parent, so that we have a link
|
||||
if context:
|
||||
scope2 = start_active_span_follows_from(
|
||||
"process-federation_request",
|
||||
contexts=(scope.span,),
|
||||
start_time=processing_start_time,
|
||||
else:
|
||||
response = await func(
|
||||
origin, content, request.args, *args, **kwargs
|
||||
)
|
||||
scope2.close()
|
||||
|
||||
return response
|
||||
|
||||
|
|
|
@ -162,6 +162,7 @@ Gotchas
|
|||
than one caller? Will all of those calling functions have be in a context
|
||||
with an active span?
|
||||
"""
|
||||
from abc import ABC
|
||||
import contextlib
|
||||
import enum
|
||||
import inspect
|
||||
|
@ -204,7 +205,7 @@ if TYPE_CHECKING:
|
|||
|
||||
# Helper class
|
||||
|
||||
# Always returns the fixed value given for any accessed property
|
||||
# This will always returns the fixed value given for any accessed property
|
||||
class _DummyLookup(object):
|
||||
def __init__(self, value):
|
||||
self.value = value
|
||||
|
@ -213,6 +214,21 @@ class _DummyLookup(object):
|
|||
return object.__getattribute__(self, "value")
|
||||
|
||||
|
||||
class DummyLink(ABC):
|
||||
def __init__(self):
|
||||
self.not_implemented_message = (
|
||||
"opentelemetry wasn't imported so this is just a dummy link placeholder"
|
||||
)
|
||||
|
||||
@property
|
||||
def context(self):
|
||||
raise NotImplementedError(self.not_implemented_message)
|
||||
|
||||
@property
|
||||
def context(self):
|
||||
raise NotImplementedError(self.not_implemented_message)
|
||||
|
||||
|
||||
# These dependencies are optional so they can fail to import
|
||||
# and we
|
||||
try:
|
||||
|
@ -229,12 +245,13 @@ try:
|
|||
SpanKind = opentelemetry.trace.SpanKind
|
||||
SpanAttributes = opentelemetry.semconv.trace.SpanAttributes
|
||||
StatusCode = opentelemetry.trace.StatusCode
|
||||
|
||||
Link = opentelemetry.trace.Link
|
||||
except ImportError:
|
||||
opentelemetry = None # type: ignore[assignment]
|
||||
SpanKind = _DummyLookup(0)
|
||||
SpanAttributes = _DummyLookup("fake-attribute")
|
||||
StatusCode = _DummyLookup(0)
|
||||
Link = DummyLink
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -435,6 +452,15 @@ def whitelisted_homeserver(destination: str) -> bool:
|
|||
# Start spans and scopes
|
||||
|
||||
|
||||
def create_non_recording_span():
|
||||
if opentelemetry is None:
|
||||
return contextlib.nullcontext() # type: ignore[unreachable]
|
||||
|
||||
return opentelemetry.trace.NonRecordingSpan(
|
||||
opentelemetry.trace.INVALID_SPAN_CONTEXT
|
||||
)
|
||||
|
||||
|
||||
def start_active_span(
|
||||
name: str,
|
||||
*,
|
||||
|
@ -446,11 +472,15 @@ def start_active_span(
|
|||
record_exception: bool = True,
|
||||
set_status_on_exception: bool = True,
|
||||
end_on_exit: bool = True,
|
||||
# For testing only
|
||||
tracer: Optional["opentelemetry.sdk.trace.TracerProvider"] = None,
|
||||
) -> Iterator["opentelemetry.trace.span.Span"]:
|
||||
if opentelemetry is None:
|
||||
return contextlib.nullcontext() # type: ignore[unreachable]
|
||||
|
||||
tracer = opentelemetry.trace.get_tracer(__name__)
|
||||
if tracer is None:
|
||||
tracer = opentelemetry.trace.get_tracer(__name__)
|
||||
|
||||
return tracer.start_as_current_span(
|
||||
name=name,
|
||||
context=context,
|
||||
|
@ -464,6 +494,8 @@ def start_active_span(
|
|||
)
|
||||
|
||||
|
||||
# TODO: I don't think we even need this function over the normal `start_active_span`.
|
||||
# The only difference is the `inherit_force_tracing` stuff.
|
||||
def start_active_span_follows_from(
|
||||
operation_name: str,
|
||||
contexts: Collection,
|
||||
|
@ -491,8 +523,21 @@ def start_active_span_follows_from(
|
|||
forced, the new span will also have tracing forced.
|
||||
tracer: override the opentracing tracer. By default the global tracer is used.
|
||||
"""
|
||||
# TODO
|
||||
pass
|
||||
if opentelemetry is None:
|
||||
return contextlib.nullcontext() # type: ignore[unreachable]
|
||||
|
||||
links = [opentelemetry.trace.Link(context) for context in contexts]
|
||||
span = start_active_span(
|
||||
name=operation_name,
|
||||
links=links,
|
||||
)
|
||||
|
||||
if inherit_force_tracing and any(
|
||||
is_context_forced_tracing(ctx) for ctx in contexts
|
||||
):
|
||||
force_tracing(span)
|
||||
|
||||
return span
|
||||
|
||||
|
||||
def start_active_span_from_edu(
|
||||
|
@ -529,7 +574,7 @@ def set_attribute(key: str, value: Union[str, bool, int, float]) -> None:
|
|||
|
||||
@ensure_active_span("set the status")
|
||||
def set_status(
|
||||
status: "opentelemetry.trace.StatusCode", exc: Optional(Exception)
|
||||
status: "opentelemetry.trace.StatusCode", exc: Optional[Exception]
|
||||
) -> None:
|
||||
"""Sets a tag on the active span"""
|
||||
active_span = get_active_span()
|
||||
|
@ -825,15 +870,16 @@ def trace_servlet(
|
|||
}
|
||||
|
||||
request_name = request.request_metrics.name
|
||||
context = span_context_from_request(request) if extract_context else None
|
||||
span_context = span_context_from_request(request) if extract_context else None
|
||||
|
||||
# we configure the scope not to finish the span immediately on exit, and instead
|
||||
# pass the span into the SynapseRequest, which will finish it once we've finished
|
||||
# sending the response to the client.
|
||||
|
||||
with start_active_span(
|
||||
request_name,
|
||||
kind=SpanKind.SERVER,
|
||||
context=context,
|
||||
context=span_context,
|
||||
end_on_exit=False,
|
||||
) as span:
|
||||
request.set_tracing_span(span)
|
||||
|
|
Loading…
Reference in a new issue