Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add prometheus metrics to track federation delays #8430

Merged
merged 2 commits into from
Oct 1, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/8430.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add prometheus metrics to track federation delays.
10 changes: 10 additions & 0 deletions docs/sample_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,7 @@ acme:
#tls_fingerprints: [{"sha256": "<base64_encoded_sha256_fingerprint>"}]


## Federation ##

# Restrict federation to the following whitelist of domains.
# N.B. we recommend also firewalling your federation listener to limit
Expand Down Expand Up @@ -662,6 +663,15 @@ federation_ip_range_blacklist:
- 'fe80::/64'
- 'fc00::/7'

# Report prometheus metrics on the age of PDUs being sent to and received from
# the following domains. This can be used to give an idea of "delay" on inbound
# and outbound federation, though be aware that any delay can be due to problems
# at either end or with the intermediate network.
#
#federation_metrics_domains:
# - matrix.org
# - example.com


## Caching ##

Expand Down
6 changes: 4 additions & 2 deletions synapse/config/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, List
from typing import Any, Iterable

import jsonschema

from synapse.config._base import ConfigError
from synapse.types import JsonDict


def validate_config(json_schema: JsonDict, config: Any, config_path: List[str]) -> None:
def validate_config(
json_schema: JsonDict, config: Any, config_path: Iterable[str]
) -> None:
"""Validates a config setting against a JsonSchema definition

This can be used to validate a section of the config file against a schema
Expand Down
25 changes: 24 additions & 1 deletion synapse/config/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

from netaddr import IPSet

from ._base import Config, ConfigError
from synapse.config._base import Config, ConfigError
from synapse.config._util import validate_config


class FederationConfig(Config):
Expand Down Expand Up @@ -52,8 +53,18 @@ def read_config(self, config, **kwargs):
"Invalid range(s) provided in federation_ip_range_blacklist: %s" % e
)

federation_metrics_domains = config.get("federation_metrics_domains") or []
validate_config(
_METRICS_FOR_DOMAINS_SCHEMA,
federation_metrics_domains,
("federation_metrics_domains",),
)
self.federation_metrics_domains = set(federation_metrics_domains)

def generate_config_section(self, config_dir_path, server_name, **kwargs):
return """\
## Federation ##

# Restrict federation to the following whitelist of domains.
# N.B. we recommend also firewalling your federation listener to limit
# inbound federation traffic as early as possible, rather than relying
Expand Down Expand Up @@ -85,4 +96,16 @@ def generate_config_section(self, config_dir_path, server_name, **kwargs):
- '::1/128'
- 'fe80::/64'
- 'fc00::/7'

# Report prometheus metrics on the age of PDUs being sent to and received from
# the following domains. This can be used to give an idea of "delay" on inbound
# and outbound federation, though be aware that any delay can be due to problems
# at either end or with the intermediate network.
richvdh marked this conversation as resolved.
Show resolved Hide resolved
#
#federation_metrics_domains:
# - matrix.org
# - example.com
"""


_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
1 change: 0 additions & 1 deletion synapse/config/homeserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,5 +92,4 @@ class HomeServerConfig(RootConfig):
TracerConfig,
WorkerConfig,
RedisConfig,
FederationConfig,
richvdh marked this conversation as resolved.
Show resolved Hide resolved
]
1 change: 0 additions & 1 deletion synapse/config/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,6 @@ def generate_config_section(
# or by checking matrix.org/federationtester/api/report?server_name=$host
#
#tls_fingerprints: [{"sha256": "<base64_encoded_sha256_fingerprint>"}]

"""
# Lowercase the string representation of boolean values
% {
Expand Down
24 changes: 23 additions & 1 deletion synapse/federation/federation_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
Union,
)

from prometheus_client import Counter, Histogram
from prometheus_client import Counter, Gauge, Histogram

from twisted.internet import defer
from twisted.internet.abstract import isIPAddress
Expand Down Expand Up @@ -88,6 +88,13 @@
)


last_pdu_age_metric = Gauge(
"synapse_federation_last_received_pdu_age",
"The age (in seconds) of the last PDU successfully received from the given domain",
labelnames=("server_name",),
)


class FederationServer(FederationBase):
def __init__(self, hs):
super().__init__(hs)
Expand Down Expand Up @@ -118,6 +125,10 @@ def __init__(self, hs):
hs, "state_ids_resp", timeout_ms=30000
)

self._federation_metrics_domains = (
hs.get_config().federation.federation_metrics_domains
)

async def on_backfill_request(
self, origin: str, room_id: str, versions: List[str], limit: int
) -> Tuple[int, Dict[str, Any]]:
Expand Down Expand Up @@ -262,7 +273,11 @@ async def _handle_pdus_in_txn(

pdus_by_room = {} # type: Dict[str, List[EventBase]]

newest_pdu_ts = 0

for p in transaction.pdus: # type: ignore
# FIXME (richardv): I don't think this works:
# https://github.com/matrix-org/synapse/issues/8429
if "unsigned" in p:
unsigned = p["unsigned"]
if "age" in unsigned:
Expand Down Expand Up @@ -300,6 +315,9 @@ async def _handle_pdus_in_txn(
event = event_from_pdu_json(p, room_version)
pdus_by_room.setdefault(room_id, []).append(event)

if event.origin_server_ts > newest_pdu_ts:
newest_pdu_ts = event.origin_server_ts

pdu_results = {}

# we can process different rooms in parallel (which is useful if they
Expand Down Expand Up @@ -340,6 +358,10 @@ async def process_pdus_for_room(room_id: str):
process_pdus_for_room, pdus_by_room.keys(), TRANSACTION_CONCURRENCY_LIMIT
)

if newest_pdu_ts and origin in self._federation_metrics_domains:
newest_pdu_age = self._clock.time_msec() - newest_pdu_ts
last_pdu_age_metric.labels(server_name=origin).set(newest_pdu_age / 1000)

return pdu_results

async def _handle_edus_in_txn(self, origin: str, transaction: Transaction):
Expand Down
22 changes: 22 additions & 0 deletions synapse/federation/sender/transaction_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import logging
from typing import TYPE_CHECKING, List

from prometheus_client import Gauge

from synapse.api.errors import HttpResponseException
from synapse.events import EventBase
from synapse.federation.persistence import TransactionActions
Expand All @@ -34,6 +36,12 @@

logger = logging.getLogger(__name__)

last_pdu_age_metric = Gauge(
"synapse_federation_last_sent_pdu_age",
"The age (in seconds) of the last PDU successfully sent to the given domain",
labelnames=("server_name",),
)


class TransactionManager:
"""Helper class which handles building and sending transactions
Expand All @@ -48,6 +56,10 @@ def __init__(self, hs: "synapse.server.HomeServer"):
self._transaction_actions = TransactionActions(self._store)
self._transport_layer = hs.get_federation_transport_client()

self._federation_metrics_domains = (
hs.get_config().federation.federation_metrics_domains
)

# HACK to get unique tx id
self._next_txn_id = int(self.clock.time_msec())

Expand Down Expand Up @@ -119,6 +131,9 @@ async def send_new_transaction(

# FIXME (erikj): This is a bit of a hack to make the Pdu age
# keys work
# FIXME (richardv): I also believe it no longer works. We (now?) store
# "age_ts" in "unsigned" rather than at the top level. See
# https://github.com/matrix-org/synapse/issues/8429.
def json_data_cb():
data = transaction.get_dict()
now = int(self.clock.time_msec())
Expand Down Expand Up @@ -167,5 +182,12 @@ def json_data_cb():
)
success = False

if success and pdus and destination in self._federation_metrics_domains:
last_pdu = pdus[-1]
last_pdu_age = self.clock.time_msec() - last_pdu.origin_server_ts
last_pdu_age_metric.labels(server_name=destination).set(
last_pdu_age / 1000
)

set_tag(tags.ERROR, not success)
return success