Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add prometheus metrics to track federation delays #8430

Merged
merged 2 commits into from
Oct 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/8430.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add prometheus metrics to track federation delays.
12 changes: 12 additions & 0 deletions docs/sample_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,7 @@ acme:
#tls_fingerprints: [{"sha256": "<base64_encoded_sha256_fingerprint>"}]


## Federation ##

# Restrict federation to the following whitelist of domains.
# N.B. we recommend also firewalling your federation listener to limit
Expand Down Expand Up @@ -662,6 +663,17 @@ federation_ip_range_blacklist:
- 'fe80::/64'
- 'fc00::/7'

# Report prometheus metrics on the age of PDUs being sent to and received from
# the following domains. This can be used to give an idea of "delay" on inbound
# and outbound federation, though be aware that any delay can be due to problems
# at either end or with the intermediate network.
#
# By default, no domains are monitored in this way.
#
#federation_metrics_domains:
# - matrix.org
# - example.com


## Caching ##

Expand Down
6 changes: 4 additions & 2 deletions synapse/config/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, List
from typing import Any, Iterable

import jsonschema

from synapse.config._base import ConfigError
from synapse.types import JsonDict


def validate_config(json_schema: JsonDict, config: Any, config_path: List[str]) -> None:
def validate_config(
json_schema: JsonDict, config: Any, config_path: Iterable[str]
) -> None:
"""Validates a config setting against a JsonSchema definition

This can be used to validate a section of the config file against a schema
Expand Down
27 changes: 26 additions & 1 deletion synapse/config/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

from netaddr import IPSet

from ._base import Config, ConfigError
from synapse.config._base import Config, ConfigError
from synapse.config._util import validate_config


class FederationConfig(Config):
Expand Down Expand Up @@ -52,8 +53,18 @@ def read_config(self, config, **kwargs):
"Invalid range(s) provided in federation_ip_range_blacklist: %s" % e
)

federation_metrics_domains = config.get("federation_metrics_domains") or []
validate_config(
_METRICS_FOR_DOMAINS_SCHEMA,
federation_metrics_domains,
("federation_metrics_domains",),
)
self.federation_metrics_domains = set(federation_metrics_domains)

def generate_config_section(self, config_dir_path, server_name, **kwargs):
return """\
## Federation ##

# Restrict federation to the following whitelist of domains.
# N.B. we recommend also firewalling your federation listener to limit
# inbound federation traffic as early as possible, rather than relying
Expand Down Expand Up @@ -85,4 +96,18 @@ def generate_config_section(self, config_dir_path, server_name, **kwargs):
- '::1/128'
- 'fe80::/64'
- 'fc00::/7'

# Report prometheus metrics on the age of PDUs being sent to and received from
# the following domains. This can be used to give an idea of "delay" on inbound
# and outbound federation, though be aware that any delay can be due to problems
# at either end or with the intermediate network.
richvdh marked this conversation as resolved.
Show resolved Hide resolved
#
# By default, no domains are monitored in this way.
#
#federation_metrics_domains:
# - matrix.org
# - example.com
"""


_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
1 change: 0 additions & 1 deletion synapse/config/homeserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,5 +92,4 @@ class HomeServerConfig(RootConfig):
TracerConfig,
WorkerConfig,
RedisConfig,
FederationConfig,
richvdh marked this conversation as resolved.
Show resolved Hide resolved
]
1 change: 0 additions & 1 deletion synapse/config/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,6 @@ def generate_config_section(
# or by checking matrix.org/federationtester/api/report?server_name=$host
#
#tls_fingerprints: [{"sha256": "<base64_encoded_sha256_fingerprint>"}]

"""
# Lowercase the string representation of boolean values
% {
Expand Down
24 changes: 23 additions & 1 deletion synapse/federation/federation_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
Union,
)

from prometheus_client import Counter, Histogram
from prometheus_client import Counter, Gauge, Histogram

from twisted.internet import defer
from twisted.internet.abstract import isIPAddress
Expand Down Expand Up @@ -88,6 +88,13 @@
)


last_pdu_age_metric = Gauge(
"synapse_federation_last_received_pdu_age",
"The age (in seconds) of the last PDU successfully received from the given domain",
labelnames=("server_name",),
)


class FederationServer(FederationBase):
def __init__(self, hs):
super().__init__(hs)
Expand Down Expand Up @@ -118,6 +125,10 @@ def __init__(self, hs):
hs, "state_ids_resp", timeout_ms=30000
)

self._federation_metrics_domains = (
hs.get_config().federation.federation_metrics_domains
)

async def on_backfill_request(
self, origin: str, room_id: str, versions: List[str], limit: int
) -> Tuple[int, Dict[str, Any]]:
Expand Down Expand Up @@ -262,7 +273,11 @@ async def _handle_pdus_in_txn(

pdus_by_room = {} # type: Dict[str, List[EventBase]]

newest_pdu_ts = 0

for p in transaction.pdus: # type: ignore
# FIXME (richardv): I don't think this works:
# https://github.com/matrix-org/synapse/issues/8429
if "unsigned" in p:
unsigned = p["unsigned"]
if "age" in unsigned:
Expand Down Expand Up @@ -300,6 +315,9 @@ async def _handle_pdus_in_txn(
event = event_from_pdu_json(p, room_version)
pdus_by_room.setdefault(room_id, []).append(event)

if event.origin_server_ts > newest_pdu_ts:
newest_pdu_ts = event.origin_server_ts

pdu_results = {}

# we can process different rooms in parallel (which is useful if they
Expand Down Expand Up @@ -340,6 +358,10 @@ async def process_pdus_for_room(room_id: str):
process_pdus_for_room, pdus_by_room.keys(), TRANSACTION_CONCURRENCY_LIMIT
)

if newest_pdu_ts and origin in self._federation_metrics_domains:
newest_pdu_age = self._clock.time_msec() - newest_pdu_ts
last_pdu_age_metric.labels(server_name=origin).set(newest_pdu_age / 1000)

return pdu_results

async def _handle_edus_in_txn(self, origin: str, transaction: Transaction):
Expand Down
22 changes: 22 additions & 0 deletions synapse/federation/sender/transaction_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import logging
from typing import TYPE_CHECKING, List

from prometheus_client import Gauge

from synapse.api.errors import HttpResponseException
from synapse.events import EventBase
from synapse.federation.persistence import TransactionActions
Expand All @@ -34,6 +36,12 @@

logger = logging.getLogger(__name__)

last_pdu_age_metric = Gauge(
"synapse_federation_last_sent_pdu_age",
"The age (in seconds) of the last PDU successfully sent to the given domain",
labelnames=("server_name",),
)


class TransactionManager:
"""Helper class which handles building and sending transactions
Expand All @@ -48,6 +56,10 @@ def __init__(self, hs: "synapse.server.HomeServer"):
self._transaction_actions = TransactionActions(self._store)
self._transport_layer = hs.get_federation_transport_client()

self._federation_metrics_domains = (
hs.get_config().federation.federation_metrics_domains
)

# HACK to get unique tx id
self._next_txn_id = int(self.clock.time_msec())

Expand Down Expand Up @@ -119,6 +131,9 @@ async def send_new_transaction(

# FIXME (erikj): This is a bit of a hack to make the Pdu age
# keys work
# FIXME (richardv): I also believe it no longer works. We (now?) store
# "age_ts" in "unsigned" rather than at the top level. See
# https://github.com/matrix-org/synapse/issues/8429.
def json_data_cb():
data = transaction.get_dict()
now = int(self.clock.time_msec())
Expand Down Expand Up @@ -167,5 +182,12 @@ def json_data_cb():
)
success = False

if success and pdus and destination in self._federation_metrics_domains:
last_pdu = pdus[-1]
last_pdu_age = self.clock.time_msec() - last_pdu.origin_server_ts
last_pdu_age_metric.labels(server_name=destination).set(
last_pdu_age / 1000
)

set_tag(tags.ERROR, not success)
return success