diff --git a/scripts/docker-integration-tests/prom_remote_write_backend/docker-compose.yml b/scripts/docker-integration-tests/prom_remote_write_backend/docker-compose.yml
new file mode 100644
index 0000000000..6c38ae39de
--- /dev/null
+++ b/scripts/docker-integration-tests/prom_remote_write_backend/docker-compose.yml
@@ -0,0 +1,111 @@
+# Integration-test stack for the Prometheus remote-write storage backend:
+# two m3aggregators, the coordinator under test, an admin coordinator used for
+# bootstrapping, two Prometheus remote-write receivers and one etcd node.
+version: "3.5"
+services:
+  m3aggregator01:
+    expose:
+      - "6001"
+    ports:
+      - "127.0.0.1:6001:6001"
+    networks:
+      - backend
+    environment:
+      - M3AGGREGATOR_HOST_ID=m3aggregator01
+    image: "m3aggregator_integration:${REVISION}"
+    volumes:
+      - "./m3aggregator.yml:/etc/m3aggregator/m3aggregator.yml"
+  m3aggregator02:
+    expose:
+      - "6002"
+    ports:
+      - "127.0.0.1:6002:6001"  # host 6002 -> container HTTP port 6001
+    networks:
+      - backend
+    environment:
+      - M3AGGREGATOR_HOST_ID=m3aggregator02
+    image: "m3aggregator_integration:${REVISION}"
+    volumes:
+      - "./m3aggregator.yml:/etc/m3aggregator/m3aggregator.yml"
+  m3coordinator01:
+    expose:
+      - "7201"  # container port; published on the host as 7202
+    ports:
+      - "0.0.0.0:7202:7201"
+    networks:
+      - backend
+    image: "m3coordinator_integration:${REVISION}"
+    volumes:
+      - "./:/etc/m3coordinator/"  # whole directory, so m3coordinator.yml is picked up
+  coordinatoradmin:
+    expose:
+      - "7201"
+    ports:
+      - "0.0.0.0:7201:7201"
+    networks:
+      - backend
+    image: "m3coordinator_integration:${REVISION}"
+    volumes:
+      - "./m3coordinator-admin.yml:/etc/m3coordinator/m3coordinator.yml"
+  prometheusraw:
+    expose:
+      - "9090"
+    ports:
+      - "0.0.0.0:9090:9090"
+    networks:
+      - backend
+    image: "prom/prometheus:v2.45.0"  # pinned: ":latest" makes the test depend on upstream releases
+    volumes:
+      - "./prometheus.yml:/etc/prometheus/prometheus.yml"
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.path=/prometheus"
+      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
+      - "--web.console.templates=/usr/share/prometheus/consoles"
+      - "--web.enable-remote-write-receiver"  # replaces deprecated --enable-feature=remote-write-receiver
+  prometheusagg:
+    expose:
+      - "9090"  # container port; published on the host as 9091
+    ports:
+      - "0.0.0.0:9091:9090"
+    networks:
+      - backend
+    image: "prom/prometheus:v2.45.0"  # keep in sync with prometheusraw
+    volumes:
+      - "./prometheus.yml:/etc/prometheus/prometheus.yml"
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.path=/prometheus"
+      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
+      - "--web.console.templates=/usr/share/prometheus/consoles"
+      - "--web.enable-remote-write-receiver"  # replaces deprecated --enable-feature=remote-write-receiver
+  etcd01:
+    expose:
+      - "2379-2380"
+    ports:
+      - "0.0.0.0:2379-2380:2379-2380"
+    networks:
+      - backend
+    image: quay.io/coreos/etcd:v3.4.3
+    command:
+      - "etcd"
+      - "--name"
+      - "etcd01"
+      - "--listen-peer-urls"
+      - "http://0.0.0.0:2380"
+      - "--listen-client-urls"
+      - "http://0.0.0.0:2379"
+      - "--advertise-client-urls"
+      - "http://etcd01:2379"
+      - "--initial-cluster-token"
+      - "etcd-cluster-1"
+      - "--initial-advertise-peer-urls"
+      - "http://etcd01:2380"
+      - "--initial-cluster"
+      - "etcd01=http://etcd01:2380"
+      - "--initial-cluster-state"
+      - "new"
+      - "--data-dir"
+      - "/var/lib/etcd"
+networks:
+  backend: {}
diff --git a/scripts/docker-integration-tests/prom_remote_write_backend/m3aggregator.yml b/scripts/docker-integration-tests/prom_remote_write_backend/m3aggregator.yml
new file mode 100644
index 0000000000..1d77b0a035
--- /dev/null
+++ b/scripts/docker-integration-tests/prom_remote_write_backend/m3aggregator.yml
@@ -0,0 +1,216 @@
+logging:
+  level: info
+
+metrics:
+  scope:
+    prefix: m3aggregator
+  prometheus:
+    onError: none
+    handlerPath: /metrics
+    listenAddress: 0.0.0.0:6002
+    timerType: histogram
+  sanitization: prometheus
+  samplingRate: 1.0
+  extended: none
+
+m3msg:
+  server:
+    listenAddress: 0.0.0.0:6000  # same port as the placement endpoints (m3aggregatorNN:6000)
+    retry:
+      maxBackoff: 10s
+      jitter: true
+  consumer:
+    messagePool:
+      size: 16384
+      watermark:
+        low: 0.2
+        high: 0.5
+
+http:
+  listenAddress: 0.0.0.0:6001  # published by docker-compose; the test polls /status here
+  readTimeout: 60s
+  writeTimeout: 60s
+
+kvClient:
+  etcd:
+    env: override_test_env
+    zone: embedded
+    service: m3aggregator
+    cacheDir: /var/lib/m3kv
+    etcdClusters:
+      - zone: embedded
+        endpoints:
+          - etcd01:2379
+
+runtimeOptions:
+  kvConfig:
+    environment: override_test_env
+    zone: embedded
+  writeValuesPerMetricLimitPerSecondKey: write-values-per-metric-limit-per-second
+  writeValuesPerMetricLimitPerSecond: 0
+  writeNewMetricLimitClusterPerSecondKey: write-new-metric-limit-cluster-per-second
+  writeNewMetricLimitClusterPerSecond: 0
+  writeNewMetricNoLimitWarmupDuration: 0
+
+aggregator:
+  hostID:
+    resolver: environment
+    envVarName: M3AGGREGATOR_HOST_ID  # set per container in docker-compose.yml
+  instanceID:
+    type: host_id
+  verboseErrors: true
+  metricPrefix: ""
+  counterPrefix: ""
+  timerPrefix: ""
+  gaugePrefix: ""
+  aggregationTypes:
+    counterTransformFnType: empty
+    timerTransformFnType: suffix
+    gaugeTransformFnType: empty
+    aggregationTypesPool:
+      size: 1024
+    quantilesPool:
+      buckets:
+        - count: 256
+          capacity: 4
+        - count: 128
+          capacity: 8
+  stream:
+    eps: 0.001
+    capacity: 32
+    streamPool:
+      size: 4096
+    samplePool:
+      size: 4096
+    floatsPool:
+      buckets:
+        - count: 4096
+          capacity: 16
+        - count: 2048
+          capacity: 32
+        - count: 1024
+          capacity: 64
+  client:
+    type: m3msg
+    m3msg:
+      producer:
+        writer:
+          topicName: aggregator_ingest
+          topicServiceOverride:
+            zone: embedded
+            environment: override_test_env
+          placement:
+            isStaged: true
+          placementServiceOverride:
+            namespaces:
+              placement: /placement
+          messagePool:
+            size: 16384
+            watermark:
+              low: 0.2
+              high: 0.5
+  placementManager:
+    kvConfig:
+      namespace: /placement
+      environment: override_test_env
+      zone: embedded
+    placementWatcher:
+      key: m3aggregator
+      initWatchTimeout: 10s
+  hashType: murmur32
+  bufferDurationBeforeShardCutover: 10m
+  bufferDurationAfterShardCutoff: 10m
+  bufferDurationForFutureTimedMetric: 10m # Allow test to write into future.
+  bufferDurationForPastTimedMetric: 10s # Don't wait too long for timed metrics to flush.
+  resignTimeout: 1m
+  flushTimesManager:
+    kvConfig:
+      environment: override_test_env
+      zone: embedded
+    flushTimesKeyFmt: shardset/%d/flush  # %d is presumably the shard set ID - TODO confirm
+    flushTimesPersistRetrier:
+      initialBackoff: 100ms
+      backoffFactor: 2.0
+      maxBackoff: 2s
+      maxRetries: 3
+  electionManager:
+    election:
+      leaderTimeout: 10s
+      resignTimeout: 10s
+      ttlSeconds: 10
+    serviceID:
+      name: m3aggregator
+      environment: override_test_env
+      zone: embedded
+    electionKeyFmt: shardset/%d/lock  # %d is presumably the shard set ID - TODO confirm
+    campaignRetrier:
+      initialBackoff: 100ms
+      backoffFactor: 2.0
+      maxBackoff: 2s
+      forever: true
+      jitter: true
+    changeRetrier:
+      initialBackoff: 100ms
+      backoffFactor: 2.0
+      maxBackoff: 5s
+      forever: true
+      jitter: true
+    resignRetrier:
+      initialBackoff: 100ms
+      backoffFactor: 2.0
+      maxBackoff: 5s
+      forever: true
+      jitter: true
+    campaignStateCheckInterval: 1s
+    shardCutoffCheckOffset: 30s
+  flushManager:
+    checkEvery: 1s
+    jitterEnabled: true
+    maxJitters:  # one jitter cap per flush interval
+      - flushInterval: 5s
+        maxJitterPercent: 1.0
+      - flushInterval: 10s
+        maxJitterPercent: 0.5
+      - flushInterval: 1m
+        maxJitterPercent: 0.5
+      - flushInterval: 10m
+        maxJitterPercent: 0.5
+      - flushInterval: 1h
+        maxJitterPercent: 0.25
+    numWorkersPerCPU: 0.5
+    flushTimesPersistEvery: 10s
+    maxBufferSize: 10m
+    forcedFlushWindowSize: 10s
+  flush:
+    handlers:
+      - dynamicBackend:
+          name: m3msg
+          hashType: murmur32
+          producer:
+            writer:
+              topicName: aggregated_metrics  # topic the test registers m3coordinator on as consumer
+              topicServiceOverride:
+                zone: embedded
+                environment: override_test_env
+              messagePool:
+                size: 16384
+                watermark:
+                  low: 0.2
+                  high: 0.5
+  passthrough:
+    enabled: true
+  forwarding:
+    maxConstDelay: 1m # Need to add some buffer window, since timed metrics by default are delayed by 1min.
+  entryTTL: 1h
+  entryCheckInterval: 10m
+  maxTimerBatchSizePerWrite: 140
+  maxNumCachedSourceSets: 2
+  discardNaNAggregatedValues: true
+  entryPool:
+    size: 4096
+  counterElemPool:
+    size: 4096
+  timerElemPool:
+    size: 4096
+  gaugeElemPool:
+    size: 4096
diff --git a/scripts/docker-integration-tests/prom_remote_write_backend/m3coordinator-admin.yml b/scripts/docker-integration-tests/prom_remote_write_backend/m3coordinator-admin.yml
new file mode 100644
index 0000000000..c3c08c0104
--- /dev/null
+++ b/scripts/docker-integration-tests/prom_remote_write_backend/m3coordinator-admin.yml
@@ -0,0 +1,29 @@
+listenAddress: 0.0.0.0:7201
+
+logging:
+  level: info
+
+metrics:
+  scope:
+    prefix: "coordinator"
+  prometheus:
+    handlerPath: /metrics
+    listenAddress: 0.0.0.0:7203 # until https://github.com/m3db/m3/issues/682 is resolved
+  sanitization: prometheus
+  samplingRate: 1.0
+  extended: none
+
+backend: noop-etcd  # presumably no storage, only etcd-backed cluster management - TODO confirm
+clusterManagement:
+  etcd:
+    env: default_env
+    zone: embedded
+    service: m3db
+    cacheDir: /var/lib/m3kv
+    etcdClusters:
+      - zone: embedded
+        endpoints:
+          - etcd01:2379
+
+tagOptions:
+  idScheme: quoted
diff --git a/scripts/docker-integration-tests/prom_remote_write_backend/m3coordinator.yml b/scripts/docker-integration-tests/prom_remote_write_backend/m3coordinator.yml
new file mode 100644
index 0000000000..d6c54c8430
--- /dev/null
+++ b/scripts/docker-integration-tests/prom_remote_write_backend/m3coordinator.yml
@@ -0,0 +1,102 @@
+listenAddress: 0.0.0.0:7201
+
+logging:
+  level: info
+
+metrics:
+  scope:
+    prefix: "coordinator"
+  prometheus:
+    handlerPath: /metrics
+    listenAddress: 0.0.0.0:7203 # until https://github.com/m3db/m3/issues/682 is resolved
+  sanitization: prometheus
+  samplingRate: 1.0
+  extended: none
+
+backend: prom-remote
+
+prometheusRemoteBackend:
+  endpoints:
+    - name: raw  # no storagePolicy on this endpoint
+      address: "http://prometheusraw:9090/api/v1/write"
+    - name: aggregated
+      address: "http://prometheusagg:9090/api/v1/write"
+      storagePolicy:  # same 5s:1h policy as the rollup rule's storagePolicies
+        retention: 1h
+        resolution: 5s
+        downsample:
+          all: true
+
+clusterManagement:
+  etcd:
+    env: default_env
+    zone: embedded
+    service: m3db
+    cacheDir: /var/lib/m3kv
+    etcdClusters:
+      - zone: embedded
+        endpoints:
+          - etcd01:2379
+
+tagOptions:
+  idScheme: quoted
+
+downsample:
+  rules:
+    rollupRules:
+      - name: "prom_remote_write_test_metric rolled up"
+        filter: "__name__:prom_remote_write_test_metric"
+        transforms:
+          - transform:
+              type: "Increase"
+          - rollup:
+              metricName: "prom_remote_write_test_metric_rolled_up"  # name queried by tests.sh
+              groupBy: []
+              aggregations: ["Sum"]
+          - transform:
+              type: "Add"
+        storagePolicies:
+          - resolution: 5s
+            retention: 1h
+  matcher:
+    requireNamespaceWatchOnInit: false
+  remoteAggregator:
+    client:
+      type: m3msg
+      m3msg:
+        producer:
+          writer:
+            topicName: aggregator_ingest
+            topicServiceOverride:
+              zone: embedded
+              environment: override_test_env
+            placement:
+              isStaged: true
+            placementServiceOverride:
+              namespaces:
+                placement: /placement
+            connection:
+              numConnections: 4
+            messagePool:
+              size: 16384
+              watermark:
+                low: 0.2
+                high: 0.5
+
+ingest:
+  ingester:
+    workerPoolSize: 10000
+    opPool:
+      size: 10000
+    retry:
+      maxRetries: 3
+      jitter: true
+    logSampleRate: 0.01
+  m3msg:
+    server:
+      listenAddress: "0.0.0.0:7507"  # same port as the m3coordinator01 placement endpoint
+      retry:
+        maxBackoff: 10s
+        jitter: true
+
+storeMetricsType: true
diff --git a/scripts/docker-integration-tests/prom_remote_write_backend/prometheus.yml b/scripts/docker-integration-tests/prom_remote_write_backend/prometheus.yml
new file mode 100644
index 0000000000..ddc9637b89
--- /dev/null
+++ b/scripts/docker-integration-tests/prom_remote_write_backend/prometheus.yml
@@ -0,0 +1,16 @@
+global:
+  external_labels:
+    role: "remote"
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: []
+
+rule_files: []
+
+scrape_configs: []  # nothing is scraped; samples arrive only via remote write
+
+remote_read: []
+
+remote_write: []
diff --git a/scripts/docker-integration-tests/prom_remote_write_backend/test.sh b/scripts/docker-integration-tests/prom_remote_write_backend/test.sh
new file mode 100755
index 0000000000..208bb88565
--- /dev/null
+++ b/scripts/docker-integration-tests/prom_remote_write_backend/test.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+#
+# Integration test for the coordinator's Prometheus remote-write backend.
+# Starts the docker-compose services step by step, bootstraps M3 through the
+# admin coordinator, then runs the checks defined in tests.sh.
+
+set -xe
+
+M3_PATH=${M3_PATH:-$GOPATH/src/github.com/m3db/m3}
+TESTDIR="$M3_PATH"/scripts/docker-integration-tests/
+REVISION=$(git rev-parse HEAD)
+export REVISION
+COMPOSE_FILE="$TESTDIR"/prom_remote_write_backend/docker-compose.yml
+PROMREMOTECLI_IMAGE=quay.io/m3db/prometheus_remote_client_golang:v0.4.3
+TEST_SUCCESS=false
+
+source "$TESTDIR"/common.sh
+source "$TESTDIR"/prom_remote_write_backend/utils.sh
+source "$TESTDIR"/prom_remote_write_backend/tests.sh
+
+echo "Pull containers required for test"
+docker pull "$PROMREMOTECLI_IMAGE"
+
+# Single-quoted so TEST_SUCCESS is read when the trap fires, not now; the inner
+# double quotes keep a compose path containing spaces as one argument.
+trap 'cleanup "${COMPOSE_FILE}" "${TEST_SUCCESS}"' EXIT
+
+echo "Run ETCD"
+docker-compose -f "${COMPOSE_FILE}" up -d etcd01
+
+echo "Run Coordinator in Admin mode"
+docker-compose -f "${COMPOSE_FILE}" up -d coordinatoradmin
+wait_until_ready "0.0.0.0:7201"
+
+initialize_m3_via_coordinator_admin
+
+echo "Run M3 containers"
+docker-compose -f "${COMPOSE_FILE}" up -d m3aggregator01
+docker-compose -f "${COMPOSE_FILE}" up -d m3aggregator02
+docker-compose -f "${COMPOSE_FILE}" up -d m3coordinator01
+
+echo "Start Prometheus containers"
+docker-compose -f "${COMPOSE_FILE}" up -d prometheusraw
+docker-compose -f "${COMPOSE_FILE}" up -d prometheusagg
+
+wait_until_leader_elected
+wait_until_ready "0.0.0.0:7202"
+
+echo "Running tests"
+
+test_prometheus_remote_write_multi_namespaces
+
+TEST_SUCCESS=true
diff --git a/scripts/docker-integration-tests/prom_remote_write_backend/tests.sh b/scripts/docker-integration-tests/prom_remote_write_backend/tests.sh
new file mode 100644
index 0000000000..b53403ef13
--- /dev/null
+++ b/scripts/docker-integration-tests/prom_remote_write_backend/tests.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+set -xe
+
+source "$M3_PATH"/scripts/docker-integration-tests/common.sh
+source "$M3_PATH"/scripts/docker-integration-tests/prom_remote_write_backend/utils.sh
+
+# Writes one datapoint through m3coordinator01, then checks it shows up in both Prometheus instances.
+function test_prometheus_remote_write_multi_namespaces {
+  local now now_truncated metric_name=prom_remote_write_test_metric
+  now=$(date +"%s")
+  now_truncated=$(( now - now % 5 ))  # round down to a multiple of 5 seconds
+  local prometheus_raw_local_address="0.0.0.0:9090"
+  local prometheus_agg_local_address="0.0.0.0:9091"
+
+  # NB(antanas): just sending metrics multiple times to make sure everything is stable after startup.
+  for _ in {1..10} ; do
+    prometheus_remote_write \
+      "$metric_name" "$now_truncated" 42 \
+      true "Expected request to succeed" \
+      200 "Expected request to return status code 200"
+  done
+
+  echo "Querying for data in raw prometheus"
+  query_metric "$metric_name" "$prometheus_raw_local_address"
+
+  echo "Querying for data in aggregated prometheus"
+  query_metric "${metric_name}_rolled_up" "$prometheus_agg_local_address"
+}
diff --git a/scripts/docker-integration-tests/prom_remote_write_backend/utils.sh b/scripts/docker-integration-tests/prom_remote_write_backend/utils.sh
new file mode 100644
index 0000000000..96ffa2211e
--- /dev/null
+++ b/scripts/docker-integration-tests/prom_remote_write_backend/utils.sh
@@ -0,0 +1,173 @@
+#!/usr/bin/env bash
+
+set -xe
+
+source "$M3_PATH"/scripts/docker-integration-tests/common.sh
+
+# Sends one datapoint to m3coordinator01 with the promremotecli container and
+# checks the result it reports.
+#   $1 metric name          $2 datapoint timestamp   $3 datapoint value
+#   $4 expected .success    $5 message printed when .success differs
+#   $6 expected .statusCode $7 message printed when .statusCode differs
+# Returns 0 when both fields match, 1 otherwise.
+function prometheus_remote_write {
+  local metric_name=$1
+  local datapoint_timestamp=$2
+  local datapoint_value=$3
+  local expect_success=$4
+  local expect_success_err=$5
+  local expect_status=$6
+  local expect_status_err=$7
+  local network_name network out success status
+
+  network_name="prom_remote_write_backend_backend"
+  network=$(docker network ls | grep -F "$network_name" | tr -s ' ' | cut -f 1 -d ' ' | tail -n 1)
+
+  # "|| true" keeps set -e from aborting here; a failed run is reported through
+  # the field comparisons below instead.
+  out=$( (docker run -it --rm --network "$network" \
+    "$PROMREMOTECLI_IMAGE" \
+    -u http://m3coordinator01:7201/api/v1/prom/remote/write \
+    -t __name__:"${metric_name}" \
+    -d "${datapoint_timestamp}","${datapoint_value}" | grep -v promremotecli_log) || true)
+
+  # promremotecli_log lines were already filtered out when $out was captured.
+  success=$(echo "$out" | jq .success)
+  status=$(echo "$out" | jq .statusCode)
+  if [[ "$success" != "$expect_success" ]]; then
+    echo "$expect_success_err"
+    return 1
+  fi
+  if [[ "$status" != "$expect_status" ]]; then
+    echo "${expect_status_err}: actual=${status}"
+    return 1
+  fi
+  echo "Returned success=${success}, status=${status} as expected"
+  return 0
+}
+
+# Retries until $1/ready answers with HTTP 200.
+function wait_until_ready {
+  local host=$1
+  echo "Check readiness probe eventually succeeds"
+  ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \
+    "[[ \$(curl --write-out \"%{http_code}\" --silent --output /dev/null $host/ready) -eq \"200\" ]]"
+}
+
+# Retries until the Prometheus query API at $2 returns a first-result value > 0 for metric $1.
+function query_metric {
+  local metric_name=$1
+  local host=$2
+  # URL and jq filter are quoted so "?" and "[0]" cannot be glob-expanded when
+  # retry_with_backoff runs the command string.
+  ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \
+    "[[ \$(curl -sSf \"$host/api/v1/query?query=$metric_name\" | jq -r '.data.result[0].value[1]') -gt 0 ]]"
+}
+
+# Retries until the /status output of either aggregator contains "leader".
+function wait_until_leader_elected {
+  ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \
+    "[[ \$(curl localhost:6001/status localhost:6002/status | grep leader) ]]"
+}
+
+# Shuts the stack down; prints the docker-compose logs first unless $2 is "true".
+#   $1 compose file   $2 "true" when the test passed
+function cleanup {
+  local compose_file=$1
+  local success=$2
+  if [[ "$success" != "true" ]]; then
+    echo "Test failure, printing docker-compose logs"
+    docker-compose -f "${compose_file}" logs
+  fi
+
+  docker-compose -f "${compose_file}" down || echo "unable to shutdown containers" # CI fails to stop all containers sometimes
+}
+
+# Bootstraps placements and m3msg topics through the admin coordinator
+# (localhost:7201). Order matters: each consumer is added to its topic only
+# after that service's placement exists.
+function initialize_m3_via_coordinator_admin {
+  local admin="localhost:7201"
+
+  echo "Initializing aggregator topology"
+  curl -vvvsSf -X POST -H "Cluster-Environment-Name: override_test_env" "$admin"/api/v1/services/m3aggregator/placement/init -d '{
+    "num_shards": 64,
+    "replication_factor": 2,
+    "instances": [
+      {
+        "id": "m3aggregator01",
+        "isolation_group": "availability-zone-a",
+        "zone": "embedded",
+        "weight": 100,
+        "endpoint": "m3aggregator01:6000",
+        "hostname": "m3aggregator01",
+        "port": 6000
+      },
+      {
+        "id": "m3aggregator02",
+        "isolation_group": "availability-zone-b",
+        "zone": "embedded",
+        "weight": 100,
+        "endpoint": "m3aggregator02:6000",
+        "hostname": "m3aggregator02",
+        "port": 6000
+      }
+    ]
+  }'
+
+  echo "Initializing m3msg inbound topic for m3aggregator ingestion from m3coordinators"
+  curl -vvvsSf -X POST -H "Topic-Name: aggregator_ingest" -H "Cluster-Environment-Name: override_test_env" "$admin"/api/v1/topic/init -d '{
+    "numberOfShards": 64
+  }'
+
+  # Do this after placement and topic for m3aggregator is created.
+  echo "Adding m3aggregator as a consumer to the aggregator ingest topic"
+  curl -vvvsSf -X POST -H "Topic-Name: aggregator_ingest" -H "Cluster-Environment-Name: override_test_env" "$admin"/api/v1/topic -d '{
+    "consumerService": {
+      "serviceId": {
+        "name": "m3aggregator",
+        "environment": "override_test_env",
+        "zone": "embedded"
+      },
+      "consumptionType": "REPLICATED",
+      "messageTtlNanos": "600000000000"
+    }
+  }' # msgs will be discarded after 600000000000ns = 10mins
+
+  echo "Initializing m3coordinator topology"
+  curl -vvvsSf -X POST "$admin"/api/v1/services/m3coordinator/placement/init -d '{
+    "instances": [
+      {
+        "id": "m3coordinator01",
+        "zone": "embedded",
+        "endpoint": "m3coordinator01:7507",
+        "hostname": "m3coordinator01",
+        "port": 7507
+      }
+    ]
+  }'
+  echo "Done initializing m3coordinator topology"
+
+  echo "Validating m3coordinator topology"
+  [ "$(curl -sSf "$admin"/api/v1/services/m3coordinator/placement | jq .placement.instances.m3coordinator01.id)" == '"m3coordinator01"' ]
+  echo "Done validating topology"
+
+  # Do this after placement for m3coordinator is created.
+  echo "Initializing m3msg outbound topic for m3coordinator ingestion from m3aggregators"
+  curl -vvvsSf -X POST -H "Topic-Name: aggregated_metrics" -H "Cluster-Environment-Name: override_test_env" "$admin"/api/v1/topic/init -d '{
+    "numberOfShards": 64
+  }'
+
+  echo "Adding m3coordinator as a consumer to the aggregator publish topic"
+  curl -vvvsSf -X POST -H "Topic-Name: aggregated_metrics" -H "Cluster-Environment-Name: override_test_env" "$admin"/api/v1/topic -d '{
+    "consumerService": {
+      "serviceId": {
+        "name": "m3coordinator",
+        "environment": "default_env",
+        "zone": "embedded"
+      },
+      "consumptionType": "SHARED",
+      "messageTtlNanos": "600000000000"
+    }
+  }' # msgs will be discarded after 600000000000ns = 10mins
+}
diff --git a/scripts/docker-integration-tests/run.sh b/scripts/docker-integration-tests/run.sh
index c0ff48903c..844d924fc6 100755
--- a/scripts/docker-integration-tests/run.sh
+++ b/scripts/docker-integration-tests/run.sh
@@ -17,6 +17,7 @@ TESTS=(
   scripts/docker-integration-tests/multi_cluster_write/test.sh
   scripts/docker-integration-tests/coordinator_config_rules/test.sh
   scripts/docker-integration-tests/coordinator_noop/test.sh
+  scripts/docker-integration-tests/prom_remote_write_backend/test.sh
 )
 
 # Some systems, including our default Buildkite hosts, don't come with netcat