diff --git a/ci/tests.sh b/ci/tests.sh index e326093a02..6720df94d7 100755 --- a/ci/tests.sh +++ b/ci/tests.sh @@ -14,10 +14,6 @@ pushd "${ROOT_DIR}" || exit 1 echo "Test helm templates generation" "${TESTS_DIR}/run.sh" || (echo "Failed testing templates" && exit 1) -# Test upgrade v2 script -echo "Test upgrade v2 script..." -"${TESTS_DIR}/upgrade_v2_script/run.sh" || (echo "Failed testing upgrade v2 script" && exit 1) - popd || exit 1 echo "DONE" diff --git a/deploy/helm/sumologic/upgrade-2.0.0.sh b/deploy/helm/sumologic/upgrade-2.0.0.sh deleted file mode 100755 index 5c4ba21698..0000000000 --- a/deploy/helm/sumologic/upgrade-2.0.0.sh +++ /dev/null @@ -1,1141 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail -IFS=$'\n\t' - -readonly OLD_VALUES_YAML="${1:---help}" -readonly PREVIOUS_VERSION=1.3 - -readonly TEMP_FILE=upgrade-2.0.0-temp-file - -readonly MIN_BASH_VERSION=4.0 -readonly MIN_YQ_VERSION=3.4.0 -readonly MAX_YQ_VERSION=4.0.0 - -readonly KEY_MAPPINGS=" -prometheus-operator.prometheusOperator.tlsProxy.enabled:kube-prometheus-stack.prometheusOperator.tls.enabled -otelcol.deployment.image.name:otelcol.deployment.image.repository -fluent-bit.image.fluent_bit.repository:fluent-bit.image.repository -fluent-bit.image.fluent_bit.tag:fluent-bit.image.tag -" - -readonly KEY_VALUE_MAPPINGS=" -" - -readonly KEY_MAPPINGS_MULTIPLE=" -image.repository:fluentd.image.repository:sumologic.setup.job.image.repository -image.tag:fluentd.image.tag:sumologic.setup.job.image.tag -image.pullPolicy:fluentd.image.pullPolicy:sumologic.setup.job.image.pullPolicy -" - -readonly KEYS_TO_DELETE=" -prometheus-operator -fluent-bit.metrics -fluent-bit.trackOffsets -fluent-bit.service.flush -fluent-bit.backend -fluent-bit.input -fluent-bit.parsers -fluent-bit.rawConfig -" - -# https://slides.com/perk/how-to-train-your-bash#/41 -readonly LOG_FILE="/tmp/$(basename "$0").log" -info() { echo -e "[INFO] $*" | tee -a "${LOG_FILE}" >&2 ; } -warning() { echo -e "[WARNING] $*" | tee -a "${LOG_FILE}" >&2 ; } -error() { echo -e "[ERROR] $*" | tee -a "${LOG_FILE}" >&2 ; } -fatal() { echo -e "[FATAL] $*" | tee -a "${LOG_FILE}" >&2 ; exit 1 ; } - -function print_help_and_exit() { - local MAN - set +e - read -r -d '' MAN < x >= ${MIN_YQ_VERSION}) https://github.com/mikefarah/yq/releases/tag/${MIN_YQ_VERSION} - grep - sed - bash (>= ${MIN_BASH_VERSION}) - -Usage: - # for default helm release name 'collection' and namespace 'sumologic' - ./upgrade-2.0.0.sh /path/to/values.yaml - - # for non-default helm release name and k8s namespace - ./upgrade-2.0.0.sh /path/to/values.yaml helm_release_name k8s_namespace - -Returns: - new_values.yaml - -For more details, please refer to Migration steps and Changelog here: -https://github.com/SumoLogic/sumologic-kubernetes-collection/blob/release-v2.0/deploy/docs/v2_migration_doc.md -EOF - set -e - - echo "${MAN}" - exit 0 -} - -function check_if_print_help_and_exit() { - if [[ "$1" == "--help" ]]; then - print_help_and_exit - fi -} - -function check_required_command() { - local command_to_check="$1" - command -v "${command_to_check}" >/dev/null 2>&1 || { error "Required command is missing: ${command_to_check}"; fatal "Please consult --help and install missing commands before continue. 
Aborting."; } -} - -function compare_versions() { - local no_lower_than="${1}" - local app_version="${2}" - - if [[ "$(printf '%s\n' "${app_version}" "${no_lower_than}" | sort -V | head -n 1)" == "${no_lower_than}" ]]; then - echo "pass" - else - echo "fail" - fi -} - -function check_app_version() { - local app_name="${1}" - local no_lower_than="${2}" - local app_version="${3}" - - if [[ -z ${app_version} ]] || [[ $(compare_versions "${no_lower_than}" "${app_version}") == "fail" ]]; then - error "${app_name} version: '${app_version}' is invalid - it should be no lower than ${no_lower_than}" - fatal "Please update your ${app_name} and retry." - fi -} - -function check_app_version_with_max() { - local app_name="${1}" - local no_lower_than="${2}" - local lower_than="${3}" - local app_version="${4}" - - if [[ -z ${app_version} ]] || [[ $(compare_versions "${no_lower_than}" "${app_version}") == "fail" ]]; then - error "${app_name} version: '${app_version}' is invalid - it should be no lower than ${no_lower_than}" - fatal "Please update your ${app_name} and retry." - fi - - if [[ "${app_version}" == "${lower_than}" ]] || [[ $(compare_versions "${app_version}" "${lower_than}") == "fail" ]]; then - error "${app_name} version: '${app_version}' is invalid - it should be lower than ${lower_than}" - fatal "Please downgrade '${app_name}' and retry." - fi -} - -function check_yq_version() { - local yq_version - readonly yq_version=$(yq --version | grep -oE '[^[:space:]]+$') - - check_app_version_with_max "yq" "${MIN_YQ_VERSION}" "${MAX_YQ_VERSION}" "${yq_version}" -} - -function check_bash_version() { - check_app_version "bash" "${MIN_BASH_VERSION}" "${BASH_VERSION}" -} - -function create_temp_file() { - echo -n > "${TEMP_FILE}" -} - -function migrate_prometheus_operator_to_kube_prometheus_stack() { - # Nothing to migrate, return - if [[ -z $(yq r "${TEMP_FILE}" prometheus-operator) ]] ; then - return - fi - - if [[ -n "$(yq r "${TEMP_FILE}" 'prometheus-operator.prometheus.prometheusSpec.containers.(name==prometheus-config-reloader)')" ]]; then - info "Migrating prometheus-config-reloader container to config-reloader in prometheusSpec" - yq m -i --arrays append \ - "${TEMP_FILE}" \ - <( - yq p <( - yq w <( - yq r "${TEMP_FILE}" -- 'prometheus-operator.prometheus.prometheusSpec.containers.(name==prometheus-config-reloader)' \ - ) name config-reloader \ - ) 'prometheus-operator.prometheus.prometheusSpec.containers[+]' - ) - yq d -i "${TEMP_FILE}" "prometheus-operator.prometheus.prometheusSpec.containers.(name==prometheus-config-reloader)" - fi - - info "Migrating from prometheus-operator to kube-prometheus-stack" - yq m -i \ - "${TEMP_FILE}" \ - <( - yq p \ - <(yq r "${TEMP_FILE}" "prometheus-operator") \ - "kube-prometheus-stack" \ - ) - yq d -i "${TEMP_FILE}" "prometheus-operator" -} - -function migrate_prometheus_retention_period() { - if [[ -z "$(yq r "${TEMP_FILE}" -- 'prometheus-operator')" ]]; then - return - fi - local RETENTION - readonly RETENTION="$(yq r "${TEMP_FILE}" -- 'prometheus-operator.prometheus.prometheusSpec.retention')" - - if [[ -z "${RETENTION}" ]]; then - return - fi - - if [[ "${RETENTION}" == "7d" ]]; then - info "Changing prometheus retention period from 7d to 1d" - yq w -i "${TEMP_FILE}" 'prometheus-operator.prometheus.prometheusSpec.retention' 1d - elif [[ "${RETENTION}" != "1d" ]]; then - warning "Prometheus retention set to ${RETENTION} (different than 7d - default for 1.3). 
Bailing migration to 1d (default for 2.0)" - fi -} - -function migrate_prometheus_recording_rules() { - if [[ -z "$(yq r "${TEMP_FILE}" -- 'prometheus-operator')" ]]; then - return - fi - - local RECORDING_RULES_OVERRIDE - readonly RECORDING_RULES_OVERRIDE=$(yq r "${TEMP_FILE}" -- 'prometheus-operator.kubeTargetVersionOverride') - - if [[ "${RECORDING_RULES_OVERRIDE}" == "1.13.0-0" ]]; then - add_prometheus_pre_1_14_recording_rules "${TEMP_FILE}" - info "Removing prometheus kubeTargetVersionOverride='1.13.0-0'" - yq d -i "${TEMP_FILE}" "prometheus-operator.kubeTargetVersionOverride" - elif [[ -n "${RECORDING_RULES_OVERRIDE}" ]]; then - warning "prometheus-operator.kubeTargetVersionOverride should be unset or set to '1.13.0-0' but it's set to: ${RECORDING_RULES_OVERRIDE}" - warning "Please unset it or set it to '1.13.0-0' and rerun this script" - fi -} - -function kube_prometheus_stack_update_remote_write_regexes() { - local URL_METRICS_OPERATOR_RULE - # shellcheck disable=SC2016 - readonly URL_METRICS_OPERATOR_RULE='http://$(FLUENTD_METRICS_SVC).$(NAMESPACE).svc.cluster.local:9888/prometheus.metrics.operator.rule' - - local PROMETHEUS_METRICS_OPERATOR_RULE_REGEX - readonly PROMETHEUS_METRICS_OPERATOR_RULE_REGEX="cluster_quantile:apiserver_request_latencies:histogram_quantile|instance:node_filesystem_usage:sum|instance:node_network_receive_bytes:rate:sum|cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile|cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile|cluster_quantile:scheduler_binding_latency:histogram_quantile|node_namespace_pod:kube_pod_info:|:kube_pod_info_node_count:|node:node_num_cpu:sum|:node_cpu_utilisation:avg1m|node:node_cpu_utilisation:avg1m|node:cluster_cpu_utilisation:ratio|:node_cpu_saturation_load1:|node:node_cpu_saturation_load1:|:node_memory_utilisation:|node:node_memory_bytes_total:sum|node:node_memory_utilisation:ratio|node:cluster_memory_utilisation:ratio|:node_memory_swap_io_bytes:sum_rate|node:node_memory_utilisation:|node:node_memory_utilisation_2:|node:node_memory_swap_io_bytes:sum_rate|:node_disk_utilisation:avg_irate|node:node_disk_utilisation:avg_irate|:node_disk_saturation:avg_irate|node:node_disk_saturation:avg_irate|node:node_filesystem_usage:|node:node_filesystem_avail:|:node_net_utilisation:sum_irate|node:node_net_utilisation:sum_irate|:node_net_saturation:sum_irate|node:node_net_saturation:sum_irate|node:node_inodes_total:|node:node_inodes_free:" - - local TEMP_REWRITE_PROMETHEUS_METRICS_OPERATOR_RULE - readonly TEMP_REWRITE_PROMETHEUS_METRICS_OPERATOR_RULE="$( - yq r "${TEMP_FILE}" \ - "kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite.\"url==${URL_METRICS_OPERATOR_RULE}\"" - )" - - local CURRENT_METRICS_OPERATOR_RULE_REGEX - readonly CURRENT_METRICS_OPERATOR_RULE_REGEX="$( - yq r "${TEMP_FILE}" \ - "kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite.\"url==${URL_METRICS_OPERATOR_RULE}\".writeRelabelConfigs[0].regex" - )" - if [[ -n "${CURRENT_METRICS_OPERATOR_RULE_REGEX}" ]]; then - if [[ -n $(diff <(echo "${PROMETHEUS_METRICS_OPERATOR_RULE_REGEX}") <(echo "${CURRENT_METRICS_OPERATOR_RULE_REGEX}")) ]] ; then - info "Updating prometheus regex in rewrite rule for url: ${URL_METRICS_OPERATOR_RULE} but it has a different value than expected" - info "Actual: '${CURRENT_METRICS_OPERATOR_RULE_REGEX}'" - info "Expected: '${PROMETHEUS_METRICS_OPERATOR_RULE_REGEX}'" - fi - fi - - if [[ -n "${TEMP_REWRITE_PROMETHEUS_METRICS_OPERATOR_RULE}" ]]; then - info "Updating prometheus regex in rewrite rule for 
url: ${URL_METRICS_OPERATOR_RULE}..." - # shellcheck disable=SC2016 - yq delete -i "${TEMP_FILE}" 'kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite."url==http://$(FLUENTD_METRICS_SVC).$(NAMESPACE).svc.cluster.local:9888/prometheus.metrics.operator.rule"' - - local SCRIPT - SCRIPT="$(cat <<- EOF - - command: update - path: 'kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite.[+]' - value: - url: ${URL_METRICS_OPERATOR_RULE} - remoteTimeout: 5s - writeRelabelConfigs: - - action: keep - regex: 'cluster_quantile:apiserver_request_duration_seconds:histogram_quantile|instance:node_filesystem_usage:sum|instance:node_network_receive_bytes:rate:sum|cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile|cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile|cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile|node_namespace_pod:kube_pod_info:|:kube_pod_info_node_count:|node:node_num_cpu:sum|:node_cpu_utilisation:avg1m|node:node_cpu_utilisation:avg1m|node:cluster_cpu_utilisation:ratio|:node_cpu_saturation_load1:|node:node_cpu_saturation_load1:|:node_memory_utilisation:|node:node_memory_bytes_total:sum|node:node_memory_utilisation:ratio|node:cluster_memory_utilisation:ratio|:node_memory_swap_io_bytes:sum_rate|node:node_memory_utilisation:|node:node_memory_utilisation_2:|node:node_memory_swap_io_bytes:sum_rate|:node_disk_utilisation:avg_irate|node:node_disk_utilisation:avg_irate|:node_disk_saturation:avg_irate|node:node_disk_saturation:avg_irate|node:node_filesystem_usage:|node:node_filesystem_avail:|:node_net_utilisation:sum_irate|node:node_net_utilisation:sum_irate|:node_net_saturation:sum_irate|node:node_net_saturation:sum_irate|node:node_inodes_total:|node:node_inodes_free:' - sourceLabels: [__name__] - EOF -)" - - yq w -i "${TEMP_FILE}" --script <(echo "${SCRIPT}") - fi - - ############################################################################## - - local URL_METRICS_CONTROL_PLANE_COREDNS -# shellcheck disable=SC2016 - readonly URL_METRICS_CONTROL_PLANE_COREDNS='http://$(FLUENTD_METRICS_SVC).$(NAMESPACE).svc.cluster.local:9888/prometheus.metrics.control-plane.coredns' - - local PROMETHEUS_METRICS_CONTROL_PLANE_COREDNS_REGEX - readonly PROMETHEUS_METRICS_CONTROL_PLANE_COREDNS_REGEX="coredns;(?:coredns_cache_(size|(hits|misses)_total)|coredns_dns_request_duration_seconds_(count|sum)|coredns_(dns_request|dns_response_rcode|forward_request)_count_total|process_(cpu_seconds_total|open_fds|resident_memory_bytes))" - - local TEMP_REWRITE_PROMETHEUS_METRICS_CONTROL_PLANE_COREDNS - readonly TEMP_REWRITE_PROMETHEUS_METRICS_CONTROL_PLANE_COREDNS="$( - yq r "${TEMP_FILE}" \ - "kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite.\"url==${URL_METRICS_CONTROL_PLANE_COREDNS}\"" - )" - - local CURRENT_METRICS_CONTROL_PLANE_COREDNS_REGEX - readonly CURRENT_METRICS_CONTROL_PLANE_COREDNS_REGEX="$( - yq r "${TEMP_FILE}" \ - "kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite.\"url==${URL_METRICS_CONTROL_PLANE_COREDNS}\".writeRelabelConfigs[0].regex" - )" - if [[ -n "${CURRENT_METRICS_CONTROL_PLANE_COREDNS_REGEX}" ]] ; then - if [[ -n $(diff <(echo "${PROMETHEUS_METRICS_CONTROL_PLANE_COREDNS_REGEX}") <(echo "${CURRENT_METRICS_CONTROL_PLANE_COREDNS_REGEX}")) ]] ; then - info "Updating prometheus regex in rewrite rule for url: ${URL_METRICS_CONTROL_PLANE_COREDNS} but it has a different value than expected" - info "Actual: '${CURRENT_METRICS_CONTROL_PLANE_COREDNS_REGEX}'" - info "Expected: 
'${PROMETHEUS_METRICS_CONTROL_PLANE_COREDNS_REGEX}'" - fi - fi - - if [[ -n "${TEMP_REWRITE_PROMETHEUS_METRICS_CONTROL_PLANE_COREDNS}" ]]; then - info "Updating prometheus regex in rewrite rule for url: ${URL_METRICS_CONTROL_PLANE_COREDNS}..." - yq delete -i "${TEMP_FILE}" "kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite.\"url==${URL_METRICS_CONTROL_PLANE_COREDNS}\"" - - local SCRIPT - SCRIPT="$(cat <<- EOF - - command: update - path: 'kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite.[+]' - value: - url: ${URL_METRICS_CONTROL_PLANE_COREDNS} - remoteTimeout: 5s - writeRelabelConfigs: - - action: keep - regex: 'coredns;(?:coredns_cache_(size|entries|(hits|misses)_total)|coredns_dns_request_duration_seconds_(count|sum)|coredns_(dns_request|dns_response_rcode|forward_request)_count_total|coredns_(forward_requests|dns_requests|dns_responses)_total|process_(cpu_seconds_total|open_fds|resident_memory_bytes))' - sourceLabels: [job, __name__] - EOF -)" - - yq w -i "${TEMP_FILE}" --script <(echo "${SCRIPT}") - fi -} - -function kube_prometheus_stack_migrate_remote_write_urls() { - info "Migrating prometheus remote write urls" - - # shellcheck disable=SC2016 - sed -i'.bak' \ - 's#http://$(CHART).$(NAMESPACE).svc.cluster.local:9888/prometheus#http://$(FLUENTD_METRICS_SVC).$(NAMESPACE).svc.cluster.local:9888/prometheus#g' \ - "${TEMP_FILE}" && \ - rm "${TEMP_FILE}".bak -} - -function kube_prometheus_stack_migrate_chart_env_variable() { - if [[ -z "$(yq r "${TEMP_FILE}" -- 'kube-prometheus-stack.prometheus.prometheusSpec.containers.(name==config-reloader).env.(name==CHART)')" ]]; then - return - fi - - yq w -i "${TEMP_FILE}" \ - 'kube-prometheus-stack.prometheus.prometheusSpec.containers.(name==config-reloader).env.(name==CHART).name' \ - FLUENTD_METRICS_SVC -} - -function add_prometheus_pre_1_14_recording_rules() { - local temp_file="${1}" - local PROMETHEUS_RULES - # Using tags below for heredoc - PROMETHEUS_RULES=$(cat <<- "EOF" - groups: - - name: node-pre-1.14.rules - rules: - - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) - record: :node_cpu_utilisation:avg1m - - expr: |- - 1 - avg by (node) ( - rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info:) - record: node:node_cpu_utilisation:avg1m - - expr: |- - 1 - - sum( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - / - sum(node_memory_MemTotal_bytes{job="node-exporter"}) - record: ':node_memory_utilisation:' - - expr: |- - sum by (node) ( - ( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_available:sum - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - node:node_memory_bytes_total:sum - record: node:node_memory_utilisation:ratio - - expr: |- - 1 - - sum by (node) ( - ( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - 
node_namespace_pod:kube_pod_info: - ) - record: 'node:node_memory_utilisation:' - - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) - record: 'node:node_memory_utilisation_2:' - - expr: |- - max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: 'node:node_filesystem_usage:' - - expr: |- - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_total:sum - - expr: |- - sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_utilisation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_utilisation:sum_irate - - expr: |- - sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_saturation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_saturation:sum_irate - - expr: |- - max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: 'node:node_filesystem_usage:' - - expr: |- - sum(node_load1{job="node-exporter"}) - / - sum(node:node_num_cpu:sum) - record: ':node_cpu_saturation_load1:' - - expr: |- - sum by (node) ( - node_load1{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - node:node_num_cpu:sum - record: 'node:node_cpu_saturation_load1:' - - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_saturation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_saturation:avg_irate - - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_utilisation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_utilisation:avg_irate - - expr: |- - 1e3 * sum( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - ) - record: :node_memory_swap_io_bytes:sum_rate - - 
expr: |- - 1e3 * sum by (node) ( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_swap_io_bytes:sum_rate - - expr: |- - node:node_cpu_utilisation:avg1m - * - node:node_num_cpu:sum - / - scalar(sum(node:node_num_cpu:sum)) - record: node:cluster_cpu_utilisation:ratio - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - scalar(sum(node:node_memory_bytes_total:sum)) - record: node:cluster_memory_utilisation:ratio - - expr: |- - sum by (node) ( - node_load1{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - node:node_num_cpu:sum - record: 'node:node_cpu_saturation_load1:' - - expr: |- - max by (instance, namespace, pod, device) ( - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - / - node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - ) - record: 'node:node_filesystem_avail:' - - expr: |- - max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: 'node:node_filesystem_usage:' - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - ( - max(node_filesystem_files{job="node-exporter", mountpoint="/"}) - by (instance) - ), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: 'node:node_inodes_total:' - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - ( - max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) - by (instance) - ), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: 'node:node_inodes_free:' - EOF -) - - info "Adding 'additionalPrometheusRulesMap.pre-1.14-node-rules' to prometheus configuration" - yq w -i "${temp_file}" 'prometheus-operator.additionalPrometheusRulesMap."pre-1.14-node-rules"' \ - --from <(echo "${PROMETHEUS_RULES}") -} - -function add_new_scrape_labels_to_prometheus_service_monitors(){ - if [[ -z "$(yq r "${TEMP_FILE}" -- 'prometheus-operator.prometheus.additionalServiceMonitors')" ]]; then - return - fi - - info "Adding 'sumologic.com/scrape: \"true\"' scrape labels to prometheus service monitors" - yq w --style double -i "${TEMP_FILE}" \ - 'prometheus-operator.prometheus.additionalServiceMonitors.[*].selector.matchLabels."sumologic.com/scrape"' true -} - -function kube_prometheus_stack_set_remote_write_timeout_to_5s() { - local prometheus_spec - readonly prometheus_spec="$(yq r "${TEMP_FILE}" -- "kube-prometheus-stack.prometheus.prometheusSpec")" - - if [[ -z "$(yq r - 'remoteWrite' <<< "${prometheus_spec}")" ]]; then - # No remoteWrite to migrate - return - fi - - if [[ -n "$(yq r - 'remoteWrite.[*].remoteTimeout' <<< "${prometheus_spec}")" ]]; then - echo - info "kube-prometheus-stack.prometheus.prometheusSpec.remoteWrite.[*].remoteTimeout is set" - info "Please note that we've set it by default to 5s in 2.0.0" - fi - - local new_remote_write - new_remote_write="" - - local remote_timeout - local remote_write - - local len - readonly len="$(yq r --length - "remoteWrite" <<< "${prometheus_spec}")" - for (( i=0; i - ``` - - to - - ```yaml - 
spec: - selector: - matchLabels: - app.kubernetes.io/name: fluent-bit - app.kubernetes.io/instance: - ``` - -- Persistence for Fluentd is enabled by default. - -- Gzip compression is enabled by default. - If you already had Fluentd persistence enabled, but gzip compression was disabled, - after the upgrade Fluentd will not be able to read the non-compressed chunks written before the upgrade. - To fix this, see [Troubleshooting - Gzip compression errors](#gzip-compression-errors). - -### How to upgrade - -#### Requirements - -- `helm3` -- `yq` in version: `3.4.0` <= `x` < `4.0.0` -- `bash` 4.0 or higher - -**Note: The below steps are using Helm 3. Helm 2 is not supported.** - -#### 1. Upgrade to helm chart version `v1.3.5` - -If you're running a newer version than `v1.3.5`, instructions from this document -will also work for you. - -##### Ensure you have sumologic helm repo added - -Before running commands shown below please make sure that you have -sumologic helm repo configured. -One can check that using: - -```bash -helm repo list -NAME URL -... -sumologic https://sumologic.github.io/sumologic-kubernetes-collection -... -``` - -If sumologic helm repo is not configured use the following command to add it: - -```bash -helm repo add sumologic https://sumologic.github.io/sumologic-kubernetes-collection -``` - -##### Update - -Run the command shown below to fetch the latest helm chart: - -```bash -helm repo update -``` - -For users who are not already on `v1.3.5` of the helm chart, please upgrade -to that version first by running the below command: - -```bash -helm upgrade sumologic/sumologic --reuse-values --version=1.3.5 -``` - -#### 2. Upgrade Prometheus CRDs - -Due to changes in `kube-prometheus-stack` which this chart depends on, one will -need to run the following commands in order to update Prometheus related CRDs: - -```bash -kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.43.2/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml -kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.43.2/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml -kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.43.2/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml -kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.43.2/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml -kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.43.2/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml -kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.43.2/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml -kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.43.2/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml -kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.43.2/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml -``` - -If you have a separate Prometheus operator installation, you need to make sure its version -is [v0.43.2](https://github.com/prometheus-operator/prometheus-operator/releases/tag/v0.43.2) -or higher but compatible before proceeding with the next steps of the collection upgrade. - -#### 3. 
Prepare Fluent Bit instance - -As `spec.selector` in Fluent Bit Helm chart was modified, it is required to manually recreate -or delete existing DaemonSet with old version of `spec.selector` before upgrade. - -One of the following two strategies can be used: - -- ##### Recreating Fluent Bit DaemonSet - - Recreating Fluent Bit DaemonSet with new `spec.selector` may cause that - applications' logs and Fluent Bit metrics will not be available in the time of recreation. - It usually shouldn't take more than several seconds. - - To recreate the Fluent Bit DaemonSet with new `spec.selector` one can run the following command: - - ```bash - kubectl get daemonset --namespace --selector "app=fluent-bit,release=" --output yaml | \ - yq w - "items[*].spec.selector.matchLabels[app.kubernetes.io/name]" "fluent-bit" | \ - yq w - "items[*].spec.selector.matchLabels[app.kubernetes.io/instance]" "" | \ - yq w - "items[*].spec.template.metadata.labels[app.kubernetes.io/name]" "fluent-bit" | \ - yq w - "items[*].spec.template.metadata.labels[app.kubernetes.io/instance]" "" | \ - yq d - "items[*].spec.selector.matchLabels[app]" | \ - yq d - "items[*].spec.selector.matchLabels[release]" | \ - kubectl apply --namespace --force --filename - - ``` - - **Notice** When DaemonSet managed by helm is modified by the command specified above, - one might expect a warning similar to the one below: - `Warning: kubectl apply should be used on resource created by either kubectl create --save-config or kubectl apply` - -- ##### Preparing temporary instance of Fluent Bit - - Create temporary instance of Fluent Bit and delete DaemonSet with old version of `spec.selector`. - This will cause application' logs to be duplicated until temporary instance of Fluent Bit is deleted - after the upgrade is complete. As temporary instance of Fluent Bit creates additional Pods - which are selected by the same Fluent Bit Service you may observe changes in Fluent Bit metrics. - - Copy of database, in which Fluent Bit keeps track of monitored files and offsets, - is used by temporary instance of Fluent Bit (Fluent Bit database is copied in initContainer). - Temporary instance of Fluent Bit will start reading logs with offsets saved in database. 
- - To create a temporary copy of Fluent Bit DaemonSet: - - ```bash - INIT_CONTAINER=$(cat <<-"EOF" - name: init-tmp-fluent-bit - image: busybox:latest - command: ['sh', '-c', 'mkdir -p /tail-db/tmp; cp /tail-db/*.db /tail-db/tmp'] - volumeMounts: - - mountPath: /tail-db - name: tail-db - EOF - ) && \ - TMP_VOLUME=$(cat <<-"EOF" - hostPath: - path: /var/lib/fluent-bit/tmp - type: DirectoryOrCreate - name: tmp-tail-db - EOF - ) && \ - kubectl get daemonset --namespace --selector "app=fluent-bit,release=" --output yaml | \ - yq w - "items[*].metadata.name" "tmp-fluent-bit" | \ - yq w - "items[*].metadata.labels[heritage]" "tmp" | \ - yq w - "items[*].spec.template.metadata.labels[app.kubernetes.io/name]" "fluent-bit" | \ - yq w - "items[*].spec.template.metadata.labels[app.kubernetes.io/instance]" "" | \ - yq w - "items[*].spec.template.spec.initContainers[+]" --from <(echo "${INIT_CONTAINER}") | \ - yq w - "items[*].spec.template.spec.volumes[+]" --from <(echo "${TMP_VOLUME}") | \ - yq w - "items[*].spec.template.spec.containers[*].volumeMounts[*].(.==tail-db)" "tmp-tail-db" | \ - kubectl create --filename - - ``` - - Please make sure that Pods related to new DaemonSet are running: - - ```bash - kubectl get pod \ - --namespace \ - --selector "app=fluent-bit,release=,app.kubernetes.io/name=fluent-bit,app.kubernetes.io/instance=" - ``` - - Please check that the latest logs are duplicated in Sumo. - - To delete Fluent Bit DaemonSet with old version of `spec.selector`: - - ```bash - kubectl delete daemonset \ - --namespace \ - --selector "app=fluent-bit,heritage=Helm,release=" - ``` - - **Notice:** When collection upgrade creates new DaemonSet for Fluent Bit, - logs will be duplicated. - In order to stop data duplication it is required to remove the temporary copy - of Fluent Bit DaemonSet after the upgrade has finished. - - After collection upgrade is done, in order to remove the temporary Fluent Bit - DaemonSet run the following commands: - - ```bash - kubectl wait --for=condition=ready pod \ - --namespace \ - --selector "app.kubernetes.io/name=fluent-bit,app.kubernetes.io/instance=,app!=fluent-bit,release!=" && \ - kubectl delete daemonset \ - --namespace \ - --selector "app=fluent-bit,release=,heritage=tmp" - ``` - -#### 4. Configure Fluentd persistence - -Starting with `v2.0.0` we're using file-based buffer for Fluentd instead of less -reliable in-memory buffer (`fluentd.persistence.enabled=true`) by default. - -When Fluentd persistence is enabled then no action is required in order to upgrade. - -When Fluentd persistence is disabled (default setting in `1.3.5` release) -it is required to either go through persistence enabling procedure before upgrade (recommended) -or preserve existing setting and modify default setting for Fluentd persistence in `2.0.0` release. - -**In order to enable persistence in existing collection** please follow one -of persistence enabling procedures described in -[Enabling Fluentd Persistence guide](fluentd-persistence.md#enabling-fluentd-persistence) -before upgrade. - -If Fluentd persistence is disabled and it is desired to preserve this setting, -modify defaults and disable persistence either by adding `--set fluentd.persistence.enabled=false` -to `helm upgrade` command or in the `user-values.yaml` file under the `fluentd` key as follows: - -```yaml -fluentd: - persistence: - enabled: false -``` - -#### 5. Run upgrade script - -For Helm users, the only breaking changes are the renamed config parameters. 
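As a hedged illustration of what such a rename looks like, here is a minimal before/after sketch of one mapping taken from the `KEY_MAPPINGS` list in `upgrade-2.0.0.sh` above (`fluent-bit.image.fluent_bit.*` moving to `fluent-bit.image.*`); the repository and tag values are placeholders borrowed from the test fixture further below and are not necessarily what your file contains:

```yaml
# 1.3.x user-values.yaml (old key layout)
fluent-bit:
  image:
    fluent_bit:
      repository: public.ecr.aws/sumologic/fluent-bit
      tag: 1.6.10
---
# 2.0.0 user-values.yaml (same setting after the rename)
fluent-bit:
  image:
    repository: public.ecr.aws/sumologic/fluent-bit
    tag: 1.6.10
```

The `prometheus-operator` to `kube-prometheus-stack` move and the other `KEY_MAPPINGS` entries follow the same pattern; the upgrade script applies all of them for you, as described next.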
-For users who use a `user-values.yaml` file, we provide a script that users can run -to convert their existing `user-values.yaml` file into one that is compatible with the major release. - -- Get the existing values for the helm chart and store it as `current_user-values.yaml` - with the below command: - - ```bash - helm get values --output yaml > current_user-values.yaml - ``` - -- Run the upgrade script. You can run it: - - - On your the host. Please refer to the [requirements](#requirements) so that you have - all the required software packages installed. - - ```bash - curl -LJO https://raw.githubusercontent.com/SumoLogic/sumologic-kubernetes-collection/release-v2.0/deploy/helm/sumologic/upgrade-2.0.0.sh \ - && chmod +x upgrade-2.0.0.sh \ - && ./upgrade-2.0.0.sh current_user-values.yaml - ``` - - - In a docker container: - - ```bash - cat current_user-values.yaml | \ - docker run \ - --rm \ - -i sumologic/kubernetes-tools:2.13.0 upgrade-2.0 | \ - tee new_user-values.yaml - ``` - - Note that this will output both migration script logs and new values file but - only the values file contents will be put into `new_user-values.yaml` due to `tee`. - - - In a container on your cluster: - - ```bash - cat current_user-values.yaml | \ - kubectl run kubernetes-tools -i \ - --quiet \ - --rm \ - --restart=Never \ - --image sumologic/kubernetes-tools:2.13.0 -- upgrade-2.0 | \ - tee new_user-values.yaml - ``` - - Note that this will output both migration script logs and new values file but - only the values file contents will be put into `new_user-values.yaml` due to `tee`. - -- At this point you should have `new_user-values.yaml` in your working directory which - can be used for the upgrade: - - ```bash - helm upgrade sumologic/sumologic --version=2.0.0 -f new_user-values.yaml - ``` - -#### 6. Troubleshooting - -##### Gzip compression errors - -If after the upgrade you can see the following errors from Fluentd logs or metrics pods: - -```console -2021-01-18 15:47:23 +0000 [warn]: #0 [sumologic.endpoint.logs.gc] failed to flush the buffer. retry_time=3 next_retry_seconds=2021-01-18 15:47:27 +0000 chunk="5b92e97a5ee3cbd7e59859644d9686e3" error_class=Zlib::GzipFile::Error error="not in gzip format" -``` - -This means Fluentd persistence has already been enabled before migration, but gzip compression was not. -Fluentd is unable to read the non-compressed chunks persisted before the upgrade. - -To fix this, delete the Fluentd pods that emit this error, -deleting their PVC-s at the same time to drop the non-compressed chunks. - -For example, if the namespace where the collection is installed is `collection` -and the pod that emits the error is named `sumologic-fluentd-logs-1`, -run the following set of commands: - -```bash -NAMESPACE_NAME=collection -POD_NAME=sumologic-fluentd-logs-1 -kubectl -n ${NAMESPACE_NAME} delete pvc "buffer-${POD_NAME}" & -kubectl -n ${NAMESPACE_NAME} delete pod ${POD_NAME} -kubectl -n ${NAMESPACE_NAME} delete pod ${POD_NAME} -``` - -The duplicated pod deletion command is there to make sure the pod is not stuck in `Pending` state -with event `persistentvolumeclaim "buffer-sumologic-fluentd-logs-1" not found`. - -## Non-Helm Users - -### Breaking Changes - -- From `v2.0.0` we recommend to use helm3 template as replacement for pre-generated - kubernetes templates. - Because of that, all custom changes made to the templates should be moved to `user-values.yaml`. - This will simplify and improve experience for non-helm installation. - -### How to upgrade for Non-helm Users - -#### 1. 
Tear down existing Fluentd, Prometheus, Fluent Bit and Falco resources - -You will need the YAML files you created when you first installed collection. -Run the following commands to remove Falco, Fluent-bit, Prometheus Operator and FluentD. -You do not need to delete the Namespace and Secret you originally created as they will still be used. - -```sh -kubectl delete -f falco.yaml -kubectl delete -f fluent-bit.yaml -kubectl delete -f prometheus.yaml -kubectl delete -f fluentd-sumologic.yaml -``` - -#### 2. Deploy collection with new approach - -- Follow steps mentioned [here][non_helm_installation_customizing_installation] - to deploy new collection. - -[non_helm_installation_customizing_installation]: https://github.com/SumoLogic/sumologic-kubernetes-collection/blob/release-v2.0/deploy/docs/Non_Helm_Installation.md#customizing-installation diff --git a/tests/helm/upgrade_v2_script/run.sh b/tests/helm/upgrade_v2_script/run.sh deleted file mode 100755 index 0afb92780d..0000000000 --- a/tests/helm/upgrade_v2_script/run.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -# Test generation: -# export test_name=example_test; \ -# bash deploy/helm/sumologic/upgrade-2.0.0.sh \ -# tests/upgrade_v2_script/static/${test_name}.input.yaml \ -# 1> tests/upgrade_v2_script/static/${test_name}.log 2>&1 \ -# && cp new_values.yaml tests/upgrade_v2_script/static/${test_name}.output.yaml - -SCRIPT_PATH="$( dirname "$(realpath "${0}")" )" - -# shellcheck disable=SC1090 -# shellcheck source=tests/helm/functions.sh -source "${SCRIPT_PATH}/../functions.sh" -readonly TEST_TMP_OUT="${SCRIPT_PATH}/tmp/out.log" - -set_variables "${SCRIPT_PATH}" - -prepare_tests - -TEST_SUCCESS=true -for input_file in ${TEST_INPUT_FILES}; do - test_name="${input_file//.input.yaml/}" - output_file="${test_name}.output.yaml" - log_file="${test_name}.log" - - test_start "${test_name}" - bash "${TEST_SCRIPT_PATH}/../../../deploy/helm/sumologic/upgrade-2.0.0.sh" "${TEST_STATICS_PATH}/${input_file}" 1>"${TEST_TMP_OUT}" 2>&1 - mv new_values.yaml "${TEST_OUT}" - - test_output=$(diff "${TEST_STATICS_PATH}/${output_file}" "${TEST_OUT}") - test_log=$(diff "${TEST_STATICS_PATH}/${log_file}" "${TEST_TMP_OUT}") - rm "${TEST_TMP_OUT}" "${TEST_OUT}" - - if [[ -n "${test_output}" || -n "${test_log}" ]]; then - if [[ -n "${test_output}" ]]; then - echo -e "\tOutput diff (${TEST_STATICS_PATH}/${output_file}):\n${test_output}" - fi - if [[ -n "${test_log}" ]]; then - echo -e "\tLog diff (${TEST_STATICS_PATH}/${log_file}):\n${test_log}" - fi - test_failed "${test_name}" - TEST_SUCCESS=false - else - test_passed "${test_name}" - fi -done - -cleanup_tests - -if [[ "${TEST_SUCCESS}" = "true" ]]; then - exit 0 -else - exit 1 -fi diff --git a/tests/helm/upgrade_v2_script/static/fluent_bit.input.yaml b/tests/helm/upgrade_v2_script/static/fluent_bit.input.yaml deleted file mode 100644 index 7aea75f186..0000000000 --- a/tests/helm/upgrade_v2_script/static/fluent_bit.input.yaml +++ /dev/null @@ -1,160 +0,0 @@ -fluent-bit: - podLabels: {} - podAnnotations: {} - env: - - name: CHART - valueFrom: - configMapKeyRef: - name: sumologic-configmap - key: fluentdLogs - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - extraVolumes: - - name: project2-db - emptyDir: {} - - name: project-db - emptyDir: {} - - name: project2-alerts - hostPath: - path: /var/project2/logs/alerts/alerts.json - type: File - - name: company-db - emptyDir: {} - extraVolumeMounts: - - name: project2-db - mountPath: /fluent-bit/project2 - - name: project-db - 
mountPath: /fluent-bit/project - - name: project2-alerts - mountPath: /var/project2/logs/alerts/alerts.json - readOnly: true - - name: company-db - mountPath: /fluent-bit/company - tolerations: - - effect: NoSchedule - operator: Exists - image: - fluent_bit: - repository: public.ecr.aws/sumologic/fluent-bit - tag: 1.6.10 - pullPolicy: Always - service: - flush: 5 - metrics: - enabled: true - trackOffsets: true - - backend: - type: forward - forward: - ## NOTE: Requires trailing "." for fully-qualified name resolution - host: ${CHART}.${NAMESPACE}.svc.cluster.local. - port: 24321 - tls: "off" - tls_verify: "on" - tls_debug: 1 - shared_key: - input: - systemd: - enabled: true - parsers: - enabled: true - ## This regex matches the first line of a multiline log starting with a date of the format : "2019-11-17 07:14:12" or "2019-11-17T07:14:12" - regex: - - name: multi_line - regex: (?^{"log":"\d{4}-\d{1,2}-\d{1,2}.\d{2}:\d{2}:\d{2}.*) - - name: crio - regex: ^(?