From a69a2e3bd9e9ca21b9bebccb6f80c0a6656be125 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Thu, 10 Aug 2023 17:04:36 +0300 Subject: [PATCH] Changes representative of linux-3.10.0-1160.99.1.el7.tar.xz --- Makefile | 2 +- arch/x86/include/asm/microcode.h | 1 + arch/x86/include/asm/microcode_amd.h | 4 +- arch/x86/include/asm/msr-index.h | 8 +- arch/x86/kernel/cpu/amd.c | 197 ++++++---- arch/x86/kernel/cpu/common.c | 2 + fs/gfs2/dir.c | 39 +- fs/gfs2/quota.c | 9 +- include/net/netfilter/nf_conntrack_core.h | 7 +- kernel/sched/fair.c | 115 +++--- net/netfilter/nf_conntrack_ftp.c | 5 +- net/netfilter/nf_conntrack_h323_main.c | 3 +- net/netfilter/nf_conntrack_irc.c | 5 +- net/netfilter/nf_conntrack_pptp.c | 4 +- net/netfilter/nf_conntrack_proto_tcp.c | 447 +++++++++++++++------- net/netfilter/nf_conntrack_sane.c | 5 +- net/sched/cls_flower.c | 3 + scripts/kernel.spec | 38 +- 18 files changed, 589 insertions(+), 305 deletions(-) diff --git a/Makefile b/Makefile index ff4ebedce6d8c0..b1bb02db24fc71 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ EXTRAVERSION = NAME = Unicycling Gorilla RHEL_MAJOR = 7 RHEL_MINOR = 9 -RHEL_RELEASE = 1160.95.1 +RHEL_RELEASE = 1160.99.1 # # DRM backport version diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index a4ae5260f1f87b..bb1deda1353624 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -4,6 +4,7 @@ #include #include #include +#include #define native_rdmsr(msr, val1, val2) \ do { \ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 2b907ec5936431..70123d2610dbc4 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -47,11 +47,13 @@ extern void __init load_ucode_amd_bsp(unsigned int family); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); void reload_ucode_amd(void); +extern void amd_check_microcode(void); #else static inline void __init load_ucode_amd_bsp(unsigned int family) {} static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } -void reload_ucode_amd(void) {} +static inline void reload_ucode_amd(void) {} +static inline void amd_check_microcode(void) {} #endif extern bool check_current_patch_level(u32 *rev, bool early); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 38110791185131..7b8e27a3789ec3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -395,6 +395,12 @@ #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 + +#define MSR_AMD64_DE_CFG 0xc0011029 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) +#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9 + #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 @@ -447,8 +453,6 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c -#define MSR_F10H_DECFG 0xc0011029 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 619851c677bd2e..4389d288e72acd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -28,6 +28,74 @@ */ static u32 
nodes_per_socket = 1; +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +static const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + +static const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); + +static const int amd_zenbleed[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), + AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), + AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) +{ + int osvw_id = *erratum++; + u32 range; + u32 ms; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 4) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -637,10 +705,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) amd_get_topology_early(c); } -static const int amd_erratum_383[]; -static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); - void init_spectral_chicken(struct cpuinfo_x86 *c) { u64 value; @@ -689,6 +753,47 @@ static void init_amd_zn(struct cpuinfo_x86 *c) } } +static bool cpu_has_zenbleed_microcode(void) +{ + u32 good_rev = 0; + + switch (boot_cpu_data.x86_model) { + case 0x30 ... 0x3f: good_rev = 0x0830107a; break; + case 0x60 ... 0x67: good_rev = 0x0860010b; break; + case 0x68 ... 0x6f: good_rev = 0x08608105; break; + case 0x70 ... 0x7f: good_rev = 0x08701032; break; + case 0xa0 ... 
0xaf: good_rev = 0x08a00008; break; + + default: + return false; + break; + } + + if (boot_cpu_data.microcode < good_rev) + return false; + + return true; +} + +static void zenbleed_check(struct cpuinfo_x86 *c) +{ + if (!cpu_has_amd_erratum(c, amd_zenbleed)) + return; + + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return; + + if (!cpu_has(c, X86_FEATURE_AVX)) + return; + + if (!cpu_has_zenbleed_microcode()) { + pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n"); + msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } else { + msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } +} + static void init_amd(struct cpuinfo_x86 *c) { unsigned long long value; @@ -819,10 +924,10 @@ static void init_amd(struct cpuinfo_x86 *c) * but msr_set_bit() uses rdmsrl_safe() and wrmsrl_safe(). */ if (c->x86 > 0xf) - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); - /* LFENCE with MSR_F10H_DECFG[1]=1 stops RDTSC speculation */ + /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } @@ -903,6 +1008,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x10 || c->x86 == 0x12) set_cpu_cap(c, X86_FEATURE_IBP_DISABLE); + + zenbleed_check(c); } #ifdef CONFIG_X86_32 @@ -1009,70 +1116,6 @@ static const struct cpu_dev amd_cpu_dev = { cpu_dev_register(amd_cpu_dev); -/* - * AMD errata checking - * - * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or - * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that - * have an OSVW id assigned, which it takes as first argument. Both take a - * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). - * - * Example: - * - * const int amd_erratum_319[] = - * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), - * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), - * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); - */ - -#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } -#define AMD_OSVW_ERRATUM(osvw_id, ...) 
{ osvw_id, __VA_ARGS__, 0 } -#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ - ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) -#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) -#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) -#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) - -static const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), - AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); - -static const int amd_erratum_383[] = - AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); - - -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) -{ - int osvw_id = *erratum++; - u32 range; - u32 ms; - - if (osvw_id >= 0 && osvw_id < 65536 && - cpu_has(cpu, X86_FEATURE_OSVW)) { - u64 osvw_len; - - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); - if (osvw_id < osvw_len) { - u64 osvw_bits; - - rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), - osvw_bits); - return osvw_bits & (1ULL << (osvw_id & 0x3f)); - } - } - - /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_mask; - while ((range = *erratum++)) - if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && - (ms >= AMD_MODEL_RANGE_START(range)) && - (ms <= AMD_MODEL_RANGE_END(range))) - return true; - - return false; -} - void set_dr_addr_mask(unsigned long mask, int dr) { if (!cpu_has_bpext) @@ -1091,3 +1134,15 @@ void set_dr_addr_mask(unsigned long mask, int dr) break; } } + +static void zenbleed_check_cpu(void *unused) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + zenbleed_check(c); +} + +void amd_check_microcode(void) +{ + on_each_cpu(zenbleed_check_cpu, NULL, 1); +} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2b156d5eaaeaaa..b3f97ecffd5692 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1958,4 +1958,6 @@ void cpu_init(void) void microcode_check(void) { perf_check_microcode(); + + amd_check_microcode(); } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c5e83830a5cdbb..d1986d4d6b482a 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -366,23 +366,17 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - if (is_vmalloc_addr(hc)) - vfree(hc); - else - kfree(hc); + kvfree(hc); return ERR_PTR(ret); } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) { - if (is_vmalloc_addr(hc)) - vfree(hc); - else - kfree(hc); - } else { + if (likely(!ip->i_hash_cache)) { ip->i_hash_cache = hc; + hc = NULL; } spin_unlock(&inode->i_lock); + kvfree(hc); return ip->i_hash_cache; } @@ -400,10 +394,7 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip) spin_lock(&ip->i_inode.i_lock); hc = ip->i_hash_cache; ip->i_hash_cache = NULL; - if (is_vmalloc_addr(hc)) - vfree(hc); - else - kfree(hc); + kvfree(hc); spin_unlock(&ip->i_inode.i_lock); } @@ -1216,10 +1207,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); out_kfree: - if (is_vmalloc_addr(hc2)) - vfree(hc2); - else - kfree(hc2); + kvfree(hc2); return error; } @@ -1355,14 +1343,6 @@ static void *gfs2_alloc_sort_buffer(unsigned size) return ptr; } -static void gfs2_free_sort_buffer(void *ptr) -{ - if (is_vmalloc_addr(ptr)) - vfree(ptr); - else - kfree(ptr); -} - static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh, unsigned leaf_nr, struct gfs2_dirent **darr, unsigned entries) @@ -1493,7 +1473,7 @@ static int 
gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, for(i = 0; i < leaf; i++) if (larr[i]) brelse(larr[i]); - gfs2_free_sort_buffer(larr); + kvfree(larr); out: return error; } @@ -2097,10 +2077,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); out: - if (is_vmalloc_addr(ht)) - vfree(ht); - else - kfree(ht); + kvfree(ht); return error; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f60dc377d78c73..ffbdb85ffe96d3 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1477,13 +1477,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); - if (sdp->sd_quota_bitmap) { - if (is_vmalloc_addr(sdp->sd_quota_bitmap)) - vfree(sdp->sd_quota_bitmap); - else - kfree(sdp->sd_quota_bitmap); - sdp->sd_quota_bitmap = NULL; - } + kvfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; } static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index f6da8b71420bd4..7acf1e2faab0ff 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -69,8 +69,13 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) int ret = NF_ACCEPT; if (ct && !nf_ct_is_untracked(ct)) { - if (!nf_ct_is_confirmed(ct)) + if (!nf_ct_is_confirmed(ct)) { ret = __nf_conntrack_confirm(skb); + + if (ret == NF_ACCEPT) + ct = (struct nf_conn *)skb_nfct(skb); + } + if (likely(ret == NF_ACCEPT)) nf_ct_deliver_cached_events(ct); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b668d353e2133d..bb75e1c58c2695 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3305,16 +3305,16 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) } /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -3326,13 +3326,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; } +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ @@ -3341,6 +3353,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) if (likely(cfs_rq->runtime_remaining > 0)) return; + if (cfs_rq->throttled) + return; /* * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled @@ -3417,13 
+3431,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, dequeue = 1; - bool empty; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -3450,29 +3484,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) rq->nr_running -= task_delta; - cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_rq->throttled_list); - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - - /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. 
- */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -3520,11 +3538,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) +static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { struct cfs_rq *cfs_rq; - u64 runtime; - u64 starting_runtime = remaining; + u64 runtime, remaining = 1; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -3535,10 +3552,16 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) if (!cfs_rq_throttled(cfs_rq)) goto next; + /* By the above check, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + + raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > remaining) - runtime = remaining; - remaining -= runtime; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + remaining = cfs_b->runtime; + raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += runtime; @@ -3553,8 +3576,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) break; } rcu_read_unlock(); - - return starting_runtime - remaining; } /* @@ -3565,7 +3586,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -3594,24 +3614,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->nr_throttled += overrun; /* - * This check is repeated as we are holding onto the new bandwidth while - * we unthrottle. This can potentially race with an unthrottled group - * trying to acquire new bandwidth from the global pool. This can result - * in us over-using our runtime if it is all used during this loop, but - * only by limited amounts in that extreme case. + * This check is repeated as we release cfs_b->lock while we unthrottle. 
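+	 * Runtime now stays in cfs_b->runtime and is pulled from it under
+	 * cfs_b->lock inside distribute_cfs_runtime(), so dropping the
+	 * lock here can no longer over-commit quota the way the old
+	 * grab-it-all-then-return-the-rest scheme could.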
*/ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - runtime = cfs_b->runtime; cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - - cfs_b->runtime -= min(runtime, cfs_b->runtime); } /* @@ -3738,10 +3751,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock(&cfs_b->lock); - cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -3803,8 +3815,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static inline u64 default_cfs_period(void); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 36e06a643c47f3..9ddbe15b7a88e6 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -429,7 +429,10 @@ static int help(struct sk_buff *skb, spin_lock_bh(&nf_ftp_lock); fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer); - BUG_ON(fb_ptr == NULL); + if (!fb_ptr) { + spin_unlock_bh(&nf_ftp_lock); + return NF_ACCEPT; + } ends_in_nl = (fb_ptr[datalen - 1] == '\n'); seq = ntohl(th->seq) + datalen; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 15495b956855d3..8343af172deb4a 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -147,7 +147,8 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, /* Get first TPKT pointer */ tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen, h323_buffer); - BUG_ON(tpkt == NULL); + if (!tpkt) + goto clear_out; /* Validate TPKT identifier */ if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) { diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 0bc277a1440e3b..2068389d8d58c2 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -147,7 +147,10 @@ static int help(struct sk_buff *skb, unsigned int protoff, spin_lock_bh(&irc_buffer_lock); ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff, irc_buffer); - BUG_ON(ib_ptr == NULL); + if (!ib_ptr) { + spin_unlock_bh(&irc_buffer_lock); + return NF_ACCEPT; + } data = ib_ptr; data_limit = ib_ptr + skb->len - dataoff; diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index ce14c8b3290561..2ea86ea567a16b 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -530,7 +530,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, nexthdr_off = protoff; tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph); - BUG_ON(!tcph); + if (!tcph) + return NF_ACCEPT; + nexthdr_off += tcph->doff * 4; datalen = tcplen - tcph->doff * 4; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 60983b01160f13..fcb7b8740c8302 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -63,6 +63,12 @@ static const char *const tcp_conntrack_names[] = { "SYN_SENT2", }; +enum nf_ct_tcp_action { + NFCT_TCP_IGNORE, + 
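+	/* tcp_packet() maps these: IGNORE accepts the skb without a
+	 * state update, INVALID becomes -NF_ACCEPT, ACCEPT proceeds
+	 * with normal state tracking.
+	 */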
NFCT_TCP_INVALID, + NFCT_TCP_ACCEPT, +}; + #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS @@ -394,10 +400,11 @@ static void tcp_options(const struct sk_buff *skb, ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); - BUG_ON(ptr == NULL); + if (!ptr) + return; - state->td_scale = - state->flags = 0; + state->td_scale = 0; + state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; while (length > 0) { int opcode=*ptr++; @@ -410,6 +417,8 @@ static void tcp_options(const struct sk_buff *skb, length--; continue; default: + if (length < 2) + return; opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; @@ -449,7 +458,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); - BUG_ON(ptr == NULL); + if (!ptr) + return; /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED @@ -470,6 +480,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, length--; continue; default: + if (length < 2) + return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; @@ -497,30 +509,85 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -static bool tcp_in_window(const struct nf_conn *ct, - struct ip_ct_tcp *state, - enum ip_conntrack_dir dir, - unsigned int index, - const struct sk_buff *skb, - unsigned int dataoff, - const struct tcphdr *tcph, - u_int8_t pf) +static void tcp_init_sender(struct ip_ct_tcp_state *sender, + struct ip_ct_tcp_state *receiver, + const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + u32 end, u32 win) +{ + /* SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + /* RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && + receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) { + sender->td_scale = 0; + receiver->td_scale = 0; + } +} + +__printf(5, 6) +static noinline_for_stack enum nf_ct_tcp_action +nf_tcp_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const struct ip_ct_tcp_state *sender, + enum nf_ct_tcp_action ret, + const char *fmt, ...) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); + int pf = nf_ct_l3num(ct); + va_list args; + bool be_liberal; + + be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal; + if (be_liberal) + return NFCT_TCP_ACCEPT; + + if (LOG_INVALID(net, IPPROTO_TCP)) { + char scratch[128]; + + va_start(args, fmt); + vsnprintf(scratch, sizeof(scratch), fmt, args); + va_end(args); + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "%s", scratch); + } + + return ret; +} + +static enum nf_ct_tcp_action +tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, + unsigned int index, const struct sk_buff *skb, + unsigned int dataoff, const struct tcphdr *tcph, + u_int8_t pf) +{ + struct ip_ct_tcp *state = &ct->proto.tcp; struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; + bool in_recv_win, seq_ok; s32 receiver_offset; - bool res, in_recv_win; + u16 win_raw; /* * Get the required data from the packet. 
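	 * "end" is the sequence number just past the data this segment
	 * occupies: seq plus the payload length, plus one each for SYN
	 * and FIN (segment_seq_plus_len()).  "sack" starts out equal to
	 * ack and, when the peer negotiated SACK, tcp_sack() advances it
	 * to the right-most SACK edge found in the options.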
*/ seq = ntohl(tcph->seq); ack = sack = ntohl(tcph->ack_seq); - win = ntohs(tcph->window); + win_raw = ntohs(tcph->window); + win = win_raw; end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) @@ -548,27 +615,12 @@ static bool tcp_in_window(const struct nf_conn *ct, * Initialize sender data. */ if (tcph->syn) { - /* - * SYN-ACK in reply to a SYN - * or SYN from reply direction in simultaneous open. - */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); - - tcp_options(skb, dataoff, tcph, sender); - /* - * RFC 1323: - * Both sides must send the Window Scale option - * to enable window scaling in either direction. - */ - if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE - && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) - sender->td_scale = - receiver->td_scale = 0; + tcp_init_sender(sender, receiver, + skb, dataoff, tcph, + end, win); if (!tcph->ack) /* Simultaneous open */ - return true; + return NFCT_TCP_ACCEPT; } else { /* * We are in the middle of a connection, @@ -579,29 +631,39 @@ static bool tcp_in_window(const struct nf_conn *ct, swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; - /* - * We haven't seen traffic in the other direction yet - * but we have to tweak window tracking to pass III - * and IV until that happens. - */ - if (receiver->td_maxwin == 0) + if (receiver->td_maxwin == 0) { + /* We haven't seen traffic in the other + * direction yet but we have to tweak window + * tracking to pass III and IV until that + * happens. + */ receiver->td_end = receiver->td_maxend = sack; + } else if (sack == receiver->td_end + 1) { + /* Likely a reply to a keepalive. + * Needed for III. + */ + receiver->td_end++; + } + } - } else if (((state->state == TCP_CONNTRACK_SYN_SENT - && dir == IP_CT_DIR_ORIGINAL) - || (state->state == TCP_CONNTRACK_SYN_RECV - && dir == IP_CT_DIR_REPLY)) - && after(end, sender->td_end)) { + } else if (tcph->syn && + after(end, sender->td_end) && + (state->state == TCP_CONNTRACK_SYN_SENT || + state->state == TCP_CONNTRACK_SYN_RECV)) { /* * RFC 793: "if a TCP is reinitialized ... then it need * not wait at all; it must only be sure to use sequence * numbers larger than those recently used." + * + * Re-init state for this direction, just like for the first + * syn(-ack) reply, it might differ in seq, ack or tcp options. */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); + tcp_init_sender(sender, receiver, + skb, dataoff, tcph, + end, win); - tcp_options(skb, dataoff, tcph, sender); + if (dir == IP_CT_DIR_REPLY && !tcph->ack) + return NFCT_TCP_ACCEPT; } if (!(tcph->ack)) { @@ -636,6 +698,46 @@ static bool tcp_in_window(const struct nf_conn *ct, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); + seq_ok = before(seq, sender->td_maxend + 1); + if (!seq_ok) { + u32 overshot = end - sender->td_maxend + 1; + bool ack_ok; + + ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); + in_recv_win = receiver->td_maxwin && + after(end, sender->td_end - receiver->td_maxwin - 1); + + if (in_recv_win && + ack_ok && + overshot <= receiver->td_maxwin && + before(sack, receiver->td_end + 1)) { + /* Work around TCPs that send more bytes than allowed by + * the receive window. 
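+	 * Such segments are only tolerated while they still fit the
+	 * receiver's maximal window and carry a sane ACK (the four
+	 * conditions just above).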
+ * + * If the (marked as invalid) packet is allowed to pass by + * the ruleset and the peer acks this data, then its possible + * all future packets will trigger 'ACK is over upper bound' check. + * + * Thus if only the sequence check fails then do update td_end so + * possible ACK for this data can update internal state. + */ + sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + return nf_tcp_log_invalid(skb, ct, sender, NFCT_TCP_IGNORE, + "%u bytes more than expected", overshot); + } + + return nf_tcp_log_invalid(skb, ct, sender, NFCT_TCP_INVALID, + "SEQ is over upper bound %u (over the window of the receiver)", + sender->td_maxend + 1); + } + + if (!before(sack, receiver->td_end + 1)) + return nf_tcp_log_invalid(skb, ct, sender, NFCT_TCP_INVALID, + "ACK is over upper bound %u (ACKed data not seen yet)", + receiver->td_end + 1); + /* Is the ending sequence in the receive window (if available)? */ in_recv_win = !receiver->td_maxwin || after(end, sender->td_end - receiver->td_maxwin - 1); @@ -646,91 +748,127 @@ static bool tcp_in_window(const struct nf_conn *ct, before(sack, receiver->td_end + 1), after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); - if (before(seq, sender->td_maxend + 1) && - in_recv_win && - before(sack, receiver->td_end + 1) && - after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { - /* - * Take into account window scaling (RFC 1323). - */ - if (!tcph->syn) - win <<= sender->td_scale; - - /* - * Update sender data. - */ - swin = win + (sack - ack); - if (sender->td_maxwin < swin) - sender->td_maxwin = swin; - if (after(end, sender->td_end)) { - sender->td_end = end; - sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; - } - if (tcph->ack) { - if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { - sender->td_maxack = ack; - sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; - } else if (after(ack, sender->td_maxack)) - sender->td_maxack = ack; - } - - /* - * Update receiver data. - */ - if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) - receiver->td_maxwin += end - sender->td_maxend; - if (after(sack + win, receiver->td_maxend - 1)) { - receiver->td_maxend = sack + win; - if (win == 0) - receiver->td_maxend++; + if (!in_recv_win) + return nf_tcp_log_invalid(skb, ct, sender, NFCT_TCP_IGNORE, + "SEQ is under lower bound %u (already ACKed data retransmitted)", + sender->td_end - receiver->td_maxwin - 1); + if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) + return nf_tcp_log_invalid(skb, ct, sender, NFCT_TCP_IGNORE, + "ignored ACK under lower bound %u (possible overly delayed)", + receiver->td_end - MAXACKWINDOW(sender) - 1); + + /* Take into account window scaling (RFC 1323). */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* Update sender data. */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) { + sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) { + sender->td_maxack = ack; } - if (ack == receiver->td_end) - receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } - /* - * Check retransmissions. 
- */ - if (index == TCP_ACK_SET) { - if (state->last_dir == dir - && state->last_seq == seq - && state->last_ack == ack - && state->last_end == end - && state->last_win == win) - state->retrans++; - else { - state->last_dir = dir; - state->last_seq = seq; - state->last_ack = ack; - state->last_end = end; - state->last_win = win; - state->retrans = 0; - } + /* Update receiver data. */ + if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + if (ack == receiver->td_end) + receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + /* Check retransmissions. */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir && + state->last_seq == seq && + state->last_ack == ack && + state->last_end == end && + state->last_win == win_raw) { + state->retrans++; + } else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->last_win = win_raw; + state->retrans = 0; } - res = true; - } else { - res = false; - if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - tn->tcp_be_liberal) - res = true; - if (!res && LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: %s ", - before(seq, sender->td_maxend + 1) ? - in_recv_win ? - before(sack, receiver->td_end + 1) ? - after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" - : "ACK is under the lower bound (possible overly delayed ACK)" - : "ACK is over the upper bound (ACKed data not seen yet)" - : "SEQ is under the lower bound (already ACKed data retransmitted)" - : "SEQ is over the upper bound (over the window of the receiver)"); } - pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " + pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u " "receiver end=%u maxend=%u maxwin=%u\n", - res, sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_end, sender->td_maxend, sender->td_maxwin, receiver->td_end, receiver->td_maxend, receiver->td_maxwin); - return res; + return NFCT_TCP_ACCEPT; +} + +static void __cold nf_tcp_handle_invalid(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + enum ip_conntrack_dir dir, + int index, + const struct sk_buff *skb, + const unsigned int *timeouts) + +{ + unsigned int timeout; + u32 expires; + + if (!test_bit(IPS_ASSURED_BIT, &ct->status) || + test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) + return; + + /* We don't want to have connections hanging around in ESTABLISHED + * state for long time 'just because' conntrack deemed a FIN/RST + * out-of-window. + * + * Shrink the timeout just like when there is unacked data. + * This speeds up eviction of 'dead' connections where the + * connection and conntracks internal state are out of sync. 
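+	 * The timeout is only ever lowered, and only once an
+	 * out-of-window fin/rst has been answered by a fin/rst from the
+	 * other direction, so a single stray segment cannot cut a
+	 * healthy connection short.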
+ */ + switch (index) { + case TCP_RST_SET: + case TCP_FIN_SET: + break; + default: + return; + } + + if (ct->proto.tcp.last_dir != dir && + (ct->proto.tcp.last_index == TCP_FIN_SET || + ct->proto.tcp.last_index == TCP_RST_SET)) { + expires = nf_ct_expires(ct); + if (expires < 120 * HZ) + return; + + timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]); + if (expires > timeout) { + struct net *net = nf_ct_net(ct); + int pf = nf_ct_l3num(ct); + + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "packet (index %d, dir %d) response for index %d lower timeout to %u", + index, dir, ct->proto.tcp.last_index, timeout); + + nf_ct_refresh_acct(ct, ctinfo, skb, timeout); + } + } else { + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; + } } /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ @@ -807,6 +945,16 @@ static unsigned int *tcp_get_timeouts(struct net *net) return tcp_pernet(net)->timeouts; } +static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) +{ + state->td_end = 0; + state->td_maxend = 0; + state->td_maxwin = 0; + state->td_maxack = 0; + state->td_scale = 0; + state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; +} + /* Returns verdict for packet, or -1 for invalid. */ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, @@ -820,6 +968,7 @@ static int tcp_packet(struct nf_conn *ct, struct nf_tcp_net *tn = tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; + enum nf_ct_tcp_action res; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; @@ -908,8 +1057,7 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; - memset(&ct->proto.tcp.seen[dir], 0, - sizeof(struct ip_ct_tcp_state)); + nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]); break; } ct->proto.tcp.last_index = index; @@ -949,11 +1097,19 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags |= IP_CT_EXP_CHALLENGE_ACK; } + + /* possible challenge ack reply to syn */ + if (old_state == TCP_CONNTRACK_SYN_SENT && + index == TCP_ACK_SET && + dir == IP_CT_DIR_REPLY) + ct->proto.tcp.last_ack = ntohl(th->ack_seq); + spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid packet ignored in " - "state %s ", tcp_conntrack_names[old_state]); + "packet (index %d) in dir %d ignored, state %s", + index, dir, + tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or @@ -1026,16 +1182,33 @@ static int tcp_packet(struct nf_conn *ct, * segments we ignored. */ goto in_window; } - /* Just fall through */ + + /* Reset in response to a challenge-ack we let through earlier */ + if (index == TCP_RST_SET && + old_state == TCP_CONNTRACK_SYN_SENT && + ct->proto.tcp.last_index == TCP_ACK_SET && + ct->proto.tcp.last_dir == IP_CT_DIR_REPLY && + ntohl(th->seq) == ct->proto.tcp.last_ack) + goto in_window; + + break; default: /* Keep compilers happy. 
*/ break; } - if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, - skb, dataoff, th, pf)) { + res = tcp_in_window(ct, dir, index, + skb, dataoff, th, pf); + switch (res) { + case NFCT_TCP_IGNORE: + spin_unlock_bh(&ct->lock); + return NF_ACCEPT; + case NFCT_TCP_INVALID: + nf_tcp_handle_invalid(ct, ctinfo, dir, index, skb, timeouts); spin_unlock_bh(&ct->lock); return -NF_ACCEPT; + case NFCT_TCP_ACCEPT: + break; } in_window: /* From now on we have got in-window packets */ @@ -1081,6 +1254,16 @@ static int tcp_packet(struct nf_conn *ct, nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } + + if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) { + /* do not renew timeout on SYN retransmit. + * + * Else port reuse by client or NAT middlebox can keep + * entry alive indefinitely (including nat info). + */ + return NF_ACCEPT; + } + /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection * pickup with loose=1. Avoid large ESTABLISHED timeout. */ diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index a843fcd66a0467..8969efc6b54ee3 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -98,7 +98,10 @@ static int help(struct sk_buff *skb, spin_lock_bh(&nf_sane_lock); sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer); - BUG_ON(sb_ptr == NULL); + if (!sb_ptr) { + spin_unlock_bh(&nf_sane_lock); + return NF_ACCEPT; + } if (dir == IP_CT_DIR_ORIGINAL) { if (datalen != sizeof(struct sane_request)) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 68287b05c8472f..8116d4324bacb6 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -598,6 +598,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, if (option_len > sizeof(struct geneve_opt)) data_len = option_len - sizeof(struct geneve_opt); + if (key->enc_opts.len > FLOW_DIS_TUN_OPTS_MAX - 4) + return -ERANGE; + opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len]; memset(opt, 0xff, option_len); opt->length = data_len / 4; diff --git a/scripts/kernel.spec b/scripts/kernel.spec index f5508c5e4659e5..cdebfa2e3b2fc7 100644 --- a/scripts/kernel.spec +++ b/scripts/kernel.spec @@ -20,10 +20,10 @@ Summary: The Linux kernel %global distro_build 1160 %define rpmversion 3.10.0 -%define pkgrelease 1160.95.1.el7 +%define pkgrelease 1160.99.1.el7 # allow pkg_release to have configurable %%{?dist} tag -%define specrelease 1160.95.1%{?dist} +%define specrelease 1160.99.1%{?dist} %define pkg_release %{specrelease}%{?buildid} @@ -1804,6 +1804,40 @@ fi %kernel_variant_files %{with_kdump} kdump %changelog +* Thu Aug 10 2023 Rado Vrbovsky [3.10.0-1160.99.1.el7] +- x86/cpu/amd: Add a Zenbleed fix (Waiman Long) [2226841] {CVE-2023-20593} +- x86/cpu/amd: Move the errata checking functionality up (Waiman Long) [2226841] {CVE-2023-20593} +- x86/cpu: Restore AMD's DE_CFG MSR after resume (Waiman Long) [2226841] {CVE-2023-20593} + +* Thu Aug 03 2023 Rado Vrbovsky [3.10.0-1160.98.1.el7] +- GFS2: gfs2_dir_get_hash_table(): avoiding deferred vfree() is easy here... 
(Andrew Price) [2190450] +- GFS2: use kvfree() instead of open-coding it (Andrew Price) [2190450] + +* Mon Jul 24 2023 Rado Vrbovsky [3.10.0-1160.97.1.el7] +- net/sched: flower: fix possible OOB write in fl_set_geneve_opt() (Davide Caratti) [2216982] {CVE-2023-35788} +- netfilter: conntrack: re-fetch conntrack after insertion (Florian Westphal) [2188190] +- netfilter: conntrack: handle tcp challenge acks during connection reuse (Florian Westphal) [2128262] +- netfilter: conntrack: reduce timeout when receiving out-of-window fin or rst (Florian Westphal) [2128262] +- netfilter: conntrack: remove unneeded indent level (Florian Westphal) [2128262] +- netfilter: conntrack: ignore overly delayed tcp packets (Florian Westphal) [2128262] +- netfilter: conntrack: prepare tcp_in_window for ternary return value (Florian Westphal) [2128262] +- netfilter: conntrack: connection timeout after re-register (Florian Westphal) [2128262] +- netfilter: conntrack: always store window size un-scaled (Florian Westphal) [2128262] +- netfilter: conntrack: work around exceeded receive window (Florian Westphal) [2128262] +- netfilter: conntrack: avoid misleading 'invalid' in log message (Florian Westphal) [2128262] +- netfilter: remove BUG_ON() after skb_header_pointer() (Florian Westphal) [2128262] +- netfilter: nf_conntrack_tcp: re-init for syn packets only (Florian Westphal) [2128262] +- netfilter: nf_conntrack_tcp: preserve liberal flag in tcp options (Florian Westphal) [2128262] +- netfilter: conntrack: re-init state for retransmitted syn-ack (Florian Westphal) [2128262] +- netfilter: conntrack: move synack init code to helper (Florian Westphal) [2128262] +- netfilter: conntrack: do not renew entry stuck in tcp SYN_SENT state (Florian Westphal) [2128262] +- netfilter: nf_conntrack_tcp: Fix stack out of bounds when parsing TCP options (Florian Westphal) [2128262] + +* Fri Jul 07 2023 Jan Stancek [3.10.0-1160.96.1.el7] +- sched/fair: Eliminate bandwidth race between throttling and distribution (Phil Auld) [2180681] +- sched/fair: Fix race between runtime distribution and assignment (Phil Auld) [2180681] +- sched/fair: Don't assign runtime for throttled cfs_rq (Phil Auld) [2180681] + * Fri Jun 23 2023 Rado Vrbovsky [3.10.0-1160.95.1.el7] - perf/s390x: Align the register list to what we support (Michael Petlan) [2207745] - Revert "[tools] s390/perf: add perf register support for floating-point registers" (Michael Petlan) [2207745]
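For reference, below is a minimal userspace sketch of the family/model/stepping matching that cpu_has_amd_erratum() performs against the amd_zenbleed ranges in the patch above. OSVW handling is omitted, and the helper name erratum_applies() and the sample CPU values are illustrative only, not part of the patch:

#include <stdbool.h>
#include <stdio.h>

#define AMD_LEGACY_ERRATUM(...)	{ -1, __VA_ARGS__, 0 }
#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
	((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
#define AMD_MODEL_RANGE_FAMILY(range)	(((range) >> 24) & 0xff)
#define AMD_MODEL_RANGE_START(range)	(((range) >> 12) & 0xfff)
#define AMD_MODEL_RANGE_END(range)	((range) & 0xfff)

static const int amd_zenbleed[] =
	AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf),
			   AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
			   AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));

/* Stand-in for the family/model/stepping leg of cpu_has_amd_erratum():
 * model and stepping are packed into one 12-bit value, so a single
 * range entry covers a whole span of models and steppings.
 */
static bool erratum_applies(const int *erratum, unsigned int family,
			    unsigned int model, unsigned int stepping)
{
	unsigned int ms = (model << 4) | stepping;
	int range;

	erratum++;	/* skip the OSVW id slot (-1 for legacy errata) */
	while ((range = *erratum++))
		if (family == AMD_MODEL_RANGE_FAMILY(range) &&
		    ms >= AMD_MODEL_RANGE_START(range) &&
		    ms <= AMD_MODEL_RANGE_END(range))
			return true;
	return false;
}

int main(void)
{
	/* family 0x17, model 0x71 sits in the 0x60-0x7f range: prints 1 */
	printf("%d\n", erratum_applies(amd_zenbleed, 0x17, 0x71, 0x0));
	/* family 0x19 has no zenbleed entry: prints 0 */
	printf("%d\n", erratum_applies(amd_zenbleed, 0x19, 0x21, 0x0));
	return 0;
}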