From 3662118266e5f4212ee4ec0f6b92ac476c0269e6 Mon Sep 17 00:00:00 2001
From: Dan Aloni
Date: Fri, 5 Jul 2024 17:52:48 +0300
Subject: [PATCH] Changes representative of linux-5.14.0-427.26.1.el9_4.tar.xz

---
 Documentation/admin-guide/cgroup-v2.rst       |  12 +-
 Documentation/atomic_bitops.txt               |   2 +-
 Makefile                                      |   4 +-
 Makefile.rhelver                              |   2 +-
 block/blk-cgroup.c                            |  58 +++---
 drivers/misc/cardreader/rtsx_pcr.c            |  12 +-
 drivers/net/ethernet/amazon/ena/ena_netdev.c  |  14 +-
 .../ethernet/hisilicon/hns/hns_dsaf_misc.c    |   4 +
 .../hisilicon/hns3/hns3vf/hclgevf_mbx.c       |   3 +-
 drivers/net/ethernet/intel/ice/ice_ddp.c      |  23 ++-
 .../marvell/octeontx2/af/rvu_debugfs.c        |   4 +-
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   |   2 +
 drivers/net/ethernet/realtek/r8169_main.c     |   3 +-
 drivers/net/phy/micrel.c                      |   3 +-
 drivers/nvme/host/core.c                      |   6 +-
 drivers/nvme/host/fabrics.h                   |   7 -
 drivers/scsi/sg.c                             |  20 +-
 drivers/tty/vt/vt.c                           |   2 +-
 include/asm-generic/bitops/atomic.h           |   6 -
 include/net/netfilter/nf_tables.h             |  18 +-
 kernel/cgroup/cpuset.c                        | 174 +++++++++++++-----
 lib/test_hmm.c                                |   8 +-
 mm/migrate.c                                  |   8 +
 mm/vmscan.c                                   |   4 +-
 net/can/j1939/j1939-priv.h                    |   2 +-
 net/can/j1939/main.c                          |   2 +-
 net/can/j1939/socket.c                        |  20 +-
 net/ipv4/netfilter/nf_tproxy_ipv4.c           |   2 +
 net/ipv4/tcp_ipv4.c                           |   8 +-
 net/netfilter/nf_tables_api.c                 |   4 +-
 net/netfilter/nft_set_hash.c                  |   8 +-
 net/netfilter/nft_set_pipapo.c                |  25 +--
 net/netfilter/nft_set_rbtree.c                | 148 ++++++++-------
 net/tipc/msg.c                                |   6 +-
 redhat/kernel.changelog-9.4                   |  43 +++++
 redhat/kernel.spec.template                   |  10 +-
 scripts/kernel.spec                           |  61 +++++-
 .../selftests/cgroup/test_cpuset_prs.sh       |  75 +++++---
 38 files changed, 562 insertions(+), 251 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 67f2e632dd4eea..814096fed2a9e9 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2233,8 +2233,12 @@ Cpuset Interface Files
	is always a subset of it.

	Users can manually set it to a value that is different from
-	"cpuset.cpus". The only constraint in setting it is that the
-	list of CPUs must be exclusive with respect to its sibling.
+	"cpuset.cpus". One constraint in setting it is that the list of
+	CPUs must be exclusive with respect to "cpuset.cpus.exclusive"
+	of its sibling. If "cpuset.cpus.exclusive" of a sibling cgroup
+	isn't set, its "cpuset.cpus" value, if set, cannot be a subset
+	of it to leave at least one CPU available when the exclusive
+	CPUs are taken away.

	For a parent cgroup, any one of its exclusive CPUs can only
	be distributed to at most one of its child cgroups. Having an
@@ -2250,8 +2254,8 @@ Cpuset Interface Files
	cpuset-enabled cgroups.

	This file shows the effective set of exclusive CPUs that
-	can be used to create a partition root. The content of this
-	file will always be a subset of "cpuset.cpus" and its parent's
+	can be used to create a partition root. The content
+	of this file will always be a subset of its parent's
	"cpuset.cpus.exclusive.effective" if its parent is not the root
	cgroup. It will also be a subset of "cpuset.cpus.exclusive"
	if it is set. If "cpuset.cpus.exclusive" is not set, it is
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index 093cdaefdb3733..d8b101c97031b0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -59,7 +59,7 @@ Like with atomic_t, the rule of thumb is:

 - RMW operations that have a return value are fully ordered.

 - RMW operations that are conditional are unordered on FAILURE,
-  otherwise the above rules apply. In the case of test_and_{}_bit() operations,
+  otherwise the above rules apply. In the case of test_and_set_bit_lock(),
   if the bit in memory is unchanged by the operation then it is deemed to have
   failed.
diff --git a/Makefile b/Makefile
index f062ef48c753da..bf790b27b9115a 100644
--- a/Makefile
+++ b/Makefile
@@ -1839,8 +1839,10 @@ scripts_build:
 prepare_after_cross:
	# disable STACK_VALIDATION to avoid building objtool
	sed -i '/^CONFIG_STACK_VALIDATION/d' ./include/config/auto.conf || true
-	# build minimum set of scripts to allow building external modules
+	# build minimum set of scripts and resolve_btfids to allow building
+	# external modules
	$(MAKE) KBUILD_EXTMOD="" M="" scripts_build V=1
+	$(MAKE) -C tools/bpf/resolve_btfids

 PHONY += prepare_after_cross scripts_build

diff --git a/Makefile.rhelver b/Makefile.rhelver
index 9ac07a769a0902..fef414422f2147 100644
--- a/Makefile.rhelver
+++ b/Makefile.rhelver
@@ -12,7 +12,7 @@ RHEL_MINOR = 4
 #
 # Use this spot to avoid future merge conflicts.
 # Do not trim this comment.
-RHEL_RELEASE = 427.24.1
+RHEL_RELEASE = 427.26.1
 #
 # ZSTREAM
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4887ac140e4fb1..9cdf53ce8a9251 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -580,12 +580,45 @@ static void blkg_destroy_all(struct gendisk *disk)
	spin_unlock_irq(&q->queue_lock);
 }

+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+	int i;
+
+	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+		dst->bytes[i] = src->bytes[i];
+		dst->ios[i] = src->ios[i];
+	}
+}
+
+static void __blkg_clear_stat(struct blkg_iostat_set *bis)
+{
+	struct blkg_iostat cur = {0};
+	unsigned long flags;
+
+	flags = u64_stats_update_begin_irqsave(&bis->sync);
+	blkg_iostat_set(&bis->cur, &cur);
+	blkg_iostat_set(&bis->last, &cur);
+	u64_stats_update_end_irqrestore(&bis->sync, flags);
+}
+
+static void blkg_clear_stat(struct blkcg_gq *blkg)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);
+
+		__blkg_clear_stat(s);
+	}
+	__blkg_clear_stat(&blkg->iostat);
+}
+
 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
 {
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
-	int i, cpu;
+	int i;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);
@@ -596,18 +629,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
-		for_each_possible_cpu(cpu) {
-			struct blkg_iostat_set *bis =
-				per_cpu_ptr(blkg->iostat_cpu, cpu);
-			memset(bis, 0, sizeof(*bis));
-
-			/* Re-initialize the cleared blkg_iostat_set */
-			u64_stats_init(&bis->sync);
-			bis->blkg = blkg;
-		}
-		memset(&blkg->iostat, 0, sizeof(blkg->iostat));
-		u64_stats_init(&blkg->iostat.sync);
-
+		blkg_clear_stat(blkg);
		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

@@ -910,16 +932,6 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_exit);

-static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
-	int i;
-
-	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
-		dst->bytes[i] = src->bytes[i];
-		dst->ios[i] = src->ios[i];
-	}
-}
-
 static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
 {
	int i;
diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c
index 1a3c39d9445d2e..d62bc99c2d1f2a 100644
--- a/drivers/misc/cardreader/rtsx_pcr.c
+++ b/drivers/misc/cardreader/rtsx_pcr.c
@@ -1002,12 +1002,14 @@ static irqreturn_t rtsx_pci_isr(int irq, void *dev_id)
		} else {
			pcr->card_removed |= SD_EXIST;
			pcr->card_inserted &= ~SD_EXIST;
-			if (PCI_PID(pcr) == PID_5261) {
-				rtsx_pci_write_register(pcr, RTS5261_FW_STATUS,
-					RTS5261_EXPRESS_LINK_FAIL_MASK, 0);
-				pcr->extra_caps |= EXTRA_CAPS_SD_EXPRESS;
-			}
		}
+
+		if ((PCI_PID(pcr) == PID_5261) || (PCI_PID(pcr) == PID_5264)) {
+			rtsx_pci_write_register(pcr, RTS5261_FW_STATUS,
+				RTS5261_EXPRESS_LINK_FAIL_MASK, 0);
+			pcr->extra_caps |= EXTRA_CAPS_SD_EXPRESS;
+		}
+
		pcr->dma_error_count = 0;
	}

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index cbfe7f977270f7..aa27f837e47857 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1204,8 +1204,11 @@ static void ena_unmap_tx_buff(struct ena_ring *tx_ring,
 static void ena_free_tx_bufs(struct ena_ring *tx_ring)
 {
	bool print_once = true;
+	bool is_xdp_ring;
	u32 i;

+	is_xdp_ring = ENA_IS_XDP_INDEX(tx_ring->adapter, tx_ring->qid);
+
	for (i = 0; i < tx_ring->ring_size; i++) {
		struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i];

@@ -1225,10 +1228,15 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring)

		ena_unmap_tx_buff(tx_ring, tx_info);

-		dev_kfree_skb_any(tx_info->skb);
+		if (is_xdp_ring)
+			xdp_return_frame(tx_info->xdpf);
+		else
+			dev_kfree_skb_any(tx_info->skb);
	}
-	netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev,
-						  tx_ring->qid));
+
+	if (!is_xdp_ring)
+		netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev,
+							  tx_ring->qid));
 }

 static void ena_free_all_tx_bufs(struct ena_adapter *adapter)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
index 23d9cbf262c320..740850b64aff50 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
@@ -400,6 +400,10 @@ static void hns_dsaf_ge_srst_by_port(struct dsaf_device *dsaf_dev, u32 port,
		return;

	if (!HNS_DSAF_IS_DEBUG(dsaf_dev)) {
+		/* DSAF_MAX_PORT_NUM is 6, but DSAF_GE_NUM is 8.
+		   We need a check to prevent array overflow */
+		if (port >= DSAF_MAX_PORT_NUM)
+			return;
		reg_val_1  = 0x1 << port;
		port_rst_off = dsaf_dev->mac_cb[port]->port_rst_off;
		/* there is difference between V1 and V2 in register.*/
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c
index b339b9bc0625a2..180f06cd21ea54 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c
@@ -114,7 +114,8 @@ int hclgevf_send_mbx_msg(struct hclgevf_dev *hdev,

	memcpy(&req->msg, send_msg, sizeof(struct hclge_vf_to_pf_msg));

-	trace_hclge_vf_mbx_send(hdev, req);
+	if (test_bit(HCLGEVF_STATE_NIC_REGISTERED, &hdev->state))
+		trace_hclge_vf_mbx_send(hdev, req);

	/* synchronous send */
	if (need_resp) {
diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c
index 50397426d37059..5ec748d2d92f96 100644
--- a/drivers/net/ethernet/intel/ice/ice_ddp.c
+++ b/drivers/net/ethernet/intel/ice/ice_ddp.c
@@ -1329,6 +1329,7 @@ ice_dwnld_cfg_bufs_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 start,

	for (i = 0; i < count; i++) {
		bool last = false;
+		int try_cnt = 0;
		int status;

		bh = (struct ice_buf_hdr *)(bufs + start + i);
@@ -1336,8 +1337,26 @@ ice_dwnld_cfg_bufs_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 start,
		if (indicate_last)
			last = ice_is_last_download_buffer(bh, i, count);

-		status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, last,
-					     &offset, &info, NULL);
+		while (1) {
+			status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE,
+						     last, &offset, &info,
+						     NULL);
+			if (hw->adminq.sq_last_status != ICE_AQ_RC_ENOSEC &&
+			    hw->adminq.sq_last_status != ICE_AQ_RC_EBADSIG)
+				break;
+
+			try_cnt++;
+
+			if (try_cnt == 5)
+				break;
+
+			msleep(20);
+		}
+
+		if (try_cnt)
+			dev_dbg(ice_hw_to_dev(hw),
+				"ice_aq_download_pkg number of retries: %d\n",
+				try_cnt);

		/* Save AQ status from download package */
		if (status) {
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
index 9533b1d929604a..803039bef684d8 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
@@ -999,12 +999,10 @@ static ssize_t rvu_dbg_qsize_write(struct file *filp,
	u16 pcifunc;
	int ret, lf;

-	cmd_buf = memdup_user(buffer, count + 1);
+	cmd_buf = memdup_user_nul(buffer, count);
	if (IS_ERR(cmd_buf))
		return -ENOMEM;

-	cmd_buf[count] = '\0';
-
	cmd_buf_tmp = strchr(cmd_buf, '\n');
	if (cmd_buf_tmp) {
		*cmd_buf_tmp = '\0';
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index f0b506e562df31..1ead69c5f5fa3b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -401,6 +401,8 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
		mlx5e_skb_cb_hwtstamp_init(skb);
		mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb,
					   metadata_index);
+		/* ensure skb is put on metadata_map before tracking the index */
+		wmb();
		mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index);
		if (!netif_tx_queue_stopped(sq->txq) &&
		    mlx5e_ptpsq_metadata_freelist_empty(sq->ptpsq)) {
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 45ecbc150c0b69..d8b97b8d8f4cba 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -4343,11 +4343,11 @@ static void rtl8169_doorbell(struct rtl8169_private *tp)
 static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
 {
-	unsigned int frags = skb_shinfo(skb)->nr_frags;
	struct rtl8169_private *tp = netdev_priv(dev);
	unsigned int entry = tp->cur_tx % NUM_TX_DESC;
	struct TxDesc *txd_first, *txd_last;
	bool stop_queue, door_bell;
+	unsigned int frags;
	u32 opts[2];

	if (unlikely(!rtl_tx_slots_avail(tp))) {
@@ -4370,6 +4370,7 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,

	txd_first = tp->TxDescArray + entry;

+	frags = skb_shinfo(skb)->nr_frags;
	if (frags) {
		if (rtl8169_xmit_frags(tp, skb, opts, entry))
			goto err_dma_1;
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index b6d7981b2d1ee7..2abc3a6f7005b1 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -4577,7 +4577,8 @@ static int lan8841_suspend(struct phy_device *phydev)
	struct kszphy_priv *priv = phydev->priv;
	struct kszphy_ptp_priv *ptp_priv = &priv->ptp_priv;

-	ptp_cancel_worker_sync(ptp_priv->ptp_clock);
+	if (ptp_priv->ptp_clock)
+		ptp_cancel_worker_sync(ptp_priv->ptp_clock);

	return genphy_suspend(phydev);
 }
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ceb3517214b518..f8e803e2502773 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4397,7 +4397,8 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
	set->ops = ops;
	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
	if (ctrl->ops->flags & NVME_F_FABRICS)
-		set->reserved_tags = NVMF_RESERVED_TAGS;
+		/* Reserved for fabric connect and keep alive */
+		set->reserved_tags = 2;
	set->numa_node = ctrl->numa_node;
	set->flags = BLK_MQ_F_NO_SCHED;
	if (ctrl->ops->flags & NVME_F_BLOCKING)
@@ -4466,7 +4467,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
	if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
		set->reserved_tags = NVME_AQ_DEPTH;
	else if (ctrl->ops->flags & NVME_F_FABRICS)
-		set->reserved_tags = NVMF_RESERVED_TAGS;
+		/* Reserved for fabric connect */
+		set->reserved_tags = 1;
	set->numa_node = ctrl->numa_node;
	set->flags = BLK_MQ_F_SHOULD_MERGE;
	if (ctrl->ops->flags & NVME_F_BLOCKING)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 82e7a27ffbde35..80e15ad3936f37 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -18,13 +18,6 @@
 /* default is -1: the fail fast mechanism is disabled */
 #define NVMF_DEF_FAIL_FAST_TMO	-1

-/*
- * Reserved one command for internal usage.  This command is used for sending
- * the connect command, as well as for the keep alive command on the admin
- * queue once live.
- */
-#define NVMF_RESERVED_TAGS	1
-
 /*
  * Define a host as seen by the target.  We allocate one at boot, but also
  * allow the override it when creating controllers.  This is both to provide
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index bb9fc14a04c30b..c4a8d176a1d787 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -285,6 +285,7 @@ sg_open(struct inode *inode, struct file *filp)
	int dev = iminor(inode);
	int flags = filp->f_flags;
	struct request_queue *q;
+	struct scsi_device *device;
	Sg_device *sdp;
	Sg_fd *sfp;
	int retval;
@@ -301,11 +302,12 @@ sg_open(struct inode *inode, struct file *filp)

	/* This driver's module count bumped by fops_get in <linux/fs.h> */
	/* Prevent the device driver from vanishing while we sleep */
-	retval = scsi_device_get(sdp->device);
+	device = sdp->device;
+	retval = scsi_device_get(device);
	if (retval)
		goto sg_put;

-	retval = scsi_autopm_get_device(sdp->device);
+	retval = scsi_autopm_get_device(device);
	if (retval)
		goto sdp_put;

@@ -313,7 +315,7 @@ sg_open(struct inode *inode, struct file *filp)
	 * check if O_NONBLOCK. Permits SCSI commands to be issued
	 * during error recovery. Tread carefully. */
	if (!((flags & O_NONBLOCK) ||
-	      scsi_block_when_processing_errors(sdp->device))) {
+	      scsi_block_when_processing_errors(device))) {
		retval = -ENXIO;
		/* we are in error recovery for this device */
		goto error_out;
@@ -344,7 +346,7 @@ sg_open(struct inode *inode, struct file *filp)

	if (sdp->open_cnt < 1) {  /* no existing opens */
		sdp->sgdebug = 0;
-		q = sdp->device->request_queue;
+		q = device->request_queue;
		sdp->sg_tablesize = queue_max_segments(q);
	}
	sfp = sg_add_sfp(sdp);
@@ -370,10 +372,11 @@ sg_open(struct inode *inode, struct file *filp)
 error_mutex_locked:
	mutex_unlock(&sdp->open_rel_lock);
 error_out:
-	scsi_autopm_put_device(sdp->device);
+	scsi_autopm_put_device(device);
 sdp_put:
-	scsi_device_put(sdp->device);
-	goto sg_put;
+	kref_put(&sdp->d_ref, sg_device_destroy);
+	scsi_device_put(device);
+	return retval;
 }

 /* Release resources associated with a successful sg_open()
@@ -2185,6 +2188,7 @@ sg_remove_sfp_usercontext(struct work_struct *work)
 {
	struct sg_fd *sfp = container_of(work, struct sg_fd, ew.work);
	struct sg_device *sdp = sfp->parentdp;
+	struct scsi_device *device = sdp->device;
	Sg_request *srp;
	unsigned long iflags;

@@ -2210,8 +2214,8 @@ sg_remove_sfp_usercontext(struct work_struct *work)
			"sg_remove_sfp: sfp=0x%p\n", sfp));
	kfree(sfp);

-	scsi_device_put(sdp->device);
	kref_put(&sdp->d_ref, sg_device_destroy);
+	scsi_device_put(device);
	module_put(THIS_MODULE);
 }

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 5d9a264402e92f..e609062e71f907 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -855,7 +855,7 @@ static void delete_char(struct vc_data *vc, unsigned int nr)
	unsigned short *p = (unsigned short *) vc->vc_pos;

	vc_uniscr_delete(vc, nr);
-	scr_memcpyw(p, p + nr, (vc->vc_cols - vc->state.x - nr) * 2);
+	scr_memmovew(p, p + nr, (vc->vc_cols - vc->state.x - nr) * 2);
	scr_memsetw(p + vc->vc_cols - vc->state.x - nr, vc->vc_video_erase_char,
		    nr * 2);
	vc->vc_need_wrap = 0;
diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h
index 3096f086b5a327..71ab4ba9c25d18 100644
--- a/include/asm-generic/bitops/atomic.h
+++ b/include/asm-generic/bitops/atomic.h
@@ -39,9 +39,6 @@ arch_test_and_set_bit(unsigned int nr, volatile unsigned long *p)
	unsigned long mask = BIT_MASK(nr);

	p += BIT_WORD(nr);
-	if (READ_ONCE(*p) & mask)
-		return 1;
-
	old = arch_atomic_long_fetch_or(mask, (atomic_long_t *)p);
	return !!(old & mask);
 }
@@ -53,9 +50,6 @@ arch_test_and_clear_bit(unsigned int nr, volatile unsigned long *p)
	unsigned long mask = BIT_MASK(nr);

	p += BIT_WORD(nr);
-	if (!(READ_ONCE(*p) & mask))
-		return 0;
-
	old = arch_atomic_long_fetch_andnot(mask, (atomic_long_t *)p);
	return !!(old & mask);
 }
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 21e725efe4d0d6..7e5223165e8841 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -441,7 +441,7 @@ struct nft_set_ops {
					    const struct nft_set *set,
					    const struct nft_set_elem *elem,
					    unsigned int flags);
-	void				(*commit)(const struct nft_set *set);
+	void				(*commit)(struct nft_set *set);
	void				(*abort)(const struct nft_set *set);
	u64				(*privsize)(const struct nlattr * const nla[],
						    const struct nft_set_desc *desc);
@@ -768,10 +768,16 @@ static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext)
	return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS);
 }

-static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
+static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext,
+					  u64 tstamp)
 {
	return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) &&
-	       time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext));
+	       time_after_eq64(tstamp, *nft_set_ext_expiration(ext));
+}
+
+static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
+{
+	return __nft_set_elem_expired(ext, get_jiffies_64());
 }

 static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
@@ -1682,6 +1688,7 @@ struct nftables_pernet {
	struct list_head	notify_list;
	struct mutex		commit_mutex;
	u64			table_handle;
+	u64			tstamp;
	unsigned int		base_seq;
	unsigned int		gc_seq;
	u8			validate_state;
@@ -1694,4 +1701,9 @@ static inline struct nftables_pernet *nft_pernet(const struct net *net)
	return net_generic(net, nf_tables_net_id);
 }

+static inline u64 nft_net_tstamp(const struct net *net)
+{
+	return nft_pernet(net)->tstamp;
+}
+
 #endif /* _NET_NF_TABLES_H */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b73c0d48070e24..ab53ae41aec5a1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -87,7 +87,7 @@ static const char * const perr_strings[] = {
	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
-	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+	[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
 };

@@ -127,19 +127,28 @@ struct cpuset {
	/*
	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
	 *
-	 * This exclusive CPUs must be a subset of cpus_allowed. A parent
-	 * cgroup can only grant exclusive CPUs to one of its children.
+	 * The effective_cpus of a valid partition root comes solely from its
+	 * effective_xcpus and some of the effective_xcpus may be distributed
+	 * to sub-partitions below & hence excluded from its effective_cpus.
+	 * For a valid partition root, its effective_cpus have no relationship
+	 * with cpus_allowed unless its exclusive_cpus isn't set.
	 *
-	 * When the cgroup becomes a valid partition root, effective_xcpus
-	 * defaults to cpus_allowed if not set. The effective_cpus of a valid
-	 * partition root comes solely from its effective_xcpus and some of the
-	 * effective_xcpus may be distributed to sub-partitions below & hence
-	 * excluded from its effective_cpus.
+	 * This value will only be set if either exclusive_cpus is set or
+	 * when this cpuset becomes a local partition root.
	 */
	cpumask_var_t effective_xcpus;

	/*
	 * Exclusive CPUs as requested by the user (default hierarchy only)
+	 *
+	 * Its value is independent of cpus_allowed and designates the set of
+	 * CPUs that can be granted to the current cpuset or its children when
+	 * it becomes a valid partition root. The effective set of exclusive
+	 * CPUs granted (effective_xcpus) depends on whether those exclusive
+	 * CPUs are passed down by its ancestors and not yet taken up by
+	 * another sibling partition root along the way.
+	 *
+	 * If its value isn't set, it defaults to cpus_allowed.
	 */
	cpumask_var_t exclusive_cpus;

@@ -169,7 +178,7 @@ struct cpuset {
	/* for custom sched domain */
	int relax_domain_level;

-	/* number of valid sub-partitions */
+	/* number of valid local child partitions */
	int nr_subparts;

	/* partition root state */
@@ -222,6 +231,17 @@ static struct list_head remote_children;
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
+ *
+ * There are 2 types of partitions - local or remote. Local partitions are
+ * those whose parents are partition root themselves. Setting of
+ * cpuset.cpus.exclusive is optional in setting up local partitions.
+ * Remote partitions are those whose parents are not partition roots. Passing
+ * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
+ * nodes is mandatory in creating a remote partition.
+ *
+ * For simplicity, a local partition can be created under a local or remote
+ * partition but a remote partition cannot have any partition root in its
+ * ancestor chain except the cgroup root.
 */
 #define PRS_MEMBER		0
 #define PRS_ROOT		1
@@ -718,6 +738,19 @@ static inline void free_cpuset(struct cpuset *cs)
	kfree(cs);
 }

+/* Return user specified exclusive CPUs */
+static inline struct cpumask *user_xcpus(struct cpuset *cs)
+{
+	return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
+						 : cs->exclusive_cpus;
+}
+
+static inline bool xcpus_empty(struct cpuset *cs)
+{
+	return cpumask_empty(cs->cpus_allowed) &&
+	       cpumask_empty(cs->exclusive_cpus);
+}
+
 static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
 {
	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
@@ -834,17 +867,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
-	 * overlap
+	 * overlap. exclusive_cpus cannot overlap with each other if set.
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
-		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
-		    c != cur) {
+		bool txset, cxset;	/* Are exclusive_cpus set? */
+
+		if (c == cur)
+			continue;
+
+		txset = !cpumask_empty(trial->exclusive_cpus);
+		cxset = !cpumask_empty(c->exclusive_cpus);
+		if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
+		    (txset && cxset)) {
			if (!cpusets_are_exclusive(trial, c))
				goto out;
+		} else if (txset || cxset) {
+			struct cpumask *xcpus, *acpus;
+
+			/*
+			 * When just one of the exclusive_cpus's is set,
+			 * cpus_allowed of the other cpuset, if set, cannot be
+			 * a subset of it or none of those CPUs will be
+			 * available if these exclusive CPUs are activated.
+			 */
+			if (txset) {
+				xcpus = trial->exclusive_cpus;
+				acpus = c->cpus_allowed;
+			} else {
+				xcpus = c->exclusive_cpus;
+				acpus = trial->cpus_allowed;
+			}
+			if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
+				goto out;
		}
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
-		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}
@@ -966,13 +1023,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);
+	bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && !top_cpuset.nr_subparts) {
+	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
@@ -1000,16 +1059,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
+
+		if (cgrpv2)
+			goto v2;
+
		/*
+		 * v1:
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter:  All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
-		 *
-		 * If root is load-balancing, we can skip @cp if it
-		 * is a subset of the root's effective_cpus.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
@@ -1017,20 +1078,39 @@ static int generate_sched_domains(cpumask_var_t **domains,
		      housekeeping_cpumask(HK_TYPE_DOMAIN))))
			continue;

-		if (root_load_balance &&
-		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
-			continue;
-
		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

-		/* skip @cp's subtree if not a partition root */
-		if (!is_partition_valid(cp))
+		/* skip @cp's subtree */
+		pos_css = css_rightmost_descendant(pos_css);
+		continue;
+
+v2:
+		/*
+		 * Only valid partition roots that are not isolated and with
+		 * non-empty effective_cpus will be saved into csn[].
+		 */
+		if ((cp->partition_root_state == PRS_ROOT) &&
+		    !cpumask_empty(cp->effective_cpus))
+			csa[csn++] = cp;
+
+		/*
+		 * Skip @cp's subtree if not a partition root and has no
+		 * exclusive CPUs to be granted to child cpusets.
+		 */
+		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

+	/*
+	 * If there are only isolated partitions underneath the cgroup root,
+	 * we can optimize out unneeded sched domains scanning.
+	 */
+	if (root_load_balance && (csn == 1))
+		goto single_root_domain;
+
	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;
@@ -1073,6 +1153,20 @@ static int generate_sched_domains(cpumask_var_t **domains,
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

+	/*
+	 * Cgroup v2 doesn't support domain attributes, just set all of them
+	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+	 */
+	if (cgrpv2) {
+		for (i = 0; i < ndoms; i++) {
+			cpumask_copy(doms[i], csa[i]->effective_cpus);
+			if (dattr)
+				dattr[i] = SD_ATTR_INIT;
+		}
+		goto done;
+	}
+
	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
@@ -1232,7 +1326,7 @@ static void rebuild_sched_domains_locked(void)
	 * root should be only a subset of the active CPUs.  Since a CPU in any
	 * partition root could be offlined, all must be checked.
	 */
-	if (top_cpuset.nr_subparts) {
+	if (!cpumask_empty(subpartitions_cpus)) {
		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (!is_partition_valid(cs)) {
@@ -1342,7 +1436,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 */
 static int update_partition_exclusive(struct cpuset *cs, int new_prs)
 {
-	bool exclusive = (new_prs > 0);
+	bool exclusive = (new_prs > PRS_MEMBER);

	if (exclusive && !is_cpu_exclusive(cs)) {
		if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
@@ -1536,7 +1630,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
 * Return: true if xcpus is not empty, false otherwise.
 *
 * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ * it must be a subset of parent's effective_xcpus.
 */
 static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
						struct cpumask *xcpus)
@@ -1546,12 +1640,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
	if (!xcpus)
		xcpus = cs->effective_xcpus;

-	if (!cpumask_empty(cs->exclusive_cpus))
-		cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
-	else
-		cpumask_copy(xcpus, cs->cpus_allowed);
-
-	return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+	return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
 }

 static inline bool is_remote_partition(struct cpuset *cs)
@@ -1830,8 +1919,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
	 */
	adding = deleting = false;
	old_prs = new_prs = cs->partition_root_state;
-	xcpus = !cpumask_empty(cs->exclusive_cpus)
-		? cs->effective_xcpus : cs->cpus_allowed;
+	xcpus = user_xcpus(cs);

	if (cmd == partcmd_invalidate) {
		if (is_prs_invalid(old_prs))
@@ -1859,7 +1947,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
		return is_partition_invalid(parent)
		       ? PERR_INVPARENT : PERR_NOTPART;
	}
-	if (!newmask && cpumask_empty(cs->cpus_allowed))
+	if (!newmask && xcpus_empty(cs))
		return PERR_CPUSEMPTY;

	nocpu = tasks_nocpu_error(parent, cs, xcpus);
@@ -2562,7 +2650,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
		update_partition_sd_lb(cs, old_prs);
 out_free:
	free_cpumasks(NULL, &tmp);
-	return 0;
+	return retval;
 }

 /**
@@ -2590,17 +2678,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
		if (retval < 0)
			return retval;
-		if (!is_cpu_exclusive(cs))
-			set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
	}

	/* Nothing to do if the CPUs didn't change */
	if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
		return 0;

-	if (alloc_cpumasks(NULL, &tmp))
-		return -ENOMEM;
-
	if (*buf)
		compute_effective_exclusive_cpumask(trialcs, NULL);

@@ -2615,6 +2698,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
	if (retval)
		return retval;

+	if (alloc_cpumasks(NULL, &tmp))
+		return -ENOMEM;
+
	if (old_prs) {
		if (cpumask_empty(trialcs->effective_xcpus)) {
			invalidate = true;
@@ -3078,9 +3164,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
				       ? partcmd_enable : partcmd_enablei;

		/*
-		 * cpus_allowed cannot be empty.
+		 * cpus_allowed and exclusive_cpus cannot be both empty.
		 */
-		if (cpumask_empty(cs->cpus_allowed)) {
+		if (xcpus_empty(cs)) {
			err = PERR_CPUSEMPTY;
			goto out;
		}
@@ -4590,7 +4676,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
	 * In the rare case that hotplug removes all the cpus in
	 * subpartitions_cpus, we assumed that cpus are updated.
	 */
-	if (!cpus_updated && top_cpuset.nr_subparts)
+	if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
		cpus_updated = true;

	/* For v1, synchronize cpus_allowed to cpu_active_mask */
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 23fff4db0123c7..4cc553d663d736 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1226,8 +1226,8 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
	unsigned long *src_pfns;
	unsigned long *dst_pfns;

-	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
-	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+	src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
+	dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);

	migrate_device_range(src_pfns, start_pfn, npages);
	for (i = 0; i < npages; i++) {
@@ -1250,8 +1250,8 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
	}
	migrate_device_pages(src_pfns, dst_pfns, npages);
	migrate_device_finalize(src_pfns, dst_pfns, npages);
-	kfree(src_pfns);
-	kfree(dst_pfns);
+	kvfree(src_pfns);
+	kvfree(dst_pfns);
 }

 /* Removes free pages from the free list so they can't be re-allocated */
diff --git a/mm/migrate.c b/mm/migrate.c
index 591c0ffb249a9e..112a199798f738 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2149,6 +2149,14 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
			if (managed_zone(pgdat->node_zones + z))
				break;
		}
+
+		/*
+		 * If there are no managed zones, it should not proceed
+		 * further.
+		 */
+		if (z < 0)
+			return 0;
+
		wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
		return 0;
	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7df6383fb7ea3..6549d0bc44ff30 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4460,8 +4460,10 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
		walk_mm(lruvec, mm, walk);
	} while (mm);
 done:
-	if (success)
+	if (success) {
		inc_max_seq(lruvec, can_swap, force_scan);
+		wakeup_flusher_threads(WB_REASON_VMSCAN);
+	}

	return success;
 }
diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h
index 12369b604ce95a..2cb81ad81ecd62 100644
--- a/net/can/j1939/j1939-priv.h
+++ b/net/can/j1939/j1939-priv.h
@@ -83,7 +83,7 @@ struct j1939_priv {
	unsigned int tp_max_packet_size;

	/* lock for j1939_socks list */
-	spinlock_t j1939_socks_lock;
+	rwlock_t j1939_socks_lock;
	struct list_head j1939_socks;

	struct kref rx_kref;
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
index 0e9af9075069d5..1d7e2f554d153f 100644
--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -264,7 +264,7 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev)
		return ERR_PTR(-ENOMEM);

	j1939_tp_init(priv);
-	spin_lock_init(&priv->j1939_socks_lock);
+	rwlock_init(&priv->j1939_socks_lock);
	INIT_LIST_HEAD(&priv->j1939_socks);

	spin_lock(&j1939_netdev_lock);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index f0a16f3f8bb4d4..5699937bd3bde1 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -80,16 +80,16 @@ static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk)
	jsk->state |= J1939_SOCK_BOUND;
	j1939_priv_get(priv);

-	spin_lock_bh(&priv->j1939_socks_lock);
+	write_lock_bh(&priv->j1939_socks_lock);
	list_add_tail(&jsk->list, &priv->j1939_socks);
-	spin_unlock_bh(&priv->j1939_socks_lock);
+	write_unlock_bh(&priv->j1939_socks_lock);
 }

 static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk)
 {
-	spin_lock_bh(&priv->j1939_socks_lock);
+	write_lock_bh(&priv->j1939_socks_lock);
	list_del_init(&jsk->list);
-	spin_unlock_bh(&priv->j1939_socks_lock);
+	write_unlock_bh(&priv->j1939_socks_lock);

	j1939_priv_put(priv);
	jsk->state &= ~J1939_SOCK_BOUND;
@@ -326,13 +326,13 @@ bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb)
	struct j1939_sock *jsk;
	bool match = false;

-	spin_lock_bh(&priv->j1939_socks_lock);
+	read_lock_bh(&priv->j1939_socks_lock);
	list_for_each_entry(jsk, &priv->j1939_socks, list) {
		match = j1939_sk_recv_match_one(jsk, skcb);
		if (match)
			break;
	}
-	spin_unlock_bh(&priv->j1939_socks_lock);
+	read_unlock_bh(&priv->j1939_socks_lock);

	return match;
 }
@@ -341,11 +341,11 @@ void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb)
 {
	struct j1939_sock *jsk;

-	spin_lock_bh(&priv->j1939_socks_lock);
+	read_lock_bh(&priv->j1939_socks_lock);
	list_for_each_entry(jsk, &priv->j1939_socks, list) {
		j1939_sk_recv_one(jsk, skb);
	}
-	spin_unlock_bh(&priv->j1939_socks_lock);
+	read_unlock_bh(&priv->j1939_socks_lock);
 }

 static void j1939_sk_sock_destruct(struct sock *sk)
@@ -1188,7 +1188,7 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)
	struct j1939_sock *jsk;
	int error_code = ENETDOWN;

-	spin_lock_bh(&priv->j1939_socks_lock);
+	read_lock_bh(&priv->j1939_socks_lock);
	list_for_each_entry(jsk, &priv->j1939_socks, list) {
		jsk->sk.sk_err = error_code;
		if (!sock_flag(&jsk->sk, SOCK_DEAD))
@@ -1196,7 +1196,7 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)

		j1939_sk_queue_drop_all(priv, jsk, error_code);
	}
-	spin_unlock_bh(&priv->j1939_socks_lock);
+	read_unlock_bh(&priv->j1939_socks_lock);
 }

 static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd,
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 69e33179960430..73e66a088e25eb 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -58,6 +58,8 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)

	laddr = 0;
	indev = __in_dev_get_rcu(skb->dev);
+	if (!indev)
+		return daddr;

	in_dev_for_each_ifa_rcu(ifa, indev) {
		if (ifa->ifa_flags & IFA_F_SECONDARY)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 74996ae98a87cb..740d2bf959a45c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -151,6 +151,12 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
+		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
+		 * and releasing the bucket lock.
+		 */
+		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
+			return 0;
+
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
@@ -171,7 +177,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
-		sock_hold(sktw);
+
		return 1;
	}

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 482d3a4a8c7902..50263a8a6a3d88 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -9429,6 +9429,7 @@ struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
 struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc)
 {
	struct nft_set_elem_catchall *catchall, *next;
+	u64 tstamp = nft_net_tstamp(gc->net);
	const struct nft_set *set = gc->set;
	struct nft_elem_priv *elem_priv;
	struct nft_set_elem elem;
@@ -9439,7 +9440,7 @@ struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc)
	list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
		ext = nft_set_elem_ext(set, catchall->elem);

-		if (!nft_set_elem_expired(ext))
+		if (!__nft_set_elem_expired(ext, tstamp))
			continue;

		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
@@ -10200,6 +10201,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid)
	bool genid_ok;

	mutex_lock(&nft_net->commit_mutex);
+	nft_net->tstamp = get_jiffies_64();

	genid_ok = genid == 0 || nft_net->base_seq == genid;
	if (!genid_ok)
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 0691565caa81b4..88cabf2a0d68c1 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -36,6 +36,7 @@ struct nft_rhash_cmp_arg {
	const struct nft_set		*set;
	const u32			*key;
	u8				genmask;
+	u64				tstamp;
 };

 static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed)
@@ -62,7 +63,7 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg,
		return 1;
	if (nft_set_elem_is_dead(&he->ext))
		return 1;
-	if (nft_set_elem_expired(&he->ext))
+	if (__nft_set_elem_expired(&he->ext, x->tstamp))
		return 1;
	if (!nft_set_elem_active(&he->ext, x->genmask))
		return 1;
@@ -87,6 +88,7 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
		.genmask = nft_genmask_cur(net),
		.set	 = set,
		.key	 = key,
+		.tstamp	 = get_jiffies_64(),
	};

	he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
@@ -106,6 +108,7 @@ nft_rhash_get(const struct net *net, const struct nft_set *set,
		.genmask = nft_genmask_cur(net),
		.set	 = set,
		.key	 = elem->key.val.data,
+		.tstamp	 = get_jiffies_64(),
	};

	he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
@@ -131,6 +134,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
		.genmask = NFT_GENMASK_ANY,
		.set	 = set,
		.key	 = key,
+		.tstamp	 = get_jiffies_64(),
	};

	he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
@@ -175,6 +179,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
		.genmask = nft_genmask_next(net),
		.set	 = set,
		.key	 = elem->key.val.data,
+		.tstamp	 = nft_net_tstamp(net),
	};
	struct nft_rhash_elem *prev;

@@ -216,6 +221,7 @@ nft_rhash_deactivate(const struct net *net, const struct nft_set *set,
		.genmask = nft_genmask_next(net),
		.set	 = set,
		.key	 = elem->key.val.data,
+		.tstamp	 = nft_net_tstamp(net),
	};

	rcu_read_lock();
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 2f3efeb3793b7e..7bd60cab6d8852 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -504,6 +504,7 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
 * @set:	nftables API set representation
 * @data:	Key data to be matched against existing elements
 * @genmask:	If set, check that element is active in given genmask
+ * @tstamp:	timestamp to check for expired elements
 *
 * This is essentially the same as the lookup function, except that it matches
 * key data against the uncommitted copy and doesn't use preallocated maps for
@@ -513,7 +514,8 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
 */
 static struct nft_pipapo_elem *pipapo_get(const struct net *net,
					  const struct nft_set *set,
-					  const u8 *data, u8 genmask)
+					  const u8 *data, u8 genmask,
+					  u64 tstamp)
 {
	struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT);
	struct nft_pipapo *priv = nft_set_priv(set);
@@ -566,7 +568,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
			goto out;

		if (last) {
-			if (nft_set_elem_expired(&f->mt[b].e->ext))
+			if (__nft_set_elem_expired(&f->mt[b].e->ext, tstamp))
				goto next_match;
			if ((genmask &&
			     !nft_set_elem_active(&f->mt[b].e->ext, genmask)))
@@ -606,7 +608,7 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
	static struct nft_pipapo_elem *e;

	e = pipapo_get(net, set, (const u8 *)elem->key.val.data,
-		       nft_genmask_cur(net));
+		       nft_genmask_cur(net), get_jiffies_64());
	if (IS_ERR(e))
		return ERR_CAST(e);

@@ -1173,6 +1175,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
	struct nft_pipapo_match *m = priv->clone;
	u8 genmask = nft_genmask_next(net);
	struct nft_pipapo_elem *e, *dup;
+	u64 tstamp = nft_net_tstamp(net);
	struct nft_pipapo_field *f;
	const u8 *start_p, *end_p;
	int i, bsize_max, err = 0;
@@ -1182,7 +1185,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
	else
		end = start;

-	dup = pipapo_get(net, set, start, genmask);
+	dup = pipapo_get(net, set, start, genmask, tstamp);
	if (!IS_ERR(dup)) {
		/* Check if we already have the same exact entry */
		const struct nft_data *dup_key, *dup_end;
@@ -1204,7 +1207,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,

	if (PTR_ERR(dup) == -ENOENT) {
		/* Look for partially overlapping entries */
-		dup = pipapo_get(net, set, end, nft_genmask_next(net));
+		dup = pipapo_get(net, set, end, nft_genmask_next(net), tstamp);
	}

	if (PTR_ERR(dup) != -ENOENT) {
@@ -1558,14 +1561,14 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,

 /**
 * pipapo_gc() - Drop expired entries from set, destroy start and end elements
- * @_set:	nftables API set representation
+ * @set:	nftables API set representation
 * @m:		Matching data
 */
-static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m)
+static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
 {
-	struct nft_set *set = (struct nft_set *) _set;
	struct nft_pipapo *priv = nft_set_priv(set);
	struct net *net = read_pnet(&set->net);
+	u64 tstamp = nft_net_tstamp(net);
	int rules_f0, first_rule = 0;
	struct nft_pipapo_elem *e;
	struct nft_trans_gc *gc;
@@ -1600,7 +1603,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m)
		/* synchronous gc never fails, there is no need to set on
		 * NFT_SET_ELEM_DEAD_BIT.
		 */
-		if (nft_set_elem_expired(&e->ext)) {
+		if (__nft_set_elem_expired(&e->ext, tstamp)) {
			priv->dirty = true;

			gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
@@ -1681,7 +1684,7 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
 * We also need to create a new working copy for subsequent insertions and
 * deletions.
 */
-static void nft_pipapo_commit(const struct nft_set *set)
+static void nft_pipapo_commit(struct nft_set *set)
 {
	struct nft_pipapo *priv = nft_set_priv(set);
	struct nft_pipapo_match *new_clone, *old;
@@ -1775,7 +1778,7 @@ static void *pipapo_deactivate(const struct net *net, const struct nft_set *set,
 {
	struct nft_pipapo_elem *e;

-	e = pipapo_get(net, set, data, nft_genmask_next(net));
+	e = pipapo_get(net, set, data, nft_genmask_next(net), nft_net_tstamp(net));
	if (IS_ERR(e))
		return NULL;

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 291ce919dcce79..577d99ffa0b09d 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -19,7 +19,7 @@ struct nft_rbtree {
	struct rb_root		root;
	rwlock_t		lock;
	seqcount_rwlock_t	count;
-	struct delayed_work	gc_work;
+	unsigned long		last_gc;
 };

 struct nft_rbtree_elem {
@@ -49,8 +49,7 @@ static int nft_rbtree_cmp(const struct nft_set *set,

 static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
 {
-	return nft_set_elem_expired(&rbe->ext) ||
-	       nft_set_elem_is_dead(&rbe->ext);
+	return nft_set_elem_expired(&rbe->ext);
 }

 static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
@@ -224,14 +223,15 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set,
	return &rbe->priv;
 }

-static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
-				 struct nft_rbtree *priv,
-				 struct nft_rbtree_elem *rbe)
+static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
+				      struct nft_rbtree *priv,
+				      struct nft_rbtree_elem *rbe)
 {
	struct nft_set_elem elem = {
		.priv	= &rbe->priv,
	};

+	lockdep_assert_held_write(&priv->lock);
	nft_setelem_data_deactivate(net, set, &elem);
	rb_erase(&rbe->node, &priv->root);
 }
@@ -266,7 +266,7 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
	rbe_prev = NULL;
	if (prev) {
		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
-		nft_rbtree_gc_remove(net, set, priv, rbe_prev);
+		nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);

		/* There is always room in this trans gc for this element,
		 * memory allocation never actually happens, hence, the warning
@@ -280,7 +280,7 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
		nft_trans_gc_elem_add(gc, rbe_prev);
	}

-	nft_rbtree_gc_remove(net, set, priv, rbe);
+	nft_rbtree_gc_elem_remove(net, set, priv, rbe);
	gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
	if (WARN_ON_ONCE(!gc))
		return ERR_PTR(-ENOMEM);
@@ -317,6 +317,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 cur_genmask = nft_genmask_cur(net);
	u8 genmask = nft_genmask_next(net);
+	u64 tstamp = nft_net_tstamp(net);
	int d;

	/* Descend the tree to search for an existing element greater than the
@@ -364,7 +365,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
		/* perform garbage collection to avoid bogus overlap reports
		 * but skip new elements in this transaction.
		 */
-		if (nft_set_elem_expired(&rbe->ext) &&
+		if (__nft_set_elem_expired(&rbe->ext, tstamp) &&
		    nft_set_elem_active(&rbe->ext, cur_genmask)) {
			const struct nft_rbtree_elem *removed_end;

@@ -510,6 +511,15 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
	return err;
 }

+static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe)
+{
+	write_lock_bh(&priv->lock);
+	write_seqcount_begin(&priv->count);
+	rb_erase(&rbe->node, &priv->root);
+	write_seqcount_end(&priv->count);
+	write_unlock_bh(&priv->lock);
+}
+
 static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      const struct nft_set_elem *elem)
@@ -517,11 +527,7 @@ static void nft_rbtree_remove(const struct net *net,
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
	struct nft_rbtree *priv = nft_set_priv(set);

-	write_lock_bh(&priv->lock);
-	write_seqcount_begin(&priv->count);
-	rb_erase(&rbe->node, &priv->root);
-	write_seqcount_end(&priv->count);
-	write_unlock_bh(&priv->lock);
+	nft_rbtree_erase(priv, rbe);
 }

 static void nft_rbtree_activate(const struct net *net,
@@ -550,6 +556,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	u8 genmask = nft_genmask_next(net);
+	u64 tstamp = nft_net_tstamp(net);
	int d;

	while (parent != NULL) {
@@ -570,7 +577,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
-			} else if (nft_set_elem_expired(&rbe->ext)) {
+			} else if (__nft_set_elem_expired(&rbe->ext, tstamp)) {
				break;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
@@ -614,45 +621,39 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
	read_unlock_bh(&priv->lock);
 }

-static void nft_rbtree_gc(struct work_struct *work)
+static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
+				 struct nft_rbtree *priv,
+				 struct nft_rbtree_elem *rbe)
 {
+	struct nft_set_elem elem = {
+		.priv	= &rbe->priv,
+	};
+
+	nft_setelem_data_deactivate(net, set, &elem);
+	nft_rbtree_erase(priv, rbe);
+}
+
+static void nft_rbtree_gc(struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe, *rbe_end = NULL;
-	struct nftables_pernet *nft_net;
-	struct nft_rbtree *priv;
+	struct net *net = read_pnet(&set->net);
+	u64 tstamp = nft_net_tstamp(net);
+	struct rb_node *node, *next;
	struct nft_trans_gc *gc;
-	struct rb_node *node;
-	struct nft_set *set;
-	unsigned int gc_seq;
-	struct net *net;

-	priv = container_of(work, struct nft_rbtree, gc_work.work);
-	set  = nft_set_container_of(priv);
-	net  = read_pnet(&set->net);
-	nft_net = nft_pernet(net);
-	gc_seq  = READ_ONCE(nft_net->gc_seq);
-
-	if (nft_set_gc_is_pending(set))
-		goto done;
-
-	gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
	if (!gc)
-		goto done;
-
-	read_lock_bh(&priv->lock);
-	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+		return;

-		/* Ruleset has been updated, try later. */
-		if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
-			nft_trans_gc_destroy(gc);
-			gc = NULL;
-			goto try_later;
-		}
+	for (node = rb_first(&priv->root); node ; node = next) {
+		next = rb_next(node);

		rbe = rb_entry(node, struct nft_rbtree_elem, node);
-		if (nft_set_elem_is_dead(&rbe->ext))
-			goto dead_elem;

		/* elements are reversed in the rbtree for historical reasons,
		 * from highest to lowest value, that is why end element is
		 * always visited before the start element.
@@ -661,40 +662,37 @@ static void nft_rbtree_gc(struct work_struct *work)
			rbe_end = rbe;
			continue;
		}
-		if (!nft_set_elem_expired(&rbe->ext))
+		if (!__nft_set_elem_expired(&rbe->ext, tstamp))
			continue;

-		nft_set_elem_dead(&rbe->ext);
-
-		if (!rbe_end)
-			continue;
-
-		nft_set_elem_dead(&rbe_end->ext);
-
-		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
		if (!gc)
			goto try_later;

-		nft_trans_gc_elem_add(gc, rbe_end);
-		rbe_end = NULL;
-dead_elem:
-		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+		/* end element needs to be removed first, it has
+		 * no timeout extension.
+		 */
+		if (rbe_end) {
+			nft_rbtree_gc_remove(net, set, priv, rbe_end);
+			nft_trans_gc_elem_add(gc, rbe_end);
+			rbe_end = NULL;
+		}
+
+		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
		if (!gc)
			goto try_later;

+		nft_rbtree_gc_remove(net, set, priv, rbe);
		nft_trans_gc_elem_add(gc, rbe);
	}

-	gc = nft_trans_gc_catchall_async(gc, gc_seq);
-
 try_later:
-	read_unlock_bh(&priv->lock);
-
-	if (gc)
-		nft_trans_gc_queue_async_done(gc);
-done:
-	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
-			   nft_set_gc_interval(set));
+	if (gc) {
+		gc = nft_trans_gc_catchall_sync(gc);
+		nft_trans_gc_queue_sync_done(gc);
+		priv->last_gc = jiffies;
+	}
 }

 static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
@@ -715,11 +713,6 @@ static int nft_rbtree_init(const struct nft_set *set,
	seqcount_rwlock_init(&priv->count, &priv->lock);
	priv->root = RB_ROOT;

-	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
-	if (set->flags & NFT_SET_TIMEOUT)
-		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
-				   nft_set_gc_interval(set));
-
	return 0;
 }

@@ -730,8 +723,6 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx,
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

-	cancel_delayed_work_sync(&priv->gc_work);
-	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -757,6 +748,21 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
	return true;
 }

+static void nft_rbtree_commit(struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+
+	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+		nft_rbtree_gc(set);
+}
+
+static void nft_rbtree_gc_init(const struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+
+	priv->last_gc = jiffies;
+}
+
 const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
@@ -770,6 +776,8 @@ const struct nft_set_type nft_set_rbtree_type = {
		.deactivate	= nft_rbtree_deactivate,
nft_rbtree_flush, .activate = nft_rbtree_activate, + .commit = nft_rbtree_commit, + .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 5c9fd4791c4ba1..9a6e9bcbf69402 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -156,6 +156,11 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (!head) goto err; + /* Either the input skb ownership is transferred to headskb + * or the input skb is freed, clear the reference to avoid + * bad access on error path. + */ + *buf = NULL; if (skb_try_coalesce(head, frag, &headstolen, &delta)) { kfree_skb_partial(frag, headstolen); } else { @@ -179,7 +184,6 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) *headbuf = NULL; return 1; } - *buf = NULL; return 0; err: kfree_skb(*buf); diff --git a/redhat/kernel.changelog-9.4 b/redhat/kernel.changelog-9.4 index 04dbe25828f21b..dd5798d49ba080 100644 --- a/redhat/kernel.changelog-9.4 +++ b/redhat/kernel.changelog-9.4 @@ -1,3 +1,46 @@ +* Fri Jul 05 2024 Scott Weaver [5.14.0-427.26.1.el9_4] +- net: ena: Fix incorrect descriptor free behavior (Kamal Heib) [RHEL-39217 RHEL-37430] {CVE-2024-35958} +- tcp: Use refcount_inc_not_zero() in tcp_twsk_unique(). (Guillaume Nault) [RHEL-41749 RHEL-39837] {CVE-2024-36904} +- mm/mglru: Revert "don't sync disk for each aging cycle" (Waiman Long) [RHEL-44418] +- tipc: fix UAF in error path (Xin Long) [RHEL-34848 RHEL-34280] {CVE-2024-36886} +- selftest/cgroup: Update test_cpuset_prs.sh to match changes (Waiman Long) [RHEL-45139] +- cgroup/cpuset: Make cpuset.cpus.exclusive independent of cpuset.cpus (Waiman Long) [RHEL-45139] +- cgroup/cpuset: Delay setting of CS_CPU_EXCLUSIVE until valid partition (Waiman Long) [RHEL-45139] +- selftest/cgroup: Fix test_cpuset_prs.sh problems reported by test robot (Waiman Long) [RHEL-45139] +- cgroup/cpuset: Fix remote root partition creation problem (Waiman Long) [RHEL-45139] +- cgroup/cpuset: Optimize isolated partition only generate_sched_domains() calls (Waiman Long) [RHEL-45139] +- cgroup/cpuset: Fix retval in update_cpumask() (Waiman Long) [RHEL-45139] +- cgroup/cpuset: Fix a memory leak in update_exclusive_cpumask() (Waiman Long) [RHEL-45139] +- ice: implement AQ download pkg retry (Petr Oros) [RHEL-38907 RHEL-17318] +- redhat: include resolve_btfids in kernel-devel (Viktor Malik) [RHEL-43426 RHEL-40707] +- blk-cgroup: fix list corruption from resetting io stat (cki-backport-bot) [RHEL-44977] {CVE-2024-38663} +- misc: rtsx: do clear express reg every SD_INT (David Arcari) [RHEL-39985 RHEL-33706] +- misc: rtsx: Fix rts5264 driver status incorrect when card removed (David Arcari) [RHEL-39985 RHEL-33706] +- netfilter: tproxy: bail out if IP has been disabled on the device (cki-backport-bot) [RHEL-44371] {CVE-2024-36270} +- lib/test_hmm.c: handle src_pfns and dst_pfns allocation failure (cki-backport-bot) [RHEL-44263 RHEL-44261] {CVE-2024-38543} +- r8169: Fix possible ring buffer corruption on fragmented Tx packets. 
diff --git a/redhat/kernel.changelog-9.4 b/redhat/kernel.changelog-9.4
index 04dbe25828f21b..dd5798d49ba080 100644
--- a/redhat/kernel.changelog-9.4
+++ b/redhat/kernel.changelog-9.4
@@ -1,3 +1,46 @@
+* Fri Jul 05 2024 Scott Weaver [5.14.0-427.26.1.el9_4]
+- net: ena: Fix incorrect descriptor free behavior (Kamal Heib) [RHEL-39217 RHEL-37430] {CVE-2024-35958}
+- tcp: Use refcount_inc_not_zero() in tcp_twsk_unique(). (Guillaume Nault) [RHEL-41749 RHEL-39837] {CVE-2024-36904}
+- mm/mglru: Revert "don't sync disk for each aging cycle" (Waiman Long) [RHEL-44418]
+- tipc: fix UAF in error path (Xin Long) [RHEL-34848 RHEL-34280] {CVE-2024-36886}
+- selftest/cgroup: Update test_cpuset_prs.sh to match changes (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Make cpuset.cpus.exclusive independent of cpuset.cpus (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Delay setting of CS_CPU_EXCLUSIVE until valid partition (Waiman Long) [RHEL-45139]
+- selftest/cgroup: Fix test_cpuset_prs.sh problems reported by test robot (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Fix remote root partition creation problem (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Optimize isolated partition only generate_sched_domains() calls (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Fix retval in update_cpumask() (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Fix a memory leak in update_exclusive_cpumask() (Waiman Long) [RHEL-45139]
+- ice: implement AQ download pkg retry (Petr Oros) [RHEL-38907 RHEL-17318]
+- redhat: include resolve_btfids in kernel-devel (Viktor Malik) [RHEL-43426 RHEL-40707]
+- blk-cgroup: fix list corruption from resetting io stat (cki-backport-bot) [RHEL-44977] {CVE-2024-38663}
+- misc: rtsx: do clear express reg every SD_INT (David Arcari) [RHEL-39985 RHEL-33706]
+- misc: rtsx: Fix rts5264 driver status incorrect when card removed (David Arcari) [RHEL-39985 RHEL-33706]
+- netfilter: tproxy: bail out if IP has been disabled on the device (cki-backport-bot) [RHEL-44371] {CVE-2024-36270}
+- lib/test_hmm.c: handle src_pfns and dst_pfns allocation failure (cki-backport-bot) [RHEL-44263 RHEL-44261] {CVE-2024-38543}
+- r8169: Fix possible ring buffer corruption on fragmented Tx packets. (cki-backport-bot) [RHEL-44039] {CVE-2024-38586}
+- net: micrel: Fix receiving the timestamp in the frame for lan8841 (cki-backport-bot) [RHEL-43996] {CVE-2024-38593}
+- vt: fix memory overlapping when deleting chars in the buffer (Waiman Long) [RHEL-43379 RHEL-27780] {CVE-2022-48627}
+- net/mlx5e: Use a memory barrier to enforce PTP WQ xmit submission tracking occurs after populating the metadata_map (Kamal Heib) [RHEL-42728 RHEL-34192] {CVE-2024-26858}
+- locking/atomic: Make test_and_*_bit() ordered on failure (Paolo Bonzini) [RHEL-45896]
+- mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index (Rafael Aquini) [RHEL-42659 RHEL-31840] {CVE-2024-26783}
+- can: j1939: prevent deadlock by changing j1939_socks_lock to rwlock (Jose Ignacio Tornos Martinez) [RHEL-42379 RHEL-31530] {CVE-2023-52638}
+- ethernet: hisilicon: hns: hns_dsaf_misc: fix a possible array overflow in hns_dsaf_ge_srst_by_port() (Ken Cox) [RHEL-42226 RHEL-38715] {CVE-2021-47548}
+Resolves: RHEL-34848, RHEL-38907, RHEL-39217, RHEL-39985, RHEL-41749, RHEL-42226, RHEL-42379, RHEL-42659, RHEL-42728, RHEL-43379, RHEL-43426, RHEL-43996, RHEL-44039, RHEL-44263, RHEL-44371, RHEL-44418, RHEL-44977, RHEL-45139, RHEL-45896, RHEL-43381, RHEL-42730, RHEL-44264, RHEL-44978, RHEL-43997, RHEL-42381, RHEL-42228, RHEL-41751, RHEL-44040, RHEL-39219, RHEL-42661, RHEL-34852, RHEL-44372
+
+* Mon Jul 01 2024 Scott Weaver [5.14.0-427.25.1.el9_4]
+- nvme: fix reconnection fail due to reserved tag allocation (Maurizio Lombardi) [RHEL-42896 RHEL-36896] {CVE-2024-27435}
+- net: hns3: fix use-after-free bug in hclgevf_send_mbx_msg (cki-backport-bot) [RHEL-43625] {CVE-2021-47596}
+- scsi: sg: Avoid race in error handling & drop bogus warn (Ewan D. Milne) [RHEL-36106 RHEL-35659]
+- scsi: sg: Avoid sg device teardown race (Ewan D. Milne) [RHEL-36106 RHEL-35659]
+- netfilter: nf_tables: use timestamp to check for set element timeout (Florian Westphal) [RHEL-38032 RHEL-33985] {CVE-2024-27397}
+- netfilter: nft_set_rbtree: Remove unused variable nft_net (Florian Westphal) [RHEL-38032 RHEL-33985]
+- netfilter: nft_set_rbtree: prefer sync gc to async worker (Florian Westphal) [RHEL-38032 RHEL-33985]
+- netfilter: nft_set_rbtree: rename gc deactivate+erase function (Florian Westphal) [RHEL-38032 RHEL-33985]
+- netfilter: nf_tables: de-constify set commit ops function argument (Florian Westphal) [RHEL-38032 RHEL-33985]
+- octeontx2-af: avoid off-by-one read from userspace (Kamal Heib) [RHEL-40486 RHEL-39873] {CVE-2024-36957}
+Resolves: RHEL-36106, RHEL-38032, RHEL-40486, RHEL-42896, RHEL-43625, RHEL-38050, RHEL-40488, RHEL-42898, RHEL-43626
+
 * Sun Jun 23 2024 Scott Weaver [5.14.0-427.24.1.el9_4]
 - net/bnx2x: Prevent access to a freed page in page_pool (Michal Schmidt) [RHEL-43272 RHEL-23117]
 - bnx2x: new flag for track HW resource allocation (Michal Schmidt) [RHEL-43272 RHEL-23117]
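
One entry above, the tcp_twsk_unique() fix, depends on refcount_inc_not_zero(): a lookup may take a new reference only while the count is still non-zero, so it can never resurrect a timewait socket that is already being torn down. A standalone sketch of that pattern with C11 atomics follows; it illustrates the idea only and is not the kernel's refcount_t implementation:

    /* Sketch of the "increment only if not already zero" pattern. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool refcount_inc_not_zero(atomic_int *ref)
    {
        int old = atomic_load(ref);

        /* CAS loop: increment only if the count has not dropped to 0 */
        while (old != 0) {
            if (atomic_compare_exchange_weak(ref, &old, old + 1))
                return true; /* reference taken */
        }
        return false; /* object is dying; caller must look elsewhere */
    }

    int main(void)
    {
        atomic_int live = 2;  /* object still referenced */
        atomic_int dying = 0; /* refcount already hit zero */

        printf("live:  %d\n", refcount_inc_not_zero(&live));  /* 1 */
        printf("dying: %d\n", refcount_inc_not_zero(&dying)); /* 0 */
        return 0;
    }
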
diff --git a/redhat/kernel.spec.template b/redhat/kernel.spec.template
index c6a4c526c65300..39df1f79b13c51 100644
--- a/redhat/kernel.spec.template
+++ b/redhat/kernel.spec.template
@@ -2197,12 +2197,12 @@ BuildKernel() {
     cp --parents tools/build/Build $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp --parents tools/build/fixdep.c $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp --parents tools/objtool/sync-check.sh $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
-    cp -a --parents tools/bpf/resolve_btfids/main.c $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
-    cp -a --parents tools/bpf/resolve_btfids/Build $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
+    cp -a --parents tools/bpf/resolve_btfids $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp --parents security/selinux/include/policycap_names.h $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp --parents security/selinux/include/policycap.h $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
 
+    cp -a --parents tools/include/asm $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp -a --parents tools/include/asm-generic $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp -a --parents tools/include/linux $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp -a --parents tools/include/uapi/asm $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
@@ -2240,6 +2240,9 @@ BuildKernel() {
     if [ -d arch/%{asmarch}/include ]; then
       cp -a --parents arch/%{asmarch}/include $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/
     fi
+    if [ -d tools/arch/%{asmarch}/include ]; then
+      cp -a --parents tools/arch/%{asmarch}/include $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
+    fi
 %ifarch aarch64
     # arch/arm64/include/asm/xen references arch/arm
     cp -a --parents arch/arm/include/asm/xen $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/
@@ -2527,6 +2530,7 @@ BuildKernel() {
 
 %if %{with_cross}
     make -C $RPM_BUILD_ROOT/lib/modules/$KernelVer/build M=scripts clean
+    make -C $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/tools/bpf/resolve_btfids clean
     sed -i 's/REBUILD_SCRIPTS_FOR_CROSS:=0/REBUILD_SCRIPTS_FOR_CROSS:=1/' $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Makefile
 %endif
@@ -3180,7 +3184,7 @@ then\
     )\
 fi\
 %if %{with_cross}\
-    echo "Building scripts"\
+    echo "Building scripts and resolve_btfids"\
     env --unset=ARCH make -C /usr/src/kernels/%{KVERREL}%{?1:+%{1}} prepare_after_cross\
 %endif\
 %{nil}
diff --git a/scripts/kernel.spec b/scripts/kernel.spec
index 69f37f05eaa7d9..49169b2c3c490d 100644
--- a/scripts/kernel.spec
+++ b/scripts/kernel.spec
@@ -165,15 +165,15 @@ Summary: The Linux kernel
 # define buildid .local
 %define specversion 5.14.0
 %define patchversion 5.14
-%define pkgrelease 427.24.1
+%define pkgrelease 427.26.1
 %define kversion 5
-%define tarfile_release 5.14.0-427.24.1.el9_4
+%define tarfile_release 5.14.0-427.26.1.el9_4
 # This is needed to do merge window version magic
 %define patchlevel 14
 # This allows pkg_release to have configurable %%{?dist} tag
-%define specrelease 427.24.1%{?buildid}%{?dist}
+%define specrelease 427.26.1%{?buildid}%{?dist}
 # This defines the kabi tarball version
-%define kabiversion 5.14.0-427.24.1.el9_4
+%define kabiversion 5.14.0-427.26.1.el9_4
 #
 # End of genspec.sh variables
@@ -2199,12 +2199,12 @@ BuildKernel() {
     cp --parents tools/build/Build $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp --parents tools/build/fixdep.c $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp --parents tools/objtool/sync-check.sh $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
-    cp -a --parents tools/bpf/resolve_btfids/main.c $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
-    cp -a --parents tools/bpf/resolve_btfids/Build $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
+    cp -a --parents tools/bpf/resolve_btfids $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp --parents security/selinux/include/policycap_names.h $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp --parents security/selinux/include/policycap.h $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
 
+    cp -a --parents tools/include/asm $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp -a --parents tools/include/asm-generic $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp -a --parents tools/include/linux $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
     cp -a --parents tools/include/uapi/asm $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
@@ -2242,6 +2242,9 @@ BuildKernel() {
     if [ -d arch/%{asmarch}/include ]; then
       cp -a --parents arch/%{asmarch}/include $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/
     fi
+    if [ -d tools/arch/%{asmarch}/include ]; then
+      cp -a --parents tools/arch/%{asmarch}/include $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
+    fi
 %ifarch aarch64
     # arch/arm64/include/asm/xen references arch/arm
     cp -a --parents arch/arm/include/asm/xen $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/
@@ -2531,6 +2534,7 @@ BuildKernel() {
 
 %if %{with_cross}
     make -C $RPM_BUILD_ROOT/lib/modules/$KernelVer/build M=scripts clean
+    make -C $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/tools/bpf/resolve_btfids clean
     sed -i 's/REBUILD_SCRIPTS_FOR_CROSS:=0/REBUILD_SCRIPTS_FOR_CROSS:=1/' $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Makefile
 %endif
@@ -3184,7 +3188,7 @@ then\
     )\
 fi\
 %if %{with_cross}\
-    echo "Building scripts"\
+    echo "Building scripts and resolve_btfids"\
     env --unset=ARCH make -C /usr/src/kernels/%{KVERREL}%{?1:+%{1}} prepare_after_cross\
 %endif\
 %{nil}
@@ -3731,10 +3735,51 @@
 #
 #
 %changelog
-* Mon Jul 08 2024 Release Engineering - 5.14.0-427.24.1
+* Wed Jul 17 2024 Release Engineering - 5.14.0-427.26.1
 - Porting to 9.4, debranding and Rocky branding
 - Ensure aarch64 kernel is not compressed
 
+* Fri Jul 05 2024 Scott Weaver [5.14.0-427.26.1.el9_4]
+- net: ena: Fix incorrect descriptor free behavior (Kamal Heib) [RHEL-39217 RHEL-37430] {CVE-2024-35958}
+- tcp: Use refcount_inc_not_zero() in tcp_twsk_unique(). (Guillaume Nault) [RHEL-41749 RHEL-39837] {CVE-2024-36904}
+- mm/mglru: Revert "don't sync disk for each aging cycle" (Waiman Long) [RHEL-44418]
+- tipc: fix UAF in error path (Xin Long) [RHEL-34848 RHEL-34280] {CVE-2024-36886}
+- selftest/cgroup: Update test_cpuset_prs.sh to match changes (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Make cpuset.cpus.exclusive independent of cpuset.cpus (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Delay setting of CS_CPU_EXCLUSIVE until valid partition (Waiman Long) [RHEL-45139]
+- selftest/cgroup: Fix test_cpuset_prs.sh problems reported by test robot (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Fix remote root partition creation problem (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Optimize isolated partition only generate_sched_domains() calls (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Fix retval in update_cpumask() (Waiman Long) [RHEL-45139]
+- cgroup/cpuset: Fix a memory leak in update_exclusive_cpumask() (Waiman Long) [RHEL-45139]
+- ice: implement AQ download pkg retry (Petr Oros) [RHEL-38907 RHEL-17318]
+- redhat: include resolve_btfids in kernel-devel (Viktor Malik) [RHEL-43426 RHEL-40707]
+- blk-cgroup: fix list corruption from resetting io stat (cki-backport-bot) [RHEL-44977] {CVE-2024-38663}
+- misc: rtsx: do clear express reg every SD_INT (David Arcari) [RHEL-39985 RHEL-33706]
+- misc: rtsx: Fix rts5264 driver status incorrect when card removed (David Arcari) [RHEL-39985 RHEL-33706]
+- netfilter: tproxy: bail out if IP has been disabled on the device (cki-backport-bot) [RHEL-44371] {CVE-2024-36270}
+- lib/test_hmm.c: handle src_pfns and dst_pfns allocation failure (cki-backport-bot) [RHEL-44263 RHEL-44261] {CVE-2024-38543}
+- r8169: Fix possible ring buffer corruption on fragmented Tx packets. (cki-backport-bot) [RHEL-44039] {CVE-2024-38586}
+- net: micrel: Fix receiving the timestamp in the frame for lan8841 (cki-backport-bot) [RHEL-43996] {CVE-2024-38593}
+- vt: fix memory overlapping when deleting chars in the buffer (Waiman Long) [RHEL-43379 RHEL-27780] {CVE-2022-48627}
+- net/mlx5e: Use a memory barrier to enforce PTP WQ xmit submission tracking occurs after populating the metadata_map (Kamal Heib) [RHEL-42728 RHEL-34192] {CVE-2024-26858}
+- locking/atomic: Make test_and_*_bit() ordered on failure (Paolo Bonzini) [RHEL-45896]
+- mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index (Rafael Aquini) [RHEL-42659 RHEL-31840] {CVE-2024-26783}
+- can: j1939: prevent deadlock by changing j1939_socks_lock to rwlock (Jose Ignacio Tornos Martinez) [RHEL-42379 RHEL-31530] {CVE-2023-52638}
+- ethernet: hisilicon: hns: hns_dsaf_misc: fix a possible array overflow in hns_dsaf_ge_srst_by_port() (Ken Cox) [RHEL-42226 RHEL-38715] {CVE-2021-47548}
+
+* Mon Jul 01 2024 Scott Weaver [5.14.0-427.25.1.el9_4]
+- nvme: fix reconnection fail due to reserved tag allocation (Maurizio Lombardi) [RHEL-42896 RHEL-36896] {CVE-2024-27435}
+- net: hns3: fix use-after-free bug in hclgevf_send_mbx_msg (cki-backport-bot) [RHEL-43625] {CVE-2021-47596}
+- scsi: sg: Avoid race in error handling & drop bogus warn (Ewan D. Milne) [RHEL-36106 RHEL-35659]
+- scsi: sg: Avoid sg device teardown race (Ewan D. Milne) [RHEL-36106 RHEL-35659]
+- netfilter: nf_tables: use timestamp to check for set element timeout (Florian Westphal) [RHEL-38032 RHEL-33985] {CVE-2024-27397}
+- netfilter: nft_set_rbtree: Remove unused variable nft_net (Florian Westphal) [RHEL-38032 RHEL-33985]
+- netfilter: nft_set_rbtree: prefer sync gc to async worker (Florian Westphal) [RHEL-38032 RHEL-33985]
+- netfilter: nft_set_rbtree: rename gc deactivate+erase function (Florian Westphal) [RHEL-38032 RHEL-33985]
+- netfilter: nf_tables: de-constify set commit ops function argument (Florian Westphal) [RHEL-38032 RHEL-33985]
+- octeontx2-af: avoid off-by-one read from userspace (Kamal Heib) [RHEL-40486 RHEL-39873] {CVE-2024-36957}
+
 * Sun Jun 23 2024 Scott Weaver [5.14.0-427.24.1.el9_4]
 - net/bnx2x: Prevent access to a freed page in page_pool (Michal Schmidt) [RHEL-43272 RHEL-23117]
 - bnx2x: new flag for track HW resource allocation (Michal Schmidt) [RHEL-43272 RHEL-23117]
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index 233d74f9de3527..1c3860446b307f 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -28,6 +28,14 @@
 CPULIST=$(cat $CGROUP2/cpuset.cpus.effective)
 NR_CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//")
 [[ $NR_CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!"
 
+# Check to see if /dev/console exists and is writable
+if [[ -c /dev/console && -w /dev/console ]]
+then
+	CONSOLE=/dev/console
+else
+	CONSOLE=/dev/null
+fi
+
 # Set verbose flag and delay factor
 PROG=$1
 VERBOSE=0
@@ -103,8 +111,8 @@ console_msg()
 {
 	MSG=$1
 	echo "$MSG"
-	echo "" > /dev/console
-	echo "$MSG" > /dev/console
+	echo "" > $CONSOLE
+	echo "$MSG" > $CONSOLE
 	pause 0.01
 }
@@ -161,6 +169,14 @@ test_add_proc()
 #  T     = put a task into cgroup
 #  O<c>=<v> = Write <v> to CPU online file of <c>
 #
+# ECPUs    - effective CPUs of cpusets
+# Pstate   - partition root state
+# ISOLCPUS - isolated CPUs (<icpus>[,<icpus2>])
+#
+# Note that if there are 2 fields in ISOLCPUS, the first one is for
+# sched-debug matching which includes offline CPUs and single-CPU partitions
+# while the second one is for matching cpuset.cpus.isolated.
+#
 SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1"
 TEST_MATRIX=(
 	#  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
@@ -220,23 +236,29 @@
 	" C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3:P2 . . 0 A1:0-1,A2:2-3,A3:2-3 A1:P0,A2:P2 2-3"
 	" C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2,A2:3,A3:3 A1:P0,A2:P2 3"
 	" C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3"
-	" C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-2,A2:1-2,A3:3 A1:P0,A3:P2 3"
+	" C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3"
 	" C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3,A2:1-3,A3:2-3,B1:2-3 A1:P0,A3:P0,B1:P-2"
 	" C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5"
 	" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3,B1:4 A3:P2,B1:P2 2-4"
 	" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3,B1:4 A3:P2,B1:P2 2-4"
 	" C0-3:S+ C1-3:S+ C2-3 C4 X1-3 X1-3:P2 P2 . 0 A2:1,A3:2-3 A2:P2,A3:P2 1-3"
 	" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2:C4-5 0 A3:2-3,B1:4-5 A3:P2,B1:P2 2-5"
+	" C4:X0-3:S+ X1-3:S+ X2-3 . . P2 . . 0 A1:4,A2:1-3,A3:1-3 A2:P2 1-3"
+	" C4:X0-3:S+ X1-3:S+ X2-3 . . . P2 . 0 A1:4,A2:4,A3:2-3 A3:P2 2-3"
 
 	# Nested remote/local partition tests
 	" C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4-5 \
 				A1:P0,A2:P1,A3:P2,B1:P1 2-3"
 	" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4 \
 				A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3"
+	" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 . P1 0 A1:0-1,A2:2-3,A3:2-3,B1:4 \
+				A1:P0,A2:P1,A3:P0,B1:P1"
 	" C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:2,A3:3,B1:4 \
 				A1:P0,A2:P1,A3:P2,B1:P1 2-4,3"
 	" C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1,A2:2-3,A3:4 \
 				A1:P0,A2:P2,A3:P1 2-4,2-3"
+	" C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X3-4:P1 . 0 A1:0-1,A2:2,A3:3-4 \
+				A1:P0,A2:P2,A3:P1 2"
 	" C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \
 				. . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \
 				A1:P0,A2:P-2,A3:P-1"
@@ -262,8 +284,8 @@
 				. . X2-3 P2 . . 0 A1:0-2,A2:3,XA2:3 A2:P2 3"
 
 	# Invalid to valid local partition direct transition tests
-	" C1-3:S+:P2 C2-3:X1:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3"
-	" C1-3:S+:P2 C2-3:X1:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3"
+	" C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:1-3:XA2: A1:P2,A2:P-2 1-3"
+	" C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3"
 	" C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4,B1:4-6 A1:P-2,B1:P0"
 	" C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3,B1:4-6 A1:P2,B1:P0 0-3"
 	" C0-3:P2 . . C3-5:C4-5 . . . . 0 A1:0-3,B1:4-5 A1:P2,B1:P0 0-3"
@@ -274,21 +296,18 @@
 	" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
 				. . X4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
 	" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
-				. . C4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
+				. . C4:X . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
 
 	# Local partition CPU change tests
 	" C0-5:S+:P2 C4-5:S+:P1 . . . C3-5 . . 0 A1:0-2,A2:3-5 A1:P2,A2:P1 0-2"
 	" C0-5:S+:P2 C4-5:S+:P1 . . C1-5 . . . 0 A1:1-3,A2:4-5 A1:P2,A2:P1 1-3"
 
 	# cpus_allowed/exclusive_cpus update tests
 	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
-				. C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \
+				. X:C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \
 				A1:P0,A3:P-2"
 	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
 				. X1 . P2 . 0 A1:0-3,A2:1-3,XA1:1,XA2:,XA3:,A3:2-3 \
 				A1:P0,A3:P-2"
-	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
-				. . C3 P2 . 0 A1:0-2,A2:0-2,XA2:3,XA3:3,A3:3 \
-				A1:P0,A3:P2 3"
 	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
 				. . X3 P2 . 0 A1:0-2,A2:1-2,XA2:3,XA3:3,A3:3 \
 				A1:P0,A3:P2 3"
@@ -296,10 +315,7 @@
 				. . X3 . . 0 A1:0-3,A2:1-3,XA2:3,XA3:3,A3:2-3 \
 				A1:P0,A3:P-2"
 	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
-				. . C3 . . 0 A1:0-3,A2:3,XA2:3,XA3:3,A3:3 \
-				A1:P0,A3:P-2"
-	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
-				. C4 . . . 0 A1:4,A2:4,A3:4,XA1:,XA2:,XA3 \
+				. X4 . . . 0 A1:0-3,A2:1-3,A3:2-3,XA1:4,XA2:,XA3 \
 				A1:P0,A3:P-2"
 
 	# old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
@@ -346,6 +362,9 @@
 	" C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P-1"
 	" C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P0,B1:P-1"
 
+	# cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it
+	" C0-3 . . C4-5 X5 . . . 0 A1:0-3,B1:4-5"
+
 	# old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
 	# ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
 	# Failure cases:
@@ -355,6 +374,9 @@
 
 	# Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected
 	" C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3,B1:4-5"
+
+	# cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive
+	" C0-3 . . C4-5 X3-5 . . . 1 A1:0-3,B1:4-5"
 )
 
 #
@@ -556,14 +578,15 @@ check_cgroup_states()
 	do
 		set -- $(echo $CHK | sed -e "s/:/ /g")
 		CGRP=$1
+		CGRP_DIR=$CGRP
 		STATE=$2
 		FILE=
 		EVAL=$(expr substr $STATE 2 2)
-		[[ $CGRP = A2 ]] && CGRP=A1/A2
-		[[ $CGRP = A3 ]] && CGRP=A1/A2/A3
+		[[ $CGRP = A2 ]] && CGRP_DIR=A1/A2
+		[[ $CGRP = A3 ]] && CGRP_DIR=A1/A2/A3
 
 		case $STATE in
-			P*) FILE=$CGRP/cpuset.cpus.partition
+			P*) FILE=$CGRP_DIR/cpuset.cpus.partition
 				;;
 			*)  echo "Unknown state: $STATE!"
 				exit 1
@@ -587,6 +610,16 @@ check_cgroup_states()
 				;;
 		esac
 		[[ $EVAL != $VAL ]] && return 1
+
+		#
+		# For root partition, dump sched-domains info to console if
+		# verbose mode set for manual comparison with sched debug info.
+		#
+		[[ $VAL -eq 1 && $VERBOSE -gt 0 ]] && {
+			DOMS=$(cat $CGRP_DIR/cpuset.cpus.effective)
+			[[ -n "$DOMS" ]] &&
+				echo " [$CGRP] sched-domain: $DOMS" > $CONSOLE
+		}
 	done
 	return 0
 }
@@ -694,9 +727,9 @@ null_isolcpus_check()
 	[[ $VERBOSE -gt 0 ]] || return 0
 	# Retry a few times before printing error
 	RETRY=0
-	while [[ $RETRY -lt 5 ]]
+	while [[ $RETRY -lt 8 ]]
 	do
-		pause 0.01
+		pause 0.02
 		check_isolcpus "."
 		[[ $? -eq 0 ]] && return 0
 		((RETRY++))
@@ -726,7 +759,7 @@ run_state_test()
 
 	while [[ $I -lt $CNT ]]
 	do
-		echo "Running test $I ..." > /dev/console
+		echo "Running test $I ..." > $CONSOLE
 		[[ $VERBOSE -gt 1 ]] && {
 			echo ""
 			eval echo \${$TEST[$I]}
@@ -783,7 +816,7 @@ run_state_test()
 		while [[ $NEWLIST != $CPULIST && $RETRY -lt 8 ]]
 		do
 			# Wait a bit longer & recheck a few times
-			pause 0.01
+			pause 0.02
 			((RETRY++))
 			NEWLIST=$(cat cpuset.cpus.effective)
 		done